xref: /linux/fs/xfs/xfs_file.c (revision 7fc2cd2e4b398c57c9cf961cfea05eadbf34c05c)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_bmap.h"
17 #include "xfs_bmap_util.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ioctl.h"
21 #include "xfs_trace.h"
22 #include "xfs_log.h"
23 #include "xfs_icache.h"
24 #include "xfs_pnfs.h"
25 #include "xfs_iomap.h"
26 #include "xfs_reflink.h"
27 #include "xfs_file.h"
28 #include "xfs_aops.h"
29 #include "xfs_zone_alloc.h"
30 #include "xfs_error.h"
31 #include "xfs_errortag.h"
32 
33 #include <linux/dax.h>
34 #include <linux/falloc.h>
35 #include <linux/backing-dev.h>
36 #include <linux/mman.h>
37 #include <linux/fadvise.h>
38 #include <linux/mount.h>
39 
40 static const struct vm_operations_struct xfs_file_vm_ops;
41 
42 /*
43  * Decide if the given file range is aligned to the size of the fundamental
44  * allocation unit for the file.
45  */
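/*
 * For example, with a 4k allocation unit (a power of two), pos=8192/len=4096
 * passes the mask check while pos=6144 does not.  Non-power-of-two units,
 * e.g. some realtime extent sizes, take the division based isaligned_64()
 * checks instead.
 */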
46 bool
47 xfs_is_falloc_aligned(
48 	struct xfs_inode	*ip,
49 	loff_t			pos,
50 	long long int		len)
51 {
52 	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);
53 
54 	if (!is_power_of_2(alloc_unit))
55 		return isaligned_64(pos, alloc_unit) &&
56 		       isaligned_64(len, alloc_unit);
57 
58 	return !((pos | len) & (alloc_unit - 1));
59 }
60 
61 /*
62  * Fsync operations on directories are much simpler than on regular files,
63  * as there is no file data to flush, and thus also no need for explicit
64  * cache flush operations, and there are no non-transaction metadata updates
65  * on directories either.
66  */
67 STATIC int
68 xfs_dir_fsync(
69 	struct file		*file,
70 	loff_t			start,
71 	loff_t			end,
72 	int			datasync)
73 {
74 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
75 
76 	trace_xfs_dir_fsync(ip);
77 	return xfs_log_force_inode(ip);
78 }
79 
80 /*
81  * All metadata updates are logged, which means that we just have to push the
82  * journal to the required sequence number that holds the updates. We track
83  * datasync commits separately to full sync commits, and hence only need to
84  * select the correct sequence number for the log force here.
85  *
86  * We don't have to serialise against concurrent modifications, as we do not
87  * have to wait for modifications that have not yet completed. We define a
88  * transaction commit as completing when the commit sequence number is updated,
89  * hence if the sequence number has not been updated, the sync operation was
90  * run before the commit completed and we don't have to wait for it.
91  *
92  * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
93  * set on the log item until - at least - the journal flush completes. In
94  * reality, they are only cleared when the inode is fully unpinned (i.e.
95  * persistent in the journal and not dirty in the CIL), and so we rely on
96  * xfs_log_force_seq() either skipping sequences that have been persisted or
97  * waiting on sequences that are still in flight to correctly order concurrent
98  * sync operations.
99  */
100 static int
101 xfs_fsync_flush_log(
102 	struct xfs_inode	*ip,
103 	bool			datasync,
104 	int			*log_flushed)
105 {
106 	struct xfs_inode_log_item *iip = ip->i_itemp;
107 	xfs_csn_t		seq = 0;
108 
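	/*
	 * Pick the sequence number to force the log to: fdatasync only needs
	 * the datasync sequence tracked on the log item, a full fsync needs
	 * the full commit sequence.
	 */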
109 	spin_lock(&iip->ili_lock);
110 	if (datasync)
111 		seq = iip->ili_datasync_seq;
112 	else
113 		seq = iip->ili_commit_seq;
114 	spin_unlock(&iip->ili_lock);
115 
116 	if (!seq)
117 		return 0;
118 
119 	return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
120 					  log_flushed);
121 }
122 
123 STATIC int
124 xfs_file_fsync(
125 	struct file		*file,
126 	loff_t			start,
127 	loff_t			end,
128 	int			datasync)
129 {
130 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
131 	struct xfs_mount	*mp = ip->i_mount;
132 	int			error, err2;
133 	int			log_flushed = 0;
134 
135 	trace_xfs_file_fsync(ip);
136 
137 	error = file_write_and_wait_range(file, start, end);
138 	if (error)
139 		return error;
140 
141 	if (xfs_is_shutdown(mp))
142 		return -EIO;
143 
144 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
145 
146 	/*
147 	 * If we have an RT and/or log subvolume we need to make sure to flush
148 	 * the write cache of the device used for file data first.  This is to
149 	 * ensure newly written file data makes it to disk before logging the new
150 	 * inode size in case of an extending write.
151 	 */
152 	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
153 		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
154 	else if (mp->m_logdev_targp != mp->m_ddev_targp)
155 		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
156 
157 	/*
158 	 * If the inode has an inode log item attached, it may need the journal
159 	 * flushed to persist any changes the log item might be tracking.
160 	 */
161 	if (ip->i_itemp) {
162 		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
163 		if (err2 && !error)
164 			error = err2;
165 	}
166 
167 	/*
168 	 * If we only have a single device, and the log force above was
169 	 * a no-op, we might have to flush the data device cache here.
170 	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
171 	 * an already allocated file and thus do not have any metadata to
172 	 * commit.
173 	 */
174 	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
175 	    mp->m_logdev_targp == mp->m_ddev_targp) {
176 		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
177 		if (err2 && !error)
178 			error = err2;
179 	}
180 
181 	return error;
182 }
183 
184 static int
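/*
 * Lock the inode in @lock_mode on behalf of an iocb.  For IOCB_NOWAIT the
 * lock is only tried and -EAGAIN is returned if it cannot be acquired
 * immediately.
 */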
185 xfs_ilock_iocb(
186 	struct kiocb		*iocb,
187 	unsigned int		lock_mode)
188 {
189 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
190 
191 	if (iocb->ki_flags & IOCB_NOWAIT) {
192 		if (!xfs_ilock_nowait(ip, lock_mode))
193 			return -EAGAIN;
194 	} else {
195 		xfs_ilock(ip, lock_mode);
196 	}
197 
198 	return 0;
199 }
200 
201 static int
202 xfs_ilock_iocb_for_write(
203 	struct kiocb		*iocb,
204 	unsigned int		*lock_mode)
205 {
206 	ssize_t			ret;
207 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
208 
209 	ret = xfs_ilock_iocb(iocb, *lock_mode);
210 	if (ret)
211 		return ret;
212 
213 	/*
214 	 * If a reflink remap is in progress we always need to take the iolock
215 	 * exclusively to wait for it to finish.
216 	 */
217 	if (*lock_mode == XFS_IOLOCK_SHARED &&
218 	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
219 		xfs_iunlock(ip, *lock_mode);
220 		*lock_mode = XFS_IOLOCK_EXCL;
221 		return xfs_ilock_iocb(iocb, *lock_mode);
222 	}
223 
224 	return 0;
225 }
226 
227 STATIC ssize_t
228 xfs_file_dio_read(
229 	struct kiocb		*iocb,
230 	struct iov_iter		*to)
231 {
232 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
233 	ssize_t			ret;
234 
235 	trace_xfs_file_direct_read(iocb, to);
236 
237 	if (!iov_iter_count(to))
238 		return 0; /* skip atime */
239 
240 	file_accessed(iocb->ki_filp);
241 
242 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
243 	if (ret)
244 		return ret;
245 	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
246 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
247 
248 	return ret;
249 }
250 
251 static noinline ssize_t
252 xfs_file_dax_read(
253 	struct kiocb		*iocb,
254 	struct iov_iter		*to)
255 {
256 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
257 	ssize_t			ret = 0;
258 
259 	trace_xfs_file_dax_read(iocb, to);
260 
261 	if (!iov_iter_count(to))
262 		return 0; /* skip atime */
263 
264 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
265 	if (ret)
266 		return ret;
267 	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
268 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
269 
270 	file_accessed(iocb->ki_filp);
271 	return ret;
272 }
273 
274 STATIC ssize_t
275 xfs_file_buffered_read(
276 	struct kiocb		*iocb,
277 	struct iov_iter		*to)
278 {
279 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
280 	ssize_t			ret;
281 
282 	trace_xfs_file_buffered_read(iocb, to);
283 
284 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
285 	if (ret)
286 		return ret;
287 	ret = generic_file_read_iter(iocb, to);
288 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
289 
290 	return ret;
291 }
292 
293 STATIC ssize_t
294 xfs_file_read_iter(
295 	struct kiocb		*iocb,
296 	struct iov_iter		*to)
297 {
298 	struct inode		*inode = file_inode(iocb->ki_filp);
299 	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
300 	ssize_t			ret = 0;
301 
302 	XFS_STATS_INC(mp, xs_read_calls);
303 
304 	if (xfs_is_shutdown(mp))
305 		return -EIO;
306 
307 	if (IS_DAX(inode))
308 		ret = xfs_file_dax_read(iocb, to);
309 	else if (iocb->ki_flags & IOCB_DIRECT)
310 		ret = xfs_file_dio_read(iocb, to);
311 	else
312 		ret = xfs_file_buffered_read(iocb, to);
313 
314 	if (ret > 0)
315 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
316 	return ret;
317 }
318 
319 STATIC ssize_t
320 xfs_file_splice_read(
321 	struct file		*in,
322 	loff_t			*ppos,
323 	struct pipe_inode_info	*pipe,
324 	size_t			len,
325 	unsigned int		flags)
326 {
327 	struct inode		*inode = file_inode(in);
328 	struct xfs_inode	*ip = XFS_I(inode);
329 	struct xfs_mount	*mp = ip->i_mount;
330 	ssize_t			ret = 0;
331 
332 	XFS_STATS_INC(mp, xs_read_calls);
333 
334 	if (xfs_is_shutdown(mp))
335 		return -EIO;
336 
337 	trace_xfs_file_splice_read(ip, *ppos, len);
338 
339 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
340 	ret = filemap_splice_read(in, ppos, pipe, len, flags);
341 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
342 	if (ret > 0)
343 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
344 	return ret;
345 }
346 
347 /*
348  * Take care of zeroing post-EOF blocks when they might exist.
349  *
350  * Returns 0 on success, a negative error on failure, or 1 if this
351  * function dropped the iolock and reacquired it exclusively and the caller
352  * needs to restart the write sanity checks.
353  */
354 static ssize_t
355 xfs_file_write_zero_eof(
356 	struct kiocb		*iocb,
357 	struct iov_iter		*from,
358 	unsigned int		*iolock,
359 	size_t			count,
360 	bool			*drained_dio,
361 	struct xfs_zone_alloc_ctx *ac)
362 {
363 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
364 	loff_t			isize;
365 	int			error;
366 
367 	/*
368 	 * We need to serialise against EOF updates that occur in IO completions
369 	 * here. We want to make sure that nobody is changing the size while
370 	 * we do this check until we have placed an IO barrier (i.e. hold
371 	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
372 	 * spinlock effectively forms a memory barrier once we have
373 	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
374 	 * hence be able to correctly determine if we need to run zeroing.
375 	 */
376 	spin_lock(&ip->i_flags_lock);
377 	isize = i_size_read(VFS_I(ip));
378 	if (iocb->ki_pos <= isize) {
379 		spin_unlock(&ip->i_flags_lock);
380 		return 0;
381 	}
382 	spin_unlock(&ip->i_flags_lock);
383 
384 	if (iocb->ki_flags & IOCB_NOWAIT)
385 		return -EAGAIN;
386 
387 	if (!*drained_dio) {
388 		/*
389 		 * If zeroing is needed and we are currently holding the iolock
390 		 * shared, we need to update it to exclusive which implies
391 		 * having to redo all the earlier checks.
392 		 */
393 		if (*iolock == XFS_IOLOCK_SHARED) {
394 			xfs_iunlock(ip, *iolock);
395 			*iolock = XFS_IOLOCK_EXCL;
396 			xfs_ilock(ip, *iolock);
397 			iov_iter_reexpand(from, count);
398 		}
399 
400 		/*
401 		 * We now have an IO submission barrier in place, but AIO can do
402 		 * EOF updates during IO completion and hence we now need to
403 		 * wait for all of them to drain.  Non-AIO DIO will have drained
404 		 * before we are given the XFS_IOLOCK_EXCL, and so for most
405 		 * cases this wait is a no-op.
406 		 */
407 		inode_dio_wait(VFS_I(ip));
408 		*drained_dio = true;
409 		return 1;
410 	}
411 
412 	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
413 
414 	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
415 	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
416 	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
417 
418 	return error;
419 }
420 
421 /*
422  * Common pre-write limit and setup checks.
423  *
424  * Called with the iolock held either shared or exclusive according to
425  * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
426  * if called for a direct write beyond i_size.
427  */
428 STATIC ssize_t
429 xfs_file_write_checks(
430 	struct kiocb		*iocb,
431 	struct iov_iter		*from,
432 	unsigned int		*iolock,
433 	struct xfs_zone_alloc_ctx *ac)
434 {
435 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
436 	size_t			count = iov_iter_count(from);
437 	bool			drained_dio = false;
438 	ssize_t			error;
439 
440 restart:
441 	error = generic_write_checks(iocb, from);
442 	if (error <= 0)
443 		return error;
444 
445 	if (iocb->ki_flags & IOCB_NOWAIT) {
446 		error = break_layout(inode, false);
447 		if (error == -EWOULDBLOCK)
448 			error = -EAGAIN;
449 	} else {
450 		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
451 	}
452 
453 	if (error)
454 		return error;
455 
456 	/*
457 	 * For changing security info in file_remove_privs() we need i_rwsem
458 	 * exclusively.
459 	 */
460 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
461 		xfs_iunlock(XFS_I(inode), *iolock);
462 		*iolock = XFS_IOLOCK_EXCL;
463 		error = xfs_ilock_iocb(iocb, *iolock);
464 		if (error) {
465 			*iolock = 0;
466 			return error;
467 		}
468 		goto restart;
469 	}
470 
471 	/*
472 	 * If the offset is beyond the size of the file, we need to zero all
473 	 * blocks that fall between the existing EOF and the start of this
474 	 * write.
475 	 *
476 	 * We can do an unlocked check for i_size here safely as I/O completion
477 	 * can only extend EOF.  Truncate is locked out at this point, so the
478 	 * EOF can not move backwards, only forwards. Hence we only need to take
479 	 * the slow path when we are at or beyond the current EOF.
480 	 */
481 	if (iocb->ki_pos > i_size_read(inode)) {
482 		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
483 				&drained_dio, ac);
484 		if (error == 1)
485 			goto restart;
486 		if (error)
487 			return error;
488 	}
489 
490 	return kiocb_modified(iocb);
491 }
492 
493 static ssize_t
494 xfs_zoned_write_space_reserve(
495 	struct xfs_mount		*mp,
496 	struct kiocb			*iocb,
497 	struct iov_iter			*from,
498 	unsigned int			flags,
499 	struct xfs_zone_alloc_ctx	*ac)
500 {
501 	loff_t				count = iov_iter_count(from);
502 	int				error;
503 
504 	if (iocb->ki_flags & IOCB_NOWAIT)
505 		flags |= XFS_ZR_NOWAIT;
506 
507 	/*
508 	 * Check the rlimit and LFS boundary first so that we don't over-reserve,
509 	 * possibly by a lot.
510 	 *
511 	 * The generic write path will redo this check later, and it might have
512 	 * changed by then.  If it got expanded we'll stick to our earlier
513 	 * smaller limit, and if it is decreased the new smaller limit will be
514 	 * used and our extra space reservation will be returned after finishing
515 	 * the write.
516 	 */
517 	error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
518 	if (error)
519 		return error;
520 
521 	/*
522 	 * Sloppily round up count to file system blocks.
523 	 *
524 	 * This will often reserve an extra block, but that avoids having to look
525 	 * at the start offset, which isn't stable for O_APPEND until taking the
526 	 * iolock.  Also we need to reserve a block each for zeroing the old
527 	 * EOF block and the new start block if they are unaligned.
528 	 *
529 	 * Any remaining block will be returned after the write.
530 	 */
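	/*
	 * For example, assuming a 4k block size, a 10000 byte write reserves
	 * 3 (rounded up) data blocks + 1 for the sloppy rounding + 2 for the
	 * EOF/start block zeroing, i.e. 6 blocks in total.
	 */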
531 	return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2,
532 			flags, ac);
533 }
534 
535 static int
536 xfs_dio_write_end_io(
537 	struct kiocb		*iocb,
538 	ssize_t			size,
539 	int			error,
540 	unsigned		flags)
541 {
542 	struct inode		*inode = file_inode(iocb->ki_filp);
543 	struct xfs_inode	*ip = XFS_I(inode);
544 	loff_t			offset = iocb->ki_pos;
545 	unsigned int		nofs_flag;
546 
547 	ASSERT(!xfs_is_zoned_inode(ip) ||
548 	       !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
549 
550 	trace_xfs_end_io_direct_write(ip, offset, size);
551 
552 	if (xfs_is_shutdown(ip->i_mount))
553 		return -EIO;
554 
555 	if (error)
556 		return error;
557 	if (!size)
558 		return 0;
559 
560 	/*
561 	 * Capture amount written on completion as we can't reliably account
562 	 * for it on submission.
563 	 */
564 	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
565 
566 	/*
567 	 * We can allocate memory here while doing writeback on behalf of
568 	 * memory reclaim.  To avoid memory allocation deadlocks set the
569 	 * task-wide nofs context for the following operations.
570 	 */
571 	nofs_flag = memalloc_nofs_save();
572 
573 	if (flags & IOMAP_DIO_COW) {
574 		if (iocb->ki_flags & IOCB_ATOMIC)
575 			error = xfs_reflink_end_atomic_cow(ip, offset, size);
576 		else
577 			error = xfs_reflink_end_cow(ip, offset, size);
578 		if (error)
579 			goto out;
580 	}
581 
582 	/*
583 	 * Unwritten conversion updates the in-core isize after extent
584 	 * conversion but before updating the on-disk size. Updating isize any
585 	 * earlier allows a racing dio read to find unwritten extents before
586 	 * they are converted.
587 	 */
588 	if (flags & IOMAP_DIO_UNWRITTEN) {
589 		error = xfs_iomap_write_unwritten(ip, offset, size, true);
590 		goto out;
591 	}
592 
593 	/*
594 	 * We need to update the in-core inode size here so that we don't end up
595 	 * with the on-disk inode size being outside the in-core inode size. We
596 	 * have no other method of updating EOF for AIO, so always do it here
597 	 * if necessary.
598 	 *
599 	 * We need to lock the test/set EOF update as we can be racing with
600 	 * other IO completions here to update the EOF. Failing to serialise
601 	 * here can result in EOF moving backwards and Bad Things Happen when
602 	 * that occurs.
603 	 *
604 	 * As IO completion only ever extends EOF, we can do an unlocked check
605 	 * here to avoid taking the spinlock. If we land within the current EOF,
606 	 * then we do not need to do an extending update at all, and we don't
607 	 * need to take the lock to check this. If we race with an update moving
608 	 * EOF, then we'll either still be beyond EOF and need to take the lock,
609 	 * or we'll be within EOF and we don't need to take it at all.
610 	 */
611 	if (offset + size <= i_size_read(inode))
612 		goto out;
613 
614 	spin_lock(&ip->i_flags_lock);
615 	if (offset + size > i_size_read(inode)) {
616 		i_size_write(inode, offset + size);
617 		spin_unlock(&ip->i_flags_lock);
618 		error = xfs_setfilesize(ip, offset, size);
619 	} else {
620 		spin_unlock(&ip->i_flags_lock);
621 	}
622 
623 out:
624 	memalloc_nofs_restore(nofs_flag);
625 	return error;
626 }
627 
628 static const struct iomap_dio_ops xfs_dio_write_ops = {
629 	.end_io		= xfs_dio_write_end_io,
630 };
631 
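/*
 * Submission hook for zoned direct writes: charge the bio against the space
 * reservation carried in the zone allocation context and hand the resulting
 * ioend to the zone allocator for placement and submission.
 */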
632 static void
633 xfs_dio_zoned_submit_io(
634 	const struct iomap_iter	*iter,
635 	struct bio		*bio,
636 	loff_t			file_offset)
637 {
638 	struct xfs_mount	*mp = XFS_I(iter->inode)->i_mount;
639 	struct xfs_zone_alloc_ctx *ac = iter->private;
640 	xfs_filblks_t		count_fsb;
641 	struct iomap_ioend	*ioend;
642 
643 	count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
644 	if (count_fsb > ac->reserved_blocks) {
645 		xfs_err(mp,
646 "allocation (%lld) larger than reservation (%lld).",
647 			count_fsb, ac->reserved_blocks);
648 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
649 		bio_io_error(bio);
650 		return;
651 	}
652 	ac->reserved_blocks -= count_fsb;
653 
654 	bio->bi_end_io = xfs_end_bio;
655 	ioend = iomap_init_ioend(iter->inode, bio, file_offset,
656 			IOMAP_IOEND_DIRECT);
657 	xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
658 }
659 
660 static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
661 	.bio_set	= &iomap_ioend_bioset,
662 	.submit_io	= xfs_dio_zoned_submit_io,
663 	.end_io		= xfs_dio_write_end_io,
664 };
665 
666 /*
667  * Handle block aligned direct I/O writes.
668  */
669 static noinline ssize_t
670 xfs_file_dio_write_aligned(
671 	struct xfs_inode	*ip,
672 	struct kiocb		*iocb,
673 	struct iov_iter		*from,
674 	const struct iomap_ops	*ops,
675 	const struct iomap_dio_ops *dops,
676 	struct xfs_zone_alloc_ctx *ac)
677 {
678 	unsigned int		iolock = XFS_IOLOCK_SHARED;
679 	unsigned int		dio_flags = 0;
680 	ssize_t			ret;
681 
682 	/*
683 	 * For always COW inodes, each bio must be aligned to the file system
684 	 * block size and not just the device sector size because we need to
685 	 * allocate a block-aligned amount of space for each write.
686 	 */
687 	if (xfs_is_always_cow_inode(ip))
688 		dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
689 
690 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
691 	if (ret)
692 		return ret;
693 	ret = xfs_file_write_checks(iocb, from, &iolock, ac);
694 	if (ret)
695 		goto out_unlock;
696 
697 	/*
698 	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
699 	 * the iolock back to shared if we had to take the exclusive lock in
700 	 * xfs_file_write_checks() for other reasons.
701 	 */
702 	if (iolock == XFS_IOLOCK_EXCL) {
703 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
704 		iolock = XFS_IOLOCK_SHARED;
705 	}
706 	trace_xfs_file_direct_write(iocb, from);
707 	ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0);
708 out_unlock:
709 	xfs_iunlock(ip, iolock);
710 	return ret;
711 }
712 
713 /*
714  * Handle block aligned direct I/O writes to zoned devices.
715  */
716 static noinline ssize_t
717 xfs_file_dio_write_zoned(
718 	struct xfs_inode	*ip,
719 	struct kiocb		*iocb,
720 	struct iov_iter		*from)
721 {
722 	struct xfs_zone_alloc_ctx ac = { };
723 	ssize_t			ret;
724 
725 	ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac);
726 	if (ret < 0)
727 		return ret;
728 	ret = xfs_file_dio_write_aligned(ip, iocb, from,
729 			&xfs_zoned_direct_write_iomap_ops,
730 			&xfs_dio_zoned_write_ops, &ac);
731 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
732 	return ret;
733 }
734 
735 /*
736  * Handle block atomic writes
737  *
738  * Two methods of atomic writes are supported:
739  * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
740  *   disk
741  * - COW-based, which uses a COW fork as a staging extent for data updates
742  *   before atomically updating extent mappings for the range being written
743  *
744  */
745 static noinline ssize_t
746 xfs_file_dio_write_atomic(
747 	struct xfs_inode	*ip,
748 	struct kiocb		*iocb,
749 	struct iov_iter		*from)
750 {
751 	unsigned int		iolock = XFS_IOLOCK_SHARED;
752 	ssize_t			ret, ocount = iov_iter_count(from);
753 	const struct iomap_ops	*dops;
754 
755 	/*
756 	 * HW offload should be faster, so try that first unless the write
757 	 * length is already known to be too large for it.
758 	 */
759 	if (ocount > xfs_inode_buftarg(ip)->bt_awu_max)
760 		dops = &xfs_atomic_write_cow_iomap_ops;
761 	else
762 		dops = &xfs_direct_write_iomap_ops;
763 
764 retry:
765 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
766 	if (ret)
767 		return ret;
768 
769 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
770 	if (ret)
771 		goto out_unlock;
772 
773 	/* Demote similar to xfs_file_dio_write_aligned() */
774 	if (iolock == XFS_IOLOCK_EXCL) {
775 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
776 		iolock = XFS_IOLOCK_SHARED;
777 	}
778 
779 	trace_xfs_file_direct_write(iocb, from);
780 	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
781 			0, NULL, 0);
782 
783 	/*
784 	 * The retry mechanism is based on the ->iomap_begin method returning
785 	 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
786 	 * possible. The REQ_ATOMIC-based method is typically not possible if
787 	 * the write spans multiple extents or the disk blocks are misaligned.
788 	 */
789 	if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
790 		xfs_iunlock(ip, iolock);
791 		dops = &xfs_atomic_write_cow_iomap_ops;
792 		goto retry;
793 	}
794 
795 out_unlock:
796 	if (iolock)
797 		xfs_iunlock(ip, iolock);
798 	return ret;
799 }
800 
801 /*
802  * Handle block unaligned direct I/O writes
803  *
804  * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
805  * them to be done in parallel with reads and other direct I/O writes.  However,
806  * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
807  * to do sub-block zeroing and that requires serialisation against other direct
808  * I/O to the same block.  In this case we need to serialise the submission of
809  * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
810  * In the case where sub-block zeroing is not required, we can do concurrent
811  * sub-block dios to the same block successfully.
812  *
813  * Optimistically submit the I/O using the shared lock first, but use the
814  * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
815  * if block allocation or partial block zeroing would be required.  In that case
816  * we try again with the exclusive lock.
817  */
818 static noinline ssize_t
819 xfs_file_dio_write_unaligned(
820 	struct xfs_inode	*ip,
821 	struct kiocb		*iocb,
822 	struct iov_iter		*from)
823 {
824 	size_t			isize = i_size_read(VFS_I(ip));
825 	size_t			count = iov_iter_count(from);
826 	unsigned int		iolock = XFS_IOLOCK_SHARED;
827 	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
828 	ssize_t			ret;
829 
830 	/*
831 	 * Extending writes need exclusivity because of the sub-block zeroing
832 	 * that the DIO code always does for partial tail blocks beyond EOF, so
833 	 * don't even bother trying the fast path in this case.
834 	 */
835 	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
836 		if (iocb->ki_flags & IOCB_NOWAIT)
837 			return -EAGAIN;
838 retry_exclusive:
839 		iolock = XFS_IOLOCK_EXCL;
840 		flags = IOMAP_DIO_FORCE_WAIT;
841 	}
842 
843 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
844 	if (ret)
845 		return ret;
846 
847 	/*
848 	 * We can't properly handle unaligned direct I/O to reflink files yet,
849 	 * as we can't unshare a partial block.
850 	 */
851 	if (xfs_is_cow_inode(ip)) {
852 		trace_xfs_reflink_bounce_dio_write(iocb, from);
853 		ret = -ENOTBLK;
854 		goto out_unlock;
855 	}
856 
857 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
858 	if (ret)
859 		goto out_unlock;
860 
861 	/*
862 	 * If we are doing exclusive unaligned I/O, this must be the only I/O
863 	 * in-flight.  Otherwise we risk data corruption due to unwritten extent
864 	 * conversions from the AIO end_io handler.  Wait for all other I/O to
865 	 * drain first.
866 	 */
867 	if (flags & IOMAP_DIO_FORCE_WAIT)
868 		inode_dio_wait(VFS_I(ip));
869 
870 	trace_xfs_file_direct_write(iocb, from);
871 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
872 			   &xfs_dio_write_ops, flags, NULL, 0);
873 
874 	/*
875 	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
876 	 * layer rejected it for mapping or locking reasons. If we are doing
877 	 * nonblocking user I/O, propagate the error.
878 	 */
879 	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
880 		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
881 		xfs_iunlock(ip, iolock);
882 		goto retry_exclusive;
883 	}
884 
885 out_unlock:
886 	if (iolock)
887 		xfs_iunlock(ip, iolock);
888 	return ret;
889 }
890 
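/*
 * Direct I/O write dispatcher: reject writes that are not aligned to the
 * device logical sector size, then hand block-unaligned writes, writes to
 * zoned inodes, atomic writes and plain block-aligned writes off to their
 * respective helpers.
 */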
891 static ssize_t
892 xfs_file_dio_write(
893 	struct kiocb		*iocb,
894 	struct iov_iter		*from)
895 {
896 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
897 	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
898 	size_t			count = iov_iter_count(from);
899 
900 	/* direct I/O must be aligned to device logical sector size */
901 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
902 		return -EINVAL;
903 
904 	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
905 		return xfs_file_dio_write_unaligned(ip, iocb, from);
906 	if (xfs_is_zoned_inode(ip))
907 		return xfs_file_dio_write_zoned(ip, iocb, from);
908 	if (iocb->ki_flags & IOCB_ATOMIC)
909 		return xfs_file_dio_write_atomic(ip, iocb, from);
910 	return xfs_file_dio_write_aligned(ip, iocb, from,
911 			&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
912 }
913 
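/*
 * DAX writes hold the iolock exclusively for the whole operation; if the
 * write extended the file, the on-disk inode size is updated before the
 * lock is dropped.
 */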
914 static noinline ssize_t
915 xfs_file_dax_write(
916 	struct kiocb		*iocb,
917 	struct iov_iter		*from)
918 {
919 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
920 	struct xfs_inode	*ip = XFS_I(inode);
921 	unsigned int		iolock = XFS_IOLOCK_EXCL;
922 	ssize_t			ret, error = 0;
923 	loff_t			pos;
924 
925 	ret = xfs_ilock_iocb(iocb, iolock);
926 	if (ret)
927 		return ret;
928 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
929 	if (ret)
930 		goto out;
931 
932 	pos = iocb->ki_pos;
933 
934 	trace_xfs_file_dax_write(iocb, from);
935 	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
936 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
937 		i_size_write(inode, iocb->ki_pos);
938 		error = xfs_setfilesize(ip, pos, ret);
939 	}
940 out:
941 	if (iolock)
942 		xfs_iunlock(ip, iolock);
943 	if (error)
944 		return error;
945 
946 	if (ret > 0) {
947 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
948 
949 		/* Handle various SYNC-type writes */
950 		ret = generic_write_sync(iocb, ret);
951 	}
952 	return ret;
953 }
954 
955 STATIC ssize_t
956 xfs_file_buffered_write(
957 	struct kiocb		*iocb,
958 	struct iov_iter		*from)
959 {
960 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
961 	struct xfs_inode	*ip = XFS_I(inode);
962 	ssize_t			ret;
963 	bool			cleared_space = false;
964 	unsigned int		iolock;
965 
966 write_retry:
967 	iolock = XFS_IOLOCK_EXCL;
968 	ret = xfs_ilock_iocb(iocb, iolock);
969 	if (ret)
970 		return ret;
971 
972 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
973 	if (ret)
974 		goto out;
975 
976 	trace_xfs_file_buffered_write(iocb, from);
977 	ret = iomap_file_buffered_write(iocb, from,
978 			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
979 			NULL);
980 
981 	/*
982 	 * If we hit a space limit, try to free up some lingering preallocated
983 	 * space before returning an error. In the case of ENOSPC, first try to
984 	 * write back all dirty inodes to free up some of the excess reserved
985 	 * metadata space. This reduces the chances that the eofblocks scan
986 	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
987 	 * also behaves as a filter to prevent too many eofblocks scans from
988 	 * running at the same time.  Use a synchronous scan to increase the
989 	 * effectiveness of the scan.
990 	 */
991 	if (ret == -EDQUOT && !cleared_space) {
992 		xfs_iunlock(ip, iolock);
993 		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
994 		cleared_space = true;
995 		goto write_retry;
996 	} else if (ret == -ENOSPC && !cleared_space) {
997 		struct xfs_icwalk	icw = {0};
998 
999 		cleared_space = true;
1000 		xfs_flush_inodes(ip->i_mount);
1001 
1002 		xfs_iunlock(ip, iolock);
1003 		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
1004 		xfs_blockgc_free_space(ip->i_mount, &icw);
1005 		goto write_retry;
1006 	}
1007 
1008 out:
1009 	if (iolock)
1010 		xfs_iunlock(ip, iolock);
1011 
1012 	if (ret > 0) {
1013 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
1014 		/* Handle various SYNC-type writes */
1015 		ret = generic_write_sync(iocb, ret);
1016 	}
1017 	return ret;
1018 }
1019 
1020 STATIC ssize_t
1021 xfs_file_buffered_write_zoned(
1022 	struct kiocb		*iocb,
1023 	struct iov_iter		*from)
1024 {
1025 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
1026 	struct xfs_mount	*mp = ip->i_mount;
1027 	unsigned int		iolock = XFS_IOLOCK_EXCL;
1028 	bool			cleared_space = false;
1029 	struct xfs_zone_alloc_ctx ac = { };
1030 	ssize_t			ret;
1031 
1032 	ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
1033 	if (ret < 0)
1034 		return ret;
1035 
1036 	ret = xfs_ilock_iocb(iocb, iolock);
1037 	if (ret)
1038 		goto out_unreserve;
1039 
1040 	ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
1041 	if (ret)
1042 		goto out_unlock;
1043 
1044 	/*
1045 	 * Truncate the iter to the length that we were actually able to
1046 	 * allocate blocks for.  This needs to happen after
1047 	 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
1048 	 * writes.
1049 	 */
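	/*
	 * For example, assuming 4k blocks, 6 reserved blocks and a write
	 * starting 1024 bytes into a block limit the iter to
	 * 6 * 4096 - 1024 = 23552 bytes.
	 */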
1050 	iov_iter_truncate(from,
1051 			XFS_FSB_TO_B(mp, ac.reserved_blocks) -
1052 			(iocb->ki_pos & mp->m_blockmask));
1053 	if (!iov_iter_count(from))
1054 		goto out_unlock;
1055 
1056 retry:
1057 	trace_xfs_file_buffered_write(iocb, from);
1058 	ret = iomap_file_buffered_write(iocb, from,
1059 			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
1060 			&ac);
1061 	if (ret == -ENOSPC && !cleared_space) {
1062 		/*
1063 		 * Kick off writeback to convert delalloc space and release the
1064 		 * usually too pessimistic indirect block reservations.
1065 		 */
1066 		xfs_flush_inodes(mp);
1067 		cleared_space = true;
1068 		goto retry;
1069 	}
1070 
1071 out_unlock:
1072 	xfs_iunlock(ip, iolock);
1073 out_unreserve:
1074 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
1075 	if (ret > 0) {
1076 		XFS_STATS_ADD(mp, xs_write_bytes, ret);
1077 		ret = generic_write_sync(iocb, ret);
1078 	}
1079 	return ret;
1080 }
1081 
1082 STATIC ssize_t
1083 xfs_file_write_iter(
1084 	struct kiocb		*iocb,
1085 	struct iov_iter		*from)
1086 {
1087 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
1088 	struct xfs_inode	*ip = XFS_I(inode);
1089 	ssize_t			ret;
1090 	size_t			ocount = iov_iter_count(from);
1091 
1092 	XFS_STATS_INC(ip->i_mount, xs_write_calls);
1093 
1094 	if (ocount == 0)
1095 		return 0;
1096 
1097 	if (xfs_is_shutdown(ip->i_mount))
1098 		return -EIO;
1099 
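	/*
	 * Atomic writes have to fit between the per-inode minimum and maximum
	 * atomic write sizes and pass the generic validation before going any
	 * further.
	 */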
1100 	if (iocb->ki_flags & IOCB_ATOMIC) {
1101 		if (ocount < xfs_get_atomic_write_min(ip))
1102 			return -EINVAL;
1103 
1104 		if (ocount > xfs_get_atomic_write_max(ip))
1105 			return -EINVAL;
1106 
1107 		ret = generic_atomic_write_valid(iocb, from);
1108 		if (ret)
1109 			return ret;
1110 	}
1111 
1112 	if (IS_DAX(inode))
1113 		return xfs_file_dax_write(iocb, from);
1114 
1115 	if (iocb->ki_flags & IOCB_DIRECT) {
1116 		/*
1117 		 * Allow a directio write to fall back to a buffered
1118 		 * write *only* in the case that we're doing a reflink
1119 		 * CoW.  In all other directio scenarios we do not
1120 		 * allow an operation to fall back to buffered mode.
1121 		 */
1122 		ret = xfs_file_dio_write(iocb, from);
1123 		if (ret != -ENOTBLK)
1124 			return ret;
1125 	}
1126 
1127 	if (xfs_is_zoned_inode(ip))
1128 		return xfs_file_buffered_write_zoned(iocb, from);
1129 	return xfs_file_buffered_write(iocb, from);
1130 }
1131 
1132 /* Does this file, inode, or mount want synchronous writes? */
1133 static inline bool xfs_file_sync_writes(struct file *filp)
1134 {
1135 	struct xfs_inode	*ip = XFS_I(file_inode(filp));
1136 
1137 	if (xfs_has_wsync(ip->i_mount))
1138 		return true;
1139 	if (filp->f_flags & (__O_SYNC | O_DSYNC))
1140 		return true;
1141 	if (IS_SYNC(file_inode(filp)))
1142 		return true;
1143 
1144 	return false;
1145 }
1146 
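/*
 * Work out whether a fallocate request extends the file and, if so, validate
 * the new size.  *new_size is only set for extending requests without
 * FALLOC_FL_KEEP_SIZE.
 */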
1147 static int
1148 xfs_falloc_newsize(
1149 	struct file		*file,
1150 	int			mode,
1151 	loff_t			offset,
1152 	loff_t			len,
1153 	loff_t			*new_size)
1154 {
1155 	struct inode		*inode = file_inode(file);
1156 
1157 	if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
1158 		return 0;
1159 	*new_size = offset + len;
1160 	return inode_newsize_ok(inode, *new_size);
1161 }
1162 
1163 static int
1164 xfs_falloc_setsize(
1165 	struct file		*file,
1166 	loff_t			new_size)
1167 {
1168 	struct iattr iattr = {
1169 		.ia_valid	= ATTR_SIZE,
1170 		.ia_size	= new_size,
1171 	};
1172 
1173 	if (!new_size)
1174 		return 0;
1175 	return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
1176 			&iattr);
1177 }
1178 
1179 static int
1180 xfs_falloc_collapse_range(
1181 	struct file		*file,
1182 	loff_t			offset,
1183 	loff_t			len,
1184 	struct xfs_zone_alloc_ctx *ac)
1185 {
1186 	struct inode		*inode = file_inode(file);
1187 	loff_t			new_size = i_size_read(inode) - len;
1188 	int			error;
1189 
1190 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1191 		return -EINVAL;
1192 
1193 	/*
1194 	 * The collapse range must not reach or extend beyond EOF, as that
1195 	 * would effectively be a truncate operation
1196 	 */
1197 	if (offset + len >= i_size_read(inode))
1198 		return -EINVAL;
1199 
1200 	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
1201 	if (error)
1202 		return error;
1203 	return xfs_falloc_setsize(file, new_size);
1204 }
1205 
1206 static int
1207 xfs_falloc_insert_range(
1208 	struct file		*file,
1209 	loff_t			offset,
1210 	loff_t			len)
1211 {
1212 	struct inode		*inode = file_inode(file);
1213 	loff_t			isize = i_size_read(inode);
1214 	int			error;
1215 
1216 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1217 		return -EINVAL;
1218 
1219 	/*
1220 	 * New inode size must not exceed ->s_maxbytes, accounting for
1221 	 * possible signed overflow.
1222 	 */
1223 	if (inode->i_sb->s_maxbytes - isize < len)
1224 		return -EFBIG;
1225 
1226 	/* Offset should be less than i_size */
1227 	if (offset >= isize)
1228 		return -EINVAL;
1229 
1230 	error = xfs_falloc_setsize(file, isize + len);
1231 	if (error)
1232 		return error;
1233 
1234 	/*
1235 	 * Perform hole insertion now that the file size has been updated so
1236 	 * that if we crash during the operation we don't leave shifted extents
1237 	 * past EOF and hence lose access to the data that is contained within
1238 	 * them.
1239 	 */
1240 	return xfs_insert_file_space(XFS_I(inode), offset, len);
1241 }
1242 
1243 /*
1244  * Punch a hole and prealloc the range.  We use a hole punch rather than
1245  * unwritten extent conversion for two reasons:
1246  *
1247  *   1.) Hole punch handles partial block zeroing for us.
1248  *   2.) If prealloc returns ENOSPC, the file range is still zero-valued by
1249  *	 virtue of the hole punch.
1250  */
1251 static int
1252 xfs_falloc_zero_range(
1253 	struct file		*file,
1254 	int			mode,
1255 	loff_t			offset,
1256 	loff_t			len,
1257 	struct xfs_zone_alloc_ctx *ac)
1258 {
1259 	struct inode		*inode = file_inode(file);
1260 	struct xfs_inode	*ip = XFS_I(inode);
1261 	unsigned int		blksize = i_blocksize(inode);
1262 	loff_t			new_size = 0;
1263 	int			error;
1264 
1265 	trace_xfs_zero_file_space(ip);
1266 
1267 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1268 	if (error)
1269 		return error;
1270 
1271 	/*
1272 	 * Zero range implements a full zeroing mechanism but is only used in
1273 	 * limited situations. It is more efficient to allocate unwritten
1274 	 * extents than to perform zeroing here, so use an errortag to randomly
1275 	 * force zeroing on DEBUG kernels for added test coverage.
1276 	 */
1277 	if (XFS_TEST_ERROR(ip->i_mount,
1278 			   XFS_ERRTAG_FORCE_ZERO_RANGE)) {
1279 		error = xfs_zero_range(ip, offset, len, ac, NULL);
1280 	} else {
1281 		error = xfs_free_file_space(ip, offset, len, ac);
1282 		if (error)
1283 			return error;
1284 
1285 		len = round_up(offset + len, blksize) -
1286 			round_down(offset, blksize);
1287 		offset = round_down(offset, blksize);
1288 		error = xfs_alloc_file_space(ip, offset, len);
1289 	}
1290 	if (error)
1291 		return error;
1292 	return xfs_falloc_setsize(file, new_size);
1293 }
1294 
1295 static int
1296 xfs_falloc_unshare_range(
1297 	struct file		*file,
1298 	int			mode,
1299 	loff_t			offset,
1300 	loff_t			len)
1301 {
1302 	struct inode		*inode = file_inode(file);
1303 	loff_t			new_size = 0;
1304 	int			error;
1305 
1306 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1307 	if (error)
1308 		return error;
1309 
1310 	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
1311 	if (error)
1312 		return error;
1313 
1314 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1315 	if (error)
1316 		return error;
1317 	return xfs_falloc_setsize(file, new_size);
1318 }
1319 
1320 static int
1321 xfs_falloc_allocate_range(
1322 	struct file		*file,
1323 	int			mode,
1324 	loff_t			offset,
1325 	loff_t			len)
1326 {
1327 	struct inode		*inode = file_inode(file);
1328 	loff_t			new_size = 0;
1329 	int			error;
1330 
1331 	/*
1332 	 * In always_cow mode we can't use preallocations and thus should not
1333 	 * create them.
1334 	 */
1335 	if (xfs_is_always_cow_inode(XFS_I(inode)))
1336 		return -EOPNOTSUPP;
1337 
1338 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1339 	if (error)
1340 		return error;
1341 
1342 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1343 	if (error)
1344 		return error;
1345 	return xfs_falloc_setsize(file, new_size);
1346 }
1347 
1348 #define	XFS_FALLOC_FL_SUPPORTED						\
1349 		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
1350 		 FALLOC_FL_PUNCH_HOLE |	FALLOC_FL_COLLAPSE_RANGE |	\
1351 		 FALLOC_FL_ZERO_RANGE |	FALLOC_FL_INSERT_RANGE |	\
1352 		 FALLOC_FL_UNSHARE_RANGE)
1353 
1354 STATIC long
1355 __xfs_file_fallocate(
1356 	struct file		*file,
1357 	int			mode,
1358 	loff_t			offset,
1359 	loff_t			len,
1360 	struct xfs_zone_alloc_ctx *ac)
1361 {
1362 	struct inode		*inode = file_inode(file);
1363 	struct xfs_inode	*ip = XFS_I(inode);
1364 	long			error;
1365 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1366 
1367 	xfs_ilock(ip, iolock);
1368 	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
1369 	if (error)
1370 		goto out_unlock;
1371 
1372 	/*
1373 	 * Must wait for all AIO to complete before we continue as AIO can
1374 	 * change the file size on completion without holding any locks we
1375 	 * currently hold. We must do this first because AIO can update both
1376 	 * the on disk and in memory inode sizes, and the operations that follow
1377 	 * the on-disk and in-memory inode sizes, and the operations that follow
1378 	 */
1379 	inode_dio_wait(inode);
1380 
1381 	error = file_modified(file);
1382 	if (error)
1383 		goto out_unlock;
1384 
1385 	switch (mode & FALLOC_FL_MODE_MASK) {
1386 	case FALLOC_FL_PUNCH_HOLE:
1387 		error = xfs_free_file_space(ip, offset, len, ac);
1388 		break;
1389 	case FALLOC_FL_COLLAPSE_RANGE:
1390 		error = xfs_falloc_collapse_range(file, offset, len, ac);
1391 		break;
1392 	case FALLOC_FL_INSERT_RANGE:
1393 		error = xfs_falloc_insert_range(file, offset, len);
1394 		break;
1395 	case FALLOC_FL_ZERO_RANGE:
1396 		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
1397 		break;
1398 	case FALLOC_FL_UNSHARE_RANGE:
1399 		error = xfs_falloc_unshare_range(file, mode, offset, len);
1400 		break;
1401 	case FALLOC_FL_ALLOCATE_RANGE:
1402 		error = xfs_falloc_allocate_range(file, mode, offset, len);
1403 		break;
1404 	default:
1405 		error = -EOPNOTSUPP;
1406 		break;
1407 	}
1408 
1409 	if (!error && xfs_file_sync_writes(file))
1410 		error = xfs_log_force_inode(ip);
1411 
1412 out_unlock:
1413 	xfs_iunlock(ip, iolock);
1414 	return error;
1415 }
1416 
1417 static long
1418 xfs_file_zoned_fallocate(
1419 	struct file		*file,
1420 	int			mode,
1421 	loff_t			offset,
1422 	loff_t			len)
1423 {
1424 	struct xfs_zone_alloc_ctx ac = { };
1425 	struct xfs_inode	*ip = XFS_I(file_inode(file));
1426 	int			error;
1427 
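	/*
	 * Reserve two blocks: one each for zeroing the partial block at the
	 * start and at the end of the range, see the comment in
	 * xfs_file_fallocate() below.
	 */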
1428 	error = xfs_zoned_space_reserve(ip->i_mount, 2, XFS_ZR_RESERVED, &ac);
1429 	if (error)
1430 		return error;
1431 	error = __xfs_file_fallocate(file, mode, offset, len, &ac);
1432 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
1433 	return error;
1434 }
1435 
1436 static long
1437 xfs_file_fallocate(
1438 	struct file		*file,
1439 	int			mode,
1440 	loff_t			offset,
1441 	loff_t			len)
1442 {
1443 	struct inode		*inode = file_inode(file);
1444 
1445 	if (!S_ISREG(inode->i_mode))
1446 		return -EINVAL;
1447 	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
1448 		return -EOPNOTSUPP;
1449 
1450 	/*
1451 	 * For zoned file systems, zeroing the first and last block of a hole
1452 	 * punch requires allocating a new block to rewrite the remaining data
1453 	 * and new zeroes out of place.  Get a reservation for those before
1454 	 * taking the iolock.  Dip into the reserved pool because we are
1455 	 * expected to be able to punch a hole even on a completely full
1456 	 * file system.
1457 	 */
1458 	if (xfs_is_zoned_inode(XFS_I(inode)) &&
1459 	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
1460 		     FALLOC_FL_COLLAPSE_RANGE)))
1461 		return xfs_file_zoned_fallocate(file, mode, offset, len);
1462 	return __xfs_file_fallocate(file, mode, offset, len, NULL);
1463 }
1464 
1465 STATIC int
1466 xfs_file_fadvise(
1467 	struct file	*file,
1468 	loff_t		start,
1469 	loff_t		end,
1470 	int		advice)
1471 {
1472 	struct xfs_inode *ip = XFS_I(file_inode(file));
1473 	int ret;
1474 	int lockflags = 0;
1475 
1476 	/*
1477 	 * Operations creating pages in page cache need protection from hole
1478 	 * punching and similar ops
1479 	 */
1480 	if (advice == POSIX_FADV_WILLNEED) {
1481 		lockflags = XFS_IOLOCK_SHARED;
1482 		xfs_ilock(ip, lockflags);
1483 	}
1484 	ret = generic_fadvise(file, start, end, advice);
1485 	if (lockflags)
1486 		xfs_iunlock(ip, lockflags);
1487 	return ret;
1488 }
1489 
1490 STATIC loff_t
1491 xfs_file_remap_range(
1492 	struct file		*file_in,
1493 	loff_t			pos_in,
1494 	struct file		*file_out,
1495 	loff_t			pos_out,
1496 	loff_t			len,
1497 	unsigned int		remap_flags)
1498 {
1499 	struct inode		*inode_in = file_inode(file_in);
1500 	struct xfs_inode	*src = XFS_I(inode_in);
1501 	struct inode		*inode_out = file_inode(file_out);
1502 	struct xfs_inode	*dest = XFS_I(inode_out);
1503 	struct xfs_mount	*mp = src->i_mount;
1504 	loff_t			remapped = 0;
1505 	xfs_extlen_t		cowextsize;
1506 	int			ret;
1507 
1508 	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1509 		return -EINVAL;
1510 
1511 	if (!xfs_has_reflink(mp))
1512 		return -EOPNOTSUPP;
1513 
1514 	if (xfs_is_shutdown(mp))
1515 		return -EIO;
1516 
1517 	/* Prepare and then clone file data. */
1518 	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1519 			&len, remap_flags);
1520 	if (ret || len == 0)
1521 		return ret;
1522 
1523 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1524 
1525 	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1526 			&remapped);
1527 	if (ret)
1528 		goto out_unlock;
1529 
1530 	/*
1531 	 * Carry the cowextsize hint from src to dest if we're sharing the
1532 	 * entire source file to the entire destination file, the source file
1533 	 * has a cowextsize hint, and the destination file does not.
1534 	 */
1535 	cowextsize = 0;
1536 	if (pos_in == 0 && len == i_size_read(inode_in) &&
1537 	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1538 	    pos_out == 0 && len >= i_size_read(inode_out) &&
1539 	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1540 		cowextsize = src->i_cowextsize;
1541 
1542 	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1543 			remap_flags);
1544 	if (ret)
1545 		goto out_unlock;
1546 
1547 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1548 		xfs_log_force_inode(dest);
1549 out_unlock:
1550 	xfs_iunlock2_remapping(src, dest);
1551 	if (ret)
1552 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1553 	/*
1554 	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
1555 	 * handle partial results -- either the whole remap succeeds, or we
1556 	 * must say why it did not.  In this case, any error should be returned
1557 	 * to the caller.
1558 	 */
1559 	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
1560 		return ret;
1561 	return remapped > 0 ? remapped : ret;
1562 }
1563 
1564 STATIC int
1565 xfs_file_open(
1566 	struct inode	*inode,
1567 	struct file	*file)
1568 {
1569 	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1570 		return -EIO;
1571 	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
1572 	if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
1573 		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
1574 	return generic_file_open(inode, file);
1575 }
1576 
1577 STATIC int
1578 xfs_dir_open(
1579 	struct inode	*inode,
1580 	struct file	*file)
1581 {
1582 	struct xfs_inode *ip = XFS_I(inode);
1583 	unsigned int	mode;
1584 	int		error;
1585 
1586 	if (xfs_is_shutdown(ip->i_mount))
1587 		return -EIO;
1588 	error = generic_file_open(inode, file);
1589 	if (error)
1590 		return error;
1591 
1592 	/*
1593 	 * If there are any blocks, read-ahead block 0 as we're almost
1594 	 * certain to have the next operation be a read there.
1595 	 */
1596 	mode = xfs_ilock_data_map_shared(ip);
1597 	if (ip->i_df.if_nextents > 0)
1598 		error = xfs_dir3_data_readahead(ip, 0, 0);
1599 	xfs_iunlock(ip, mode);
1600 	return error;
1601 }
1602 
1603 /*
1604  * Don't bother propagating errors.  We're just doing cleanup, and the caller
1605  * ignores the return value anyway.
1606  */
1607 STATIC int
1608 xfs_file_release(
1609 	struct inode		*inode,
1610 	struct file		*file)
1611 {
1612 	struct xfs_inode	*ip = XFS_I(inode);
1613 	struct xfs_mount	*mp = ip->i_mount;
1614 
1615 	/*
1616 	 * If this is a read-only mount or the file system has been shut down,
1617 	 * don't generate I/O.
1618 	 */
1619 	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
1620 		return 0;
1621 
1622 	/*
1623 	 * If we previously truncated this file and removed old data in the
1624 	 * process, we want to initiate "early" writeout on the last close.
1625 	 * This is an attempt to combat the notorious NULL files problem which
1626 	 * is particularly noticeable from a truncate down, buffered (re-)write
1627 	 * (delalloc), followed by a crash.  What we are effectively doing here
1628 	 * is significantly reducing the time window where we'd otherwise be
1629 	 * exposed to that problem.
1630 	 */
1631 	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
1632 		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
1633 		if (ip->i_delayed_blks > 0)
1634 			filemap_flush(inode->i_mapping);
1635 	}
1636 
1637 	/*
1638 	 * XFS aggressively preallocates post-EOF space to generate contiguous
1639 	 * allocations for writers that append to the end of the file.
1640 	 *
1641 	 * To support workloads that close and reopen the file frequently, these
1642 	 * preallocations usually persist after a close unless it is the first
1643 	 * close for the inode.  This is a tradeoff to generate tightly packed
1644 	 * data layouts for unpacking tarballs or similar archives that write
1645 	 * one file after another without going back to it while keeping the
1646 	 * preallocation for files that have recurring open/write/close cycles.
1647 	 *
1648 	 * This heuristic is skipped for inodes with the append-only flag as
1649 	 * that flag is rather pointless for inodes written only once.
1650 	 *
1651 	 * There is no point in freeing blocks here for open but unlinked files
1652 	 * as they will be taken care of by the inactivation path soon.
1653 	 *
1654 	 * When releasing a read-only context, don't flush data or trim post-EOF
1655 	 * blocks.  This prevents open/read/close workloads from removing EOF
1656 	 * blocks that other writers depend upon to reduce fragmentation.
1657 	 *
1658 	 * Inodes on the zoned RT device never have preallocations, so skip
1659 	 * taking the locks below.
1660 	 */
1661 	if (!inode->i_nlink ||
1662 	    !(file->f_mode & FMODE_WRITE) ||
1663 	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
1664 	    xfs_is_zoned_inode(ip))
1665 		return 0;
1666 
1667 	/*
1668 	 * If we can't get the iolock just skip truncating the blocks past EOF
1669 	 * because we could deadlock with the mmap_lock otherwise. We'll get
1670 	 * another chance to drop them once the last reference to the inode is
1671 	 * dropped, so we'll never leak blocks permanently.
1672 	 */
1673 	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
1674 	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1675 		if (xfs_can_free_eofblocks(ip) &&
1676 		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
1677 			xfs_free_eofblocks(ip);
1678 		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1679 	}
1680 
1681 	return 0;
1682 }
1683 
1684 STATIC int
1685 xfs_file_readdir(
1686 	struct file	*file,
1687 	struct dir_context *ctx)
1688 {
1689 	struct inode	*inode = file_inode(file);
1690 	xfs_inode_t	*ip = XFS_I(inode);
1691 	size_t		bufsize;
1692 
1693 	/*
1694 	 * The Linux API doesn't pass the total size of the buffer
1695 	 * we read into down to the filesystem.  With the filldir concept
1696 	 * it's not needed for correct information, but the XFS dir2 leaf
1697 	 * code wants an estimate of the buffer size to calculate its
1698 	 * readahead window and size the buffers used for mapping to
1699 	 * physical blocks.
1700 	 *
1701 	 * Try to give it an estimate that's good enough, maybe at some
1702 	 * point we can change the ->readdir prototype to include the
1703 	 * buffer size.  For now we use the current glibc buffer size.
1704 	 */
1705 	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1706 
1707 	return xfs_readdir(NULL, ip, ctx, bufsize);
1708 }
1709 
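/*
 * SEEK_HOLE and SEEK_DATA are implemented through the iomap seek helpers;
 * all other whence values are handled by generic_file_llseek().
 */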
1710 STATIC loff_t
1711 xfs_file_llseek(
1712 	struct file	*file,
1713 	loff_t		offset,
1714 	int		whence)
1715 {
1716 	struct inode		*inode = file->f_mapping->host;
1717 
1718 	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1719 		return -EIO;
1720 
1721 	switch (whence) {
1722 	default:
1723 		return generic_file_llseek(file, offset, whence);
1724 	case SEEK_HOLE:
1725 		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1726 		break;
1727 	case SEEK_DATA:
1728 		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1729 		break;
1730 	}
1731 
1732 	if (offset < 0)
1733 		return offset;
1734 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1735 }
1736 
1737 static inline vm_fault_t
1738 xfs_dax_fault_locked(
1739 	struct vm_fault		*vmf,
1740 	unsigned int		order,
1741 	bool			write_fault)
1742 {
1743 	vm_fault_t		ret;
1744 	unsigned long		pfn;
1745 
1746 	if (!IS_ENABLED(CONFIG_FS_DAX)) {
1747 		ASSERT(0);
1748 		return VM_FAULT_SIGBUS;
1749 	}
1750 	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
1751 			(write_fault && !vmf->cow_page) ?
1752 				&xfs_dax_write_iomap_ops :
1753 				&xfs_read_iomap_ops);
1754 	if (ret & VM_FAULT_NEEDDSYNC)
1755 		ret = dax_finish_sync_fault(vmf, order, pfn);
1756 	return ret;
1757 }
1758 
1759 static vm_fault_t
1760 xfs_dax_read_fault(
1761 	struct vm_fault		*vmf,
1762 	unsigned int		order)
1763 {
1764 	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
1765 	vm_fault_t		ret;
1766 
1767 	trace_xfs_read_fault(ip, order);
1768 
1769 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1770 	ret = xfs_dax_fault_locked(vmf, order, false);
1771 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1772 
1773 	return ret;
1774 }
1775 
1776 /*
1777  * Locking for serialisation of IO during page faults. This results in a lock
1778  * ordering of:
1779  *
1780  * mmap_lock (MM)
1781  *   sb_start_pagefault(vfs, freeze)
1782  *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1783  *       page_lock (MM)
1784  *         i_lock (XFS - extent map serialisation)
1785  */
1786 static vm_fault_t
1787 __xfs_write_fault(
1788 	struct vm_fault		*vmf,
1789 	unsigned int		order,
1790 	struct xfs_zone_alloc_ctx *ac)
1791 {
1792 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1793 	struct xfs_inode	*ip = XFS_I(inode);
1794 	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
1795 	vm_fault_t		ret;
1796 
1797 	trace_xfs_write_fault(ip, order);
1798 
1799 	sb_start_pagefault(inode->i_sb);
1800 	file_update_time(vmf->vma->vm_file);
1801 
1802 	/*
1803 	 * Normally we only need the shared mmaplock, but if a reflink remap is
1804 	 * in progress we take the exclusive lock to wait for the remap to
1805 	 * finish before taking a write fault.
1806 	 */
1807 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1808 	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
1809 		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1810 		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1811 		lock_mode = XFS_MMAPLOCK_EXCL;
1812 	}
1813 
1814 	if (IS_DAX(inode))
1815 		ret = xfs_dax_fault_locked(vmf, order, true);
1816 	else
1817 		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
1818 				ac);
1819 	xfs_iunlock(ip, lock_mode);
1820 
1821 	sb_end_pagefault(inode->i_sb);
1822 	return ret;
1823 }
1824 
1825 static vm_fault_t
1826 xfs_write_fault_zoned(
1827 	struct vm_fault		*vmf,
1828 	unsigned int		order)
1829 {
1830 	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
1831 	unsigned int		len = folio_size(page_folio(vmf->page));
1832 	struct xfs_zone_alloc_ctx ac = { };
1833 	int			error;
1834 	vm_fault_t		ret;
1835 
1836 	/*
1837 	 * This could over-allocate as it doesn't check for truncation.
1838 	 *
1839 	 * But as the overallocation is limited to less than a folio and will be
1840 	 * released instantly, that's just fine.
1841 	 */
1842 	error = xfs_zoned_space_reserve(ip->i_mount,
1843 			XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
1844 	if (error < 0)
1845 		return vmf_fs_error(error);
1846 	ret = __xfs_write_fault(vmf, order, &ac);
1847 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
1848 	return ret;
1849 }
1850 
1851 static vm_fault_t
1852 xfs_write_fault(
1853 	struct vm_fault		*vmf,
1854 	unsigned int		order)
1855 {
1856 	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
1857 		return xfs_write_fault_zoned(vmf, order);
1858 	return __xfs_write_fault(vmf, order, NULL);
1859 }
1860 
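/*
 * Only write faults on shared mappings need the write fault path; private
 * mapping writes are CoWed to anonymous memory by the core MM, so the
 * filesystem only ever serves reads for them.
 */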
1861 static inline bool
1862 xfs_is_write_fault(
1863 	struct vm_fault		*vmf)
1864 {
1865 	return (vmf->flags & FAULT_FLAG_WRITE) &&
1866 	       (vmf->vma->vm_flags & VM_SHARED);
1867 }
1868 
1869 static vm_fault_t
1870 xfs_filemap_fault(
1871 	struct vm_fault		*vmf)
1872 {
1873 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1874 
1875 	/* DAX can shortcut the normal fault path on write faults! */
1876 	if (IS_DAX(inode)) {
1877 		if (xfs_is_write_fault(vmf))
1878 			return xfs_write_fault(vmf, 0);
1879 		return xfs_dax_read_fault(vmf, 0);
1880 	}
1881 
1882 	trace_xfs_read_fault(XFS_I(inode), 0);
1883 	return filemap_fault(vmf);
1884 }
1885 
1886 static vm_fault_t
1887 xfs_filemap_huge_fault(
1888 	struct vm_fault		*vmf,
1889 	unsigned int		order)
1890 {
1891 	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1892 		return VM_FAULT_FALLBACK;
1893 
1894 	/* DAX can shortcut the normal fault path on write faults! */
1895 	if (xfs_is_write_fault(vmf))
1896 		return xfs_write_fault(vmf, order);
1897 	return xfs_dax_read_fault(vmf, order);
1898 }
1899 
1900 static vm_fault_t
1901 xfs_filemap_page_mkwrite(
1902 	struct vm_fault		*vmf)
1903 {
1904 	return xfs_write_fault(vmf, 0);
1905 }
1906 
1907 /*
1908  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1909  * on write faults. In reality, it needs to serialise against truncate and
1910  * prepare memory for writing, so handle it as a standard write fault.
1911  */
1912 static vm_fault_t
1913 xfs_filemap_pfn_mkwrite(
1914 	struct vm_fault		*vmf)
1915 {
1916 	return xfs_write_fault(vmf, 0);
1917 }
1918 
1919 static const struct vm_operations_struct xfs_file_vm_ops = {
1920 	.fault		= xfs_filemap_fault,
1921 	.huge_fault	= xfs_filemap_huge_fault,
1922 	.map_pages	= filemap_map_pages,
1923 	.page_mkwrite	= xfs_filemap_page_mkwrite,
1924 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
1925 };
1926 
1927 STATIC int
1928 xfs_file_mmap_prepare(
1929 	struct vm_area_desc	*desc)
1930 {
1931 	struct file		*file = desc->file;
1932 	struct inode		*inode = file_inode(file);
1933 	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));
1934 
1935 	/*
1936 	 * We don't support synchronous mappings for non-DAX files and
1937 	 * for DAX files if the underlying dax_device is not synchronous.
1938 	 */
1939 	if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
1940 				      target->bt_daxdev))
1941 		return -EOPNOTSUPP;
1942 
1943 	file_accessed(file);
1944 	desc->vm_ops = &xfs_file_vm_ops;
1945 	if (IS_DAX(inode))
1946 		desc->vm_flags |= VM_HUGEPAGE;
1947 	return 0;
1948 }
1949 
1950 const struct file_operations xfs_file_operations = {
1951 	.llseek		= xfs_file_llseek,
1952 	.read_iter	= xfs_file_read_iter,
1953 	.write_iter	= xfs_file_write_iter,
1954 	.splice_read	= xfs_file_splice_read,
1955 	.splice_write	= iter_file_splice_write,
1956 	.iopoll		= iocb_bio_iopoll,
1957 	.unlocked_ioctl	= xfs_file_ioctl,
1958 #ifdef CONFIG_COMPAT
1959 	.compat_ioctl	= xfs_file_compat_ioctl,
1960 #endif
1961 	.mmap_prepare	= xfs_file_mmap_prepare,
1962 	.open		= xfs_file_open,
1963 	.release	= xfs_file_release,
1964 	.fsync		= xfs_file_fsync,
1965 	.get_unmapped_area = thp_get_unmapped_area,
1966 	.fallocate	= xfs_file_fallocate,
1967 	.fadvise	= xfs_file_fadvise,
1968 	.remap_file_range = xfs_file_remap_range,
1969 	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
1970 			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
1971 			  FOP_DONTCACHE,
1972 };
1973 
1974 const struct file_operations xfs_dir_file_operations = {
1975 	.open		= xfs_dir_open,
1976 	.read		= generic_read_dir,
1977 	.iterate_shared	= xfs_file_readdir,
1978 	.llseek		= generic_file_llseek,
1979 	.unlocked_ioctl	= xfs_file_ioctl,
1980 #ifdef CONFIG_COMPAT
1981 	.compat_ioctl	= xfs_file_compat_ioctl,
1982 #endif
1983 	.fsync		= xfs_dir_fsync,
1984 };
1985