xref: /linux/fs/xfs/xfs_file.c (revision 2a5574fc57d13031f869c409181bdeadd75770e1)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_bmap.h"
17 #include "xfs_bmap_util.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ioctl.h"
21 #include "xfs_trace.h"
22 #include "xfs_log.h"
23 #include "xfs_icache.h"
24 #include "xfs_pnfs.h"
25 #include "xfs_iomap.h"
26 #include "xfs_reflink.h"
27 #include "xfs_file.h"
28 #include "xfs_aops.h"
29 #include "xfs_zone_alloc.h"
30 
31 #include <linux/dax.h>
32 #include <linux/falloc.h>
33 #include <linux/backing-dev.h>
34 #include <linux/mman.h>
35 #include <linux/fadvise.h>
36 #include <linux/mount.h>
37 
38 static const struct vm_operations_struct xfs_file_vm_ops;
39 
40 /*
41  * Decide if the given file range is aligned to the size of the fundamental
42  * allocation unit for the file.
43  */
44 bool
45 xfs_is_falloc_aligned(
46 	struct xfs_inode	*ip,
47 	loff_t			pos,
48 	long long int		len)
49 {
50 	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);
51 
52 	if (!is_power_of_2(alloc_unit))
53 		return isaligned_64(pos, alloc_unit) &&
54 		       isaligned_64(len, alloc_unit);
55 
56 	return !((pos | len) & (alloc_unit - 1));
57 }
58 
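/*
 * Illustrative sketch, not part of xfs_file.c: how a caller typically uses
 * xfs_is_falloc_aligned() above to reject requests that do not cover whole
 * allocation units.  The helper name is hypothetical; the real callers are
 * the collapse/insert range handlers further down in this file.
 */
static inline int
xfs_falloc_check_aligned_sketch(
	struct xfs_inode	*ip,
	loff_t			offset,
	loff_t			len)
{
	/* Both the start and the length must be allocation-unit multiples. */
	if (!xfs_is_falloc_aligned(ip, offset, len))
		return -EINVAL;
	return 0;
}
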
59 /*
60  * Fsync operations on directories are much simpler than on regular files,
61  * as there is no file data to flush and thus no need for explicit cache
62  * flush operations; nor are there any non-transactional metadata updates
63  * on directories.
64  */
65 STATIC int
66 xfs_dir_fsync(
67 	struct file		*file,
68 	loff_t			start,
69 	loff_t			end,
70 	int			datasync)
71 {
72 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
73 
74 	trace_xfs_dir_fsync(ip);
75 	return xfs_log_force_inode(ip);
76 }
77 
78 static xfs_csn_t
79 xfs_fsync_seq(
80 	struct xfs_inode	*ip,
81 	bool			datasync)
82 {
83 	if (!xfs_ipincount(ip))
84 		return 0;
85 	if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
86 		return 0;
87 	return ip->i_itemp->ili_commit_seq;
88 }
89 
90 /*
91  * All metadata updates are logged, which means that we just have to flush the
92  * log up to the latest LSN that touched the inode.
93  *
94  * If we have concurrent fsync/fdatasync() calls, we need them to all block on
95  * the log force before we clear the ili_fsync_fields field. This ensures that
96  * we don't get a racing sync operation that does not wait for the metadata to
97  * hit the journal before returning.  If we race with clearing ili_fsync_fields,
98  * then all that will happen is the log force will do nothing as the lsn will
99  * already be on disk.  We can't race with setting ili_fsync_fields because that
100  * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
101  * shared until after the ili_fsync_fields is cleared.
102  */
103 static int
104 xfs_fsync_flush_log(
105 	struct xfs_inode	*ip,
106 	bool			datasync,
107 	int			*log_flushed)
108 {
109 	int			error = 0;
110 	xfs_csn_t		seq;
111 
112 	xfs_ilock(ip, XFS_ILOCK_SHARED);
113 	seq = xfs_fsync_seq(ip, datasync);
114 	if (seq) {
115 		error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
116 					  log_flushed);
117 
118 		spin_lock(&ip->i_itemp->ili_lock);
119 		ip->i_itemp->ili_fsync_fields = 0;
120 		spin_unlock(&ip->i_itemp->ili_lock);
121 	}
122 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
123 	return error;
124 }
125 
126 STATIC int
127 xfs_file_fsync(
128 	struct file		*file,
129 	loff_t			start,
130 	loff_t			end,
131 	int			datasync)
132 {
133 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
134 	struct xfs_mount	*mp = ip->i_mount;
135 	int			error, err2;
136 	int			log_flushed = 0;
137 
138 	trace_xfs_file_fsync(ip);
139 
140 	error = file_write_and_wait_range(file, start, end);
141 	if (error)
142 		return error;
143 
144 	if (xfs_is_shutdown(mp))
145 		return -EIO;
146 
147 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
148 
149 	/*
150 	 * If we have an RT and/or log subvolume we need to make sure to flush
151 	 * the write cache of the device used for file data first.  This is to
152 	 * ensure newly written file data makes it to disk before logging the
153 	 * new inode size in case of an extending write.
154 	 */
155 	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
156 		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
157 	else if (mp->m_logdev_targp != mp->m_ddev_targp)
158 		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
159 
160 	/*
161 	 * Any inode that has dirty modifications in the log is pinned.  The
162 	 * racy check here for a pinned inode will not catch modifications
163 	 * that happen concurrently to the fsync call, but fsync semantics
164 	 * only require to sync previously completed I/O.
165 	 */
166 	if (xfs_ipincount(ip)) {
167 		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
168 		if (err2 && !error)
169 			error = err2;
170 	}
171 
172 	/*
173 	 * If we only have a single device, and the log force above was
174 	 * a no-op, we might have to flush the data device cache here.
175 	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
176 	 * an already allocated file and thus do not have any metadata to
177 	 * commit.
178 	 */
179 	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
180 	    mp->m_logdev_targp == mp->m_ddev_targp) {
181 		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
182 		if (err2 && !error)
183 			error = err2;
184 	}
185 
186 	return error;
187 }
188 
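/*
 * Illustrative sketch, not part of xfs_file.c: the pre-log-force cache flush
 * decision from xfs_file_fsync() above, factored into a hypothetical helper
 * to make the multi-device rules easier to read.  A flush is only needed when
 * the device holding the file data is not also covered by the log force
 * issued later.
 */
static inline int
xfs_fsync_flush_data_dev_sketch(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;

	/* Realtime file on a separate RT device: flush that device. */
	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
		return blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
	/* External log device: the data device needs its own flush. */
	if (mp->m_logdev_targp != mp->m_ddev_targp)
		return blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
	return 0;
}
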
189 static int
190 xfs_ilock_iocb(
191 	struct kiocb		*iocb,
192 	unsigned int		lock_mode)
193 {
194 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
195 
196 	if (iocb->ki_flags & IOCB_NOWAIT) {
197 		if (!xfs_ilock_nowait(ip, lock_mode))
198 			return -EAGAIN;
199 	} else {
200 		xfs_ilock(ip, lock_mode);
201 	}
202 
203 	return 0;
204 }
205 
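/*
 * Illustrative sketch, not part of xfs_file.c: the usual calling pattern
 * around xfs_ilock_iocb() above.  An IOCB_NOWAIT (RWF_NOWAIT) request gets
 * -EAGAIN instead of sleeping on a contended iolock; blocking callers simply
 * wait.  The function name is hypothetical.
 */
static inline ssize_t
xfs_ilock_iocb_usage_sketch(
	struct kiocb		*iocb)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;	/* -EAGAIN for IOCB_NOWAIT contention */
	/* ... perform the I/O under XFS_IOLOCK_SHARED ... */
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return 0;
}
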
206 static int
207 xfs_ilock_iocb_for_write(
208 	struct kiocb		*iocb,
209 	unsigned int		*lock_mode)
210 {
211 	ssize_t			ret;
212 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
213 
214 	ret = xfs_ilock_iocb(iocb, *lock_mode);
215 	if (ret)
216 		return ret;
217 
218 	/*
219 	 * If a reflink remap is in progress we always need to take the iolock
220 	 * exclusively to wait for it to finish.
221 	 */
222 	if (*lock_mode == XFS_IOLOCK_SHARED &&
223 	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
224 		xfs_iunlock(ip, *lock_mode);
225 		*lock_mode = XFS_IOLOCK_EXCL;
226 		return xfs_ilock_iocb(iocb, *lock_mode);
227 	}
228 
229 	return 0;
230 }
231 
232 STATIC ssize_t
233 xfs_file_dio_read(
234 	struct kiocb		*iocb,
235 	struct iov_iter		*to)
236 {
237 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
238 	ssize_t			ret;
239 
240 	trace_xfs_file_direct_read(iocb, to);
241 
242 	if (!iov_iter_count(to))
243 		return 0; /* skip atime */
244 
245 	file_accessed(iocb->ki_filp);
246 
247 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
248 	if (ret)
249 		return ret;
250 	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
251 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
252 
253 	return ret;
254 }
255 
256 static noinline ssize_t
257 xfs_file_dax_read(
258 	struct kiocb		*iocb,
259 	struct iov_iter		*to)
260 {
261 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
262 	ssize_t			ret = 0;
263 
264 	trace_xfs_file_dax_read(iocb, to);
265 
266 	if (!iov_iter_count(to))
267 		return 0; /* skip atime */
268 
269 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
270 	if (ret)
271 		return ret;
272 	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
273 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
274 
275 	file_accessed(iocb->ki_filp);
276 	return ret;
277 }
278 
279 STATIC ssize_t
280 xfs_file_buffered_read(
281 	struct kiocb		*iocb,
282 	struct iov_iter		*to)
283 {
284 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
285 	ssize_t			ret;
286 
287 	trace_xfs_file_buffered_read(iocb, to);
288 
289 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
290 	if (ret)
291 		return ret;
292 	ret = generic_file_read_iter(iocb, to);
293 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
294 
295 	return ret;
296 }
297 
298 STATIC ssize_t
299 xfs_file_read_iter(
300 	struct kiocb		*iocb,
301 	struct iov_iter		*to)
302 {
303 	struct inode		*inode = file_inode(iocb->ki_filp);
304 	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
305 	ssize_t			ret = 0;
306 
307 	XFS_STATS_INC(mp, xs_read_calls);
308 
309 	if (xfs_is_shutdown(mp))
310 		return -EIO;
311 
312 	if (IS_DAX(inode))
313 		ret = xfs_file_dax_read(iocb, to);
314 	else if (iocb->ki_flags & IOCB_DIRECT)
315 		ret = xfs_file_dio_read(iocb, to);
316 	else
317 		ret = xfs_file_buffered_read(iocb, to);
318 
319 	if (ret > 0)
320 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
321 	return ret;
322 }
323 
324 STATIC ssize_t
325 xfs_file_splice_read(
326 	struct file		*in,
327 	loff_t			*ppos,
328 	struct pipe_inode_info	*pipe,
329 	size_t			len,
330 	unsigned int		flags)
331 {
332 	struct inode		*inode = file_inode(in);
333 	struct xfs_inode	*ip = XFS_I(inode);
334 	struct xfs_mount	*mp = ip->i_mount;
335 	ssize_t			ret = 0;
336 
337 	XFS_STATS_INC(mp, xs_read_calls);
338 
339 	if (xfs_is_shutdown(mp))
340 		return -EIO;
341 
342 	trace_xfs_file_splice_read(ip, *ppos, len);
343 
344 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
345 	ret = filemap_splice_read(in, ppos, pipe, len, flags);
346 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
347 	if (ret > 0)
348 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
349 	return ret;
350 }
351 
352 /*
353  * Take care of zeroing post-EOF blocks when they might exist.
354  *
355  * Returns 0 on success, a negative error for a failure, or 1 if this
356  * function dropped the iolock and reacquired it exclusively and the caller
357  * needs to restart the write sanity checks.
358  */
359 static ssize_t
360 xfs_file_write_zero_eof(
361 	struct kiocb		*iocb,
362 	struct iov_iter		*from,
363 	unsigned int		*iolock,
364 	size_t			count,
365 	bool			*drained_dio,
366 	struct xfs_zone_alloc_ctx *ac)
367 {
368 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
369 	loff_t			isize;
370 	int			error;
371 
372 	/*
373 	 * We need to serialise against EOF updates that occur in IO completions
374 	 * here. We want to make sure that nobody is changing the size while
375 	 * we do this check until we have placed an IO barrier (i.e. hold
376 	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
377 	 * spinlock effectively forms a memory barrier once we have
378 	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
379 	 * hence be able to correctly determine if we need to run zeroing.
380 	 */
381 	spin_lock(&ip->i_flags_lock);
382 	isize = i_size_read(VFS_I(ip));
383 	if (iocb->ki_pos <= isize) {
384 		spin_unlock(&ip->i_flags_lock);
385 		return 0;
386 	}
387 	spin_unlock(&ip->i_flags_lock);
388 
389 	if (iocb->ki_flags & IOCB_NOWAIT)
390 		return -EAGAIN;
391 
392 	if (!*drained_dio) {
393 		/*
394 		 * If zeroing is needed and we are currently holding the iolock
395 		 * shared, we need to upgrade it to exclusive, which implies
396 		 * having to redo all of the checks done so far.
397 		 */
398 		if (*iolock == XFS_IOLOCK_SHARED) {
399 			xfs_iunlock(ip, *iolock);
400 			*iolock = XFS_IOLOCK_EXCL;
401 			xfs_ilock(ip, *iolock);
402 			iov_iter_reexpand(from, count);
403 		}
404 
405 		/*
406 		 * We now have an IO submission barrier in place, but AIO can do
407 		 * EOF updates during IO completion and hence we now need to
408 		 * wait for all of them to drain.  Non-AIO DIO will have drained
409 		 * before we are given the XFS_IOLOCK_EXCL, and so for most
410 		 * cases this wait is a no-op.
411 		 */
412 		inode_dio_wait(VFS_I(ip));
413 		*drained_dio = true;
414 		return 1;
415 	}
416 
417 	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
418 
419 	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
420 	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
421 	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
422 
423 	return error;
424 }
425 
426 /*
427  * Common pre-write limit and setup checks.
428  *
429  * Called with the iolock held either shared or exclusive according to
430  * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
431  * if called for a direct write beyond i_size.
432  */
433 STATIC ssize_t
434 xfs_file_write_checks(
435 	struct kiocb		*iocb,
436 	struct iov_iter		*from,
437 	unsigned int		*iolock,
438 	struct xfs_zone_alloc_ctx *ac)
439 {
440 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
441 	size_t			count = iov_iter_count(from);
442 	bool			drained_dio = false;
443 	ssize_t			error;
444 
445 restart:
446 	error = generic_write_checks(iocb, from);
447 	if (error <= 0)
448 		return error;
449 
450 	if (iocb->ki_flags & IOCB_NOWAIT) {
451 		error = break_layout(inode, false);
452 		if (error == -EWOULDBLOCK)
453 			error = -EAGAIN;
454 	} else {
455 		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
456 	}
457 
458 	if (error)
459 		return error;
460 
461 	/*
462 	 * For changing security info in file_remove_privs() we need i_rwsem
463 	 * exclusively.
464 	 */
465 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
466 		xfs_iunlock(XFS_I(inode), *iolock);
467 		*iolock = XFS_IOLOCK_EXCL;
468 		error = xfs_ilock_iocb(iocb, *iolock);
469 		if (error) {
470 			*iolock = 0;
471 			return error;
472 		}
473 		goto restart;
474 	}
475 
476 	/*
477 	 * If the offset is beyond the size of the file, we need to zero all
478 	 * blocks that fall between the existing EOF and the start of this
479 	 * write.
480 	 *
481 	 * We can do an unlocked check for i_size here safely as I/O completion
482 	 * can only extend EOF.  Truncate is locked out at this point, so the
483 	 * EOF can not move backwards, only forwards. Hence we only need to take
484 	 * the slow path when we are at or beyond the current EOF.
485 	 */
486 	if (iocb->ki_pos > i_size_read(inode)) {
487 		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
488 				&drained_dio, ac);
489 		if (error == 1)
490 			goto restart;
491 		if (error)
492 			return error;
493 	}
494 
495 	return kiocb_modified(iocb);
496 }
497 
498 static ssize_t
499 xfs_zoned_write_space_reserve(
500 	struct xfs_inode		*ip,
501 	struct kiocb			*iocb,
502 	struct iov_iter			*from,
503 	unsigned int			flags,
504 	struct xfs_zone_alloc_ctx	*ac)
505 {
506 	loff_t				count = iov_iter_count(from);
507 	int				error;
508 
509 	if (iocb->ki_flags & IOCB_NOWAIT)
510 		flags |= XFS_ZR_NOWAIT;
511 
512 	/*
513 	 * Check the rlimit and LFS boundary first so that we don't over-reserve
514 	 * by possibly a lot.
515 	 *
516 	 * The generic write path will redo this check later, and it might have
517 	 * changed by then.  If it got expanded we'll stick to our earlier
518 	 * smaller limit, and if it is decreased the new smaller limit will be
519 	 * used and our extra space reservation will be returned after finishing
520 	 * the write.
521 	 */
522 	error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
523 	if (error)
524 		return error;
525 
526 	/*
527 	 * Sloppily round up count to file system blocks.
528 	 *
529 	 * This will often reserve an extra block, but that avoids having to look
530 	 * at the start offset, which isn't stable for O_APPEND until taking the
531 	 * iolock.  Also we need to reserve a block each for zeroing the old
532 	 * EOF block and the new start block if they are unaligned.
533 	 *
534 	 * Any remaining block will be returned after the write.
535 	 */
536 	return xfs_zoned_space_reserve(ip,
537 			XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
538 }
539 
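/*
 * Illustrative sketch, not part of xfs_file.c: the block count that
 * xfs_zoned_write_space_reserve() above asks for.  It is the byte count
 * rounded up to filesystem blocks, plus one sloppy spare block, plus two
 * blocks for zeroing an unaligned old EOF block and an unaligned new start
 * block.  The helper name is hypothetical.
 */
static inline xfs_filblks_t
xfs_zoned_write_reservation_sketch(
	struct xfs_mount	*mp,
	loff_t			count)
{
	return XFS_B_TO_FSB(mp, count) + 1 + 2;
}
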
540 static int
541 xfs_dio_write_end_io(
542 	struct kiocb		*iocb,
543 	ssize_t			size,
544 	int			error,
545 	unsigned		flags)
546 {
547 	struct inode		*inode = file_inode(iocb->ki_filp);
548 	struct xfs_inode	*ip = XFS_I(inode);
549 	loff_t			offset = iocb->ki_pos;
550 	unsigned int		nofs_flag;
551 
552 	ASSERT(!xfs_is_zoned_inode(ip) ||
553 	       !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
554 
555 	trace_xfs_end_io_direct_write(ip, offset, size);
556 
557 	if (xfs_is_shutdown(ip->i_mount))
558 		return -EIO;
559 
560 	if (error)
561 		return error;
562 	if (!size)
563 		return 0;
564 
565 	/*
566 	 * Capture amount written on completion as we can't reliably account
567 	 * for it on submission.
568 	 */
569 	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
570 
571 	/*
572 	 * We can allocate memory here while doing writeback on behalf of
573 	 * memory reclaim.  To avoid memory allocation deadlocks set the
574 	 * task-wide nofs context for the following operations.
575 	 */
576 	nofs_flag = memalloc_nofs_save();
577 
578 	if (flags & IOMAP_DIO_COW) {
579 		if (iocb->ki_flags & IOCB_ATOMIC)
580 			error = xfs_reflink_end_atomic_cow(ip, offset, size);
581 		else
582 			error = xfs_reflink_end_cow(ip, offset, size);
583 		if (error)
584 			goto out;
585 	}
586 
587 	/*
588 	 * Unwritten conversion updates the in-core isize after extent
589 	 * conversion but before updating the on-disk size. Updating isize any
590 	 * earlier allows a racing dio read to find unwritten extents before
591 	 * they are converted.
592 	 */
593 	if (flags & IOMAP_DIO_UNWRITTEN) {
594 		error = xfs_iomap_write_unwritten(ip, offset, size, true);
595 		goto out;
596 	}
597 
598 	/*
599 	 * We need to update the in-core inode size here so that we don't end up
600 	 * with the on-disk inode size being outside the in-core inode size. We
601 	 * have no other method of updating EOF for AIO, so always do it here
602 	 * if necessary.
603 	 *
604 	 * We need to lock the test/set EOF update as we can be racing with
605 	 * other IO completions here to update the EOF. Failing to serialise
606 	 * here can result in EOF moving backwards and Bad Things Happen when
607 	 * that occurs.
608 	 *
609 	 * As IO completion only ever extends EOF, we can do an unlocked check
610 	 * here to avoid taking the spinlock. If we land within the current EOF,
611 	 * then we do not need to do an extending update at all, and we don't
612 	 * need to take the lock to check this. If we race with an update moving
613 	 * EOF, then we'll either still be beyond EOF and need to take the lock,
614 	 * or we'll be within EOF and we don't need to take it at all.
615 	 */
616 	if (offset + size <= i_size_read(inode))
617 		goto out;
618 
619 	spin_lock(&ip->i_flags_lock);
620 	if (offset + size > i_size_read(inode)) {
621 		i_size_write(inode, offset + size);
622 		spin_unlock(&ip->i_flags_lock);
623 		error = xfs_setfilesize(ip, offset, size);
624 	} else {
625 		spin_unlock(&ip->i_flags_lock);
626 	}
627 
628 out:
629 	memalloc_nofs_restore(nofs_flag);
630 	return error;
631 }
632 
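/*
 * Illustrative sketch, not part of xfs_file.c: the "unlocked check, locked
 * re-check" EOF update pattern used above in xfs_dio_write_end_io().  The
 * unlocked test filters out the common non-extending case; the locked
 * re-test guarantees EOF only ever moves forward even with racing
 * completions.  The helper name is hypothetical.
 */
static inline bool
xfs_dio_extend_isize_sketch(
	struct xfs_inode	*ip,
	loff_t			offset,
	ssize_t			size)
{
	struct inode		*inode = VFS_I(ip);
	bool			extended = false;

	if (offset + size <= i_size_read(inode))
		return false;			/* unlocked fast path */

	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		extended = true;
	}
	spin_unlock(&ip->i_flags_lock);
	return extended;
}
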
633 static const struct iomap_dio_ops xfs_dio_write_ops = {
634 	.end_io		= xfs_dio_write_end_io,
635 };
636 
637 static void
638 xfs_dio_zoned_submit_io(
639 	const struct iomap_iter	*iter,
640 	struct bio		*bio,
641 	loff_t			file_offset)
642 {
643 	struct xfs_mount	*mp = XFS_I(iter->inode)->i_mount;
644 	struct xfs_zone_alloc_ctx *ac = iter->private;
645 	xfs_filblks_t		count_fsb;
646 	struct iomap_ioend	*ioend;
647 
648 	count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
649 	if (count_fsb > ac->reserved_blocks) {
650 		xfs_err(mp,
651 "allocation (%lld) larger than reservation (%lld).",
652 			count_fsb, ac->reserved_blocks);
653 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
654 		bio_io_error(bio);
655 		return;
656 	}
657 	ac->reserved_blocks -= count_fsb;
658 
659 	bio->bi_end_io = xfs_end_bio;
660 	ioend = iomap_init_ioend(iter->inode, bio, file_offset,
661 			IOMAP_IOEND_DIRECT);
662 	xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
663 }
664 
665 static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
666 	.bio_set	= &iomap_ioend_bioset,
667 	.submit_io	= xfs_dio_zoned_submit_io,
668 	.end_io		= xfs_dio_write_end_io,
669 };
670 
671 /*
672  * Handle block aligned direct I/O writes.
673  */
674 static noinline ssize_t
675 xfs_file_dio_write_aligned(
676 	struct xfs_inode	*ip,
677 	struct kiocb		*iocb,
678 	struct iov_iter		*from,
679 	const struct iomap_ops	*ops,
680 	const struct iomap_dio_ops *dops,
681 	struct xfs_zone_alloc_ctx *ac)
682 {
683 	unsigned int		iolock = XFS_IOLOCK_SHARED;
684 	ssize_t			ret;
685 
686 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
687 	if (ret)
688 		return ret;
689 	ret = xfs_file_write_checks(iocb, from, &iolock, ac);
690 	if (ret)
691 		goto out_unlock;
692 
693 	/*
694 	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
695 	 * the iolock back to shared if we had to take the exclusive lock in
696 	 * xfs_file_write_checks() for other reasons.
697 	 */
698 	if (iolock == XFS_IOLOCK_EXCL) {
699 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
700 		iolock = XFS_IOLOCK_SHARED;
701 	}
702 	trace_xfs_file_direct_write(iocb, from);
703 	ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
704 out_unlock:
705 	xfs_iunlock(ip, iolock);
706 	return ret;
707 }
708 
709 /*
710  * Handle block aligned direct I/O writes to zoned devices.
711  */
712 static noinline ssize_t
713 xfs_file_dio_write_zoned(
714 	struct xfs_inode	*ip,
715 	struct kiocb		*iocb,
716 	struct iov_iter		*from)
717 {
718 	struct xfs_zone_alloc_ctx ac = { };
719 	ssize_t			ret;
720 
721 	ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
722 	if (ret < 0)
723 		return ret;
724 	ret = xfs_file_dio_write_aligned(ip, iocb, from,
725 			&xfs_zoned_direct_write_iomap_ops,
726 			&xfs_dio_zoned_write_ops, &ac);
727 	xfs_zoned_space_unreserve(ip, &ac);
728 	return ret;
729 }
730 
731 /*
732  * Handle block atomic writes
733  *
734  * Two methods of atomic writes are supported:
735  * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
736  *   disk
737  * - COW-based, which uses a COW fork as a staging extent for data updates
738  *   before atomically updating extent mappings for the range being written
739  *
740  */
741 static noinline ssize_t
742 xfs_file_dio_write_atomic(
743 	struct xfs_inode	*ip,
744 	struct kiocb		*iocb,
745 	struct iov_iter		*from)
746 {
747 	unsigned int		iolock = XFS_IOLOCK_SHARED;
748 	ssize_t			ret, ocount = iov_iter_count(from);
749 	const struct iomap_ops	*dops;
750 
751 	/*
752 	 * HW offload should be faster, so try that first if it is already
753 	 * known that the write length is not too large.
754 	 */
755 	if (ocount > xfs_inode_buftarg(ip)->bt_bdev_awu_max)
756 		dops = &xfs_atomic_write_cow_iomap_ops;
757 	else
758 		dops = &xfs_direct_write_iomap_ops;
759 
760 retry:
761 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
762 	if (ret)
763 		return ret;
764 
765 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
766 	if (ret)
767 		goto out_unlock;
768 
769 	/* Demote similar to xfs_file_dio_write_aligned() */
770 	if (iolock == XFS_IOLOCK_EXCL) {
771 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
772 		iolock = XFS_IOLOCK_SHARED;
773 	}
774 
775 	trace_xfs_file_direct_write(iocb, from);
776 	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
777 			0, NULL, 0);
778 
779 	/*
780 	 * The retry mechanism is based on the ->iomap_begin method returning
781 	 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
782 	 * possible. The REQ_ATOMIC-based method typically not be possible if
783 	 * the write spans multiple extents or the disk blocks are misaligned.
784 	 */
785 	if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
786 		xfs_iunlock(ip, iolock);
787 		dops = &xfs_atomic_write_cow_iomap_ops;
788 		goto retry;
789 	}
790 
791 out_unlock:
792 	if (iolock)
793 		xfs_iunlock(ip, iolock);
794 	return ret;
795 }
796 
797 /*
798  * Handle block unaligned direct I/O writes
799  *
800  * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
801  * them to be done in parallel with reads and other direct I/O writes.  However,
802  * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
803  * to do sub-block zeroing and that requires serialisation against other direct
804  * I/O to the same block.  In this case we need to serialise the submission of
805  * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
806  * In the case where sub-block zeroing is not required, we can do concurrent
807  * sub-block dios to the same block successfully.
808  *
809  * Optimistically submit the I/O using the shared lock first, but use the
810  * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
811  * if block allocation or partial block zeroing would be required.  In that case
812  * we try again with the exclusive lock.
813  */
814 static noinline ssize_t
815 xfs_file_dio_write_unaligned(
816 	struct xfs_inode	*ip,
817 	struct kiocb		*iocb,
818 	struct iov_iter		*from)
819 {
820 	size_t			isize = i_size_read(VFS_I(ip));
821 	size_t			count = iov_iter_count(from);
822 	unsigned int		iolock = XFS_IOLOCK_SHARED;
823 	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
824 	ssize_t			ret;
825 
826 	/*
827 	 * Extending writes need exclusivity because of the sub-block zeroing
828 	 * that the DIO code always does for partial tail blocks beyond EOF, so
829 	 * don't even bother trying the fast path in this case.
830 	 */
831 	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
832 		if (iocb->ki_flags & IOCB_NOWAIT)
833 			return -EAGAIN;
834 retry_exclusive:
835 		iolock = XFS_IOLOCK_EXCL;
836 		flags = IOMAP_DIO_FORCE_WAIT;
837 	}
838 
839 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
840 	if (ret)
841 		return ret;
842 
843 	/*
844 	 * We can't properly handle unaligned direct I/O to reflink files yet,
845 	 * as we can't unshare a partial block.
846 	 */
847 	if (xfs_is_cow_inode(ip)) {
848 		trace_xfs_reflink_bounce_dio_write(iocb, from);
849 		ret = -ENOTBLK;
850 		goto out_unlock;
851 	}
852 
853 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
854 	if (ret)
855 		goto out_unlock;
856 
857 	/*
858 	 * If we are doing exclusive unaligned I/O, this must be the only I/O
859 	 * in-flight.  Otherwise we risk data corruption due to unwritten extent
860 	 * conversions from the AIO end_io handler.  Wait for all other I/O to
861 	 * drain first.
862 	 */
863 	if (flags & IOMAP_DIO_FORCE_WAIT)
864 		inode_dio_wait(VFS_I(ip));
865 
866 	trace_xfs_file_direct_write(iocb, from);
867 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
868 			   &xfs_dio_write_ops, flags, NULL, 0);
869 
870 	/*
871 	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
872 	 * layer rejected it for mapping or locking reasons. If we are doing
873 	 * nonblocking user I/O, propagate the error.
874 	 */
875 	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
876 		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
877 		xfs_iunlock(ip, iolock);
878 		goto retry_exclusive;
879 	}
880 
881 out_unlock:
882 	if (iolock)
883 		xfs_iunlock(ip, iolock);
884 	return ret;
885 }
886 
887 static ssize_t
888 xfs_file_dio_write(
889 	struct kiocb		*iocb,
890 	struct iov_iter		*from)
891 {
892 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
893 	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
894 	size_t			count = iov_iter_count(from);
895 
896 	/* direct I/O must be aligned to device logical sector size */
897 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
898 		return -EINVAL;
899 
900 	/*
901 	 * For always COW inodes we also must check the alignment of each
902 	 * individual iovec segment, as they could end up with different
903 	 * I/Os due to the way bio_iov_iter_get_pages works, and we'd
904 	 * then overwrite an already written block.
905 	 */
906 	if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
907 	    (xfs_is_always_cow_inode(ip) &&
908 	     (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
909 		return xfs_file_dio_write_unaligned(ip, iocb, from);
910 	if (xfs_is_zoned_inode(ip))
911 		return xfs_file_dio_write_zoned(ip, iocb, from);
912 	if (iocb->ki_flags & IOCB_ATOMIC)
913 		return xfs_file_dio_write_atomic(ip, iocb, from);
914 	return xfs_file_dio_write_aligned(ip, iocb, from,
915 			&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
916 }
917 
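/*
 * Illustrative sketch, not part of xfs_file.c: the sector alignment rule
 * enforced at the top of xfs_file_dio_write() above.  Both the file position
 * and the total length must be multiples of the device logical sector size
 * or the direct write is rejected outright.  The helper name is hypothetical.
 */
static inline bool
xfs_dio_sector_aligned_sketch(
	struct xfs_inode	*ip,
	loff_t			pos,
	size_t			count)
{
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);

	return !((pos | count) & target->bt_logical_sectormask);
}
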
918 static noinline ssize_t
919 xfs_file_dax_write(
920 	struct kiocb		*iocb,
921 	struct iov_iter		*from)
922 {
923 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
924 	struct xfs_inode	*ip = XFS_I(inode);
925 	unsigned int		iolock = XFS_IOLOCK_EXCL;
926 	ssize_t			ret, error = 0;
927 	loff_t			pos;
928 
929 	ret = xfs_ilock_iocb(iocb, iolock);
930 	if (ret)
931 		return ret;
932 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
933 	if (ret)
934 		goto out;
935 
936 	pos = iocb->ki_pos;
937 
938 	trace_xfs_file_dax_write(iocb, from);
939 	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
940 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
941 		i_size_write(inode, iocb->ki_pos);
942 		error = xfs_setfilesize(ip, pos, ret);
943 	}
944 out:
945 	if (iolock)
946 		xfs_iunlock(ip, iolock);
947 	if (error)
948 		return error;
949 
950 	if (ret > 0) {
951 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
952 
953 		/* Handle various SYNC-type writes */
954 		ret = generic_write_sync(iocb, ret);
955 	}
956 	return ret;
957 }
958 
959 STATIC ssize_t
960 xfs_file_buffered_write(
961 	struct kiocb		*iocb,
962 	struct iov_iter		*from)
963 {
964 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
965 	struct xfs_inode	*ip = XFS_I(inode);
966 	ssize_t			ret;
967 	bool			cleared_space = false;
968 	unsigned int		iolock;
969 
970 write_retry:
971 	iolock = XFS_IOLOCK_EXCL;
972 	ret = xfs_ilock_iocb(iocb, iolock);
973 	if (ret)
974 		return ret;
975 
976 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
977 	if (ret)
978 		goto out;
979 
980 	trace_xfs_file_buffered_write(iocb, from);
981 	ret = iomap_file_buffered_write(iocb, from,
982 			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
983 			NULL);
984 
985 	/*
986 	 * If we hit a space limit, try to free up some lingering preallocated
987 	 * space before returning an error. In the case of ENOSPC, first try to
988 	 * write back all dirty inodes to free up some of the excess reserved
989 	 * metadata space. This reduces the chances that the eofblocks scan
990 	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
991 	 * also behaves as a filter to prevent too many eofblocks scans from
992 	 * running at the same time.  Use a synchronous scan to increase the
993 	 * effectiveness of the scan.
994 	 */
995 	if (ret == -EDQUOT && !cleared_space) {
996 		xfs_iunlock(ip, iolock);
997 		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
998 		cleared_space = true;
999 		goto write_retry;
1000 	} else if (ret == -ENOSPC && !cleared_space) {
1001 		struct xfs_icwalk	icw = {0};
1002 
1003 		cleared_space = true;
1004 		xfs_flush_inodes(ip->i_mount);
1005 
1006 		xfs_iunlock(ip, iolock);
1007 		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
1008 		xfs_blockgc_free_space(ip->i_mount, &icw);
1009 		goto write_retry;
1010 	}
1011 
1012 out:
1013 	if (iolock)
1014 		xfs_iunlock(ip, iolock);
1015 
1016 	if (ret > 0) {
1017 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
1018 		/* Handle various SYNC-type writes */
1019 		ret = generic_write_sync(iocb, ret);
1020 	}
1021 	return ret;
1022 }
1023 
1024 STATIC ssize_t
1025 xfs_file_buffered_write_zoned(
1026 	struct kiocb		*iocb,
1027 	struct iov_iter		*from)
1028 {
1029 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
1030 	struct xfs_mount	*mp = ip->i_mount;
1031 	unsigned int		iolock = XFS_IOLOCK_EXCL;
1032 	bool			cleared_space = false;
1033 	struct xfs_zone_alloc_ctx ac = { };
1034 	ssize_t			ret;
1035 
1036 	ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
1037 	if (ret < 0)
1038 		return ret;
1039 
1040 	ret = xfs_ilock_iocb(iocb, iolock);
1041 	if (ret)
1042 		goto out_unreserve;
1043 
1044 	ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
1045 	if (ret)
1046 		goto out_unlock;
1047 
1048 	/*
1049 	 * Truncate the iter to the length that we were actually able to
1050 	 * allocate blocks for.  This needs to happen after
1051 	 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
1052 	 * writes.
1053 	 */
1054 	iov_iter_truncate(from,
1055 			XFS_FSB_TO_B(mp, ac.reserved_blocks) -
1056 			(iocb->ki_pos & mp->m_blockmask));
1057 	if (!iov_iter_count(from))
1058 		goto out_unlock;
1059 
1060 retry:
1061 	trace_xfs_file_buffered_write(iocb, from);
1062 	ret = iomap_file_buffered_write(iocb, from,
1063 			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
1064 			&ac);
1065 	if (ret == -ENOSPC && !cleared_space) {
1066 		/*
1067 		 * Kick off writeback to convert delalloc space and release the
1068 		 * usually too pessimistic indirect block reservations.
1069 		 */
1070 		xfs_flush_inodes(mp);
1071 		cleared_space = true;
1072 		goto retry;
1073 	}
1074 
1075 out_unlock:
1076 	xfs_iunlock(ip, iolock);
1077 out_unreserve:
1078 	xfs_zoned_space_unreserve(ip, &ac);
1079 	if (ret > 0) {
1080 		XFS_STATS_ADD(mp, xs_write_bytes, ret);
1081 		ret = generic_write_sync(iocb, ret);
1082 	}
1083 	return ret;
1084 }
1085 
1086 STATIC ssize_t
1087 xfs_file_write_iter(
1088 	struct kiocb		*iocb,
1089 	struct iov_iter		*from)
1090 {
1091 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
1092 	struct xfs_inode	*ip = XFS_I(inode);
1093 	ssize_t			ret;
1094 	size_t			ocount = iov_iter_count(from);
1095 
1096 	XFS_STATS_INC(ip->i_mount, xs_write_calls);
1097 
1098 	if (ocount == 0)
1099 		return 0;
1100 
1101 	if (xfs_is_shutdown(ip->i_mount))
1102 		return -EIO;
1103 
1104 	if (IS_DAX(inode))
1105 		return xfs_file_dax_write(iocb, from);
1106 
1107 	if (iocb->ki_flags & IOCB_ATOMIC) {
1108 		if (ocount < xfs_get_atomic_write_min(ip))
1109 			return -EINVAL;
1110 
1111 		if (ocount > xfs_get_atomic_write_max(ip))
1112 			return -EINVAL;
1113 
1114 		ret = generic_atomic_write_valid(iocb, from);
1115 		if (ret)
1116 			return ret;
1117 	}
1118 
1119 	if (iocb->ki_flags & IOCB_DIRECT) {
1120 		/*
1121 		 * Allow a directio write to fall back to a buffered
1122 		 * write *only* in the case that we're doing a reflink
1123 		 * CoW.  In all other directio scenarios we do not
1124 		 * allow an operation to fall back to buffered mode.
1125 		 */
1126 		ret = xfs_file_dio_write(iocb, from);
1127 		if (ret != -ENOTBLK)
1128 			return ret;
1129 	}
1130 
1131 	if (xfs_is_zoned_inode(ip))
1132 		return xfs_file_buffered_write_zoned(iocb, from);
1133 	return xfs_file_buffered_write(iocb, from);
1134 }
1135 
1136 /* Does this file, inode, or mount want synchronous writes? */
1137 static inline bool xfs_file_sync_writes(struct file *filp)
1138 {
1139 	struct xfs_inode	*ip = XFS_I(file_inode(filp));
1140 
1141 	if (xfs_has_wsync(ip->i_mount))
1142 		return true;
1143 	if (filp->f_flags & (__O_SYNC | O_DSYNC))
1144 		return true;
1145 	if (IS_SYNC(file_inode(filp)))
1146 		return true;
1147 
1148 	return false;
1149 }
1150 
1151 static int
1152 xfs_falloc_newsize(
1153 	struct file		*file,
1154 	int			mode,
1155 	loff_t			offset,
1156 	loff_t			len,
1157 	loff_t			*new_size)
1158 {
1159 	struct inode		*inode = file_inode(file);
1160 
1161 	if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
1162 		return 0;
1163 	*new_size = offset + len;
1164 	return inode_newsize_ok(inode, *new_size);
1165 }
1166 
1167 static int
1168 xfs_falloc_setsize(
1169 	struct file		*file,
1170 	loff_t			new_size)
1171 {
1172 	struct iattr iattr = {
1173 		.ia_valid	= ATTR_SIZE,
1174 		.ia_size	= new_size,
1175 	};
1176 
1177 	if (!new_size)
1178 		return 0;
1179 	return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
1180 			&iattr);
1181 }
1182 
1183 static int
1184 xfs_falloc_collapse_range(
1185 	struct file		*file,
1186 	loff_t			offset,
1187 	loff_t			len,
1188 	struct xfs_zone_alloc_ctx *ac)
1189 {
1190 	struct inode		*inode = file_inode(file);
1191 	loff_t			new_size = i_size_read(inode) - len;
1192 	int			error;
1193 
1194 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1195 		return -EINVAL;
1196 
1197 	/*
1198 	 * There is no point in allowing a collapse range to reach or cross
1199 	 * EOF, as such a request is effectively a truncate operation.
1200 	 */
1201 	if (offset + len >= i_size_read(inode))
1202 		return -EINVAL;
1203 
1204 	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
1205 	if (error)
1206 		return error;
1207 	return xfs_falloc_setsize(file, new_size);
1208 }
1209 
1210 static int
1211 xfs_falloc_insert_range(
1212 	struct file		*file,
1213 	loff_t			offset,
1214 	loff_t			len)
1215 {
1216 	struct inode		*inode = file_inode(file);
1217 	loff_t			isize = i_size_read(inode);
1218 	int			error;
1219 
1220 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1221 		return -EINVAL;
1222 
1223 	/*
1224 	 * New inode size must not exceed ->s_maxbytes, accounting for
1225 	 * possible signed overflow.
1226 	 */
1227 	if (inode->i_sb->s_maxbytes - isize < len)
1228 		return -EFBIG;
1229 
1230 	/* Offset should be less than i_size */
1231 	if (offset >= isize)
1232 		return -EINVAL;
1233 
1234 	error = xfs_falloc_setsize(file, isize + len);
1235 	if (error)
1236 		return error;
1237 
1238 	/*
1239 	 * Perform hole insertion now that the file size has been updated so
1240 	 * that if we crash during the operation we don't leave shifted extents
1241 	 * past EOF and hence lose access to the data that is contained within
1242 	 * them.
1243 	 */
1244 	return xfs_insert_file_space(XFS_I(inode), offset, len);
1245 }
1246 
1247 /*
1248  * Punch a hole and prealloc the range.  We use a hole punch rather than
1249  * unwritten extent conversion for two reasons:
1250  *
1251  *   1.) Hole punch handles partial block zeroing for us.
1252  *   2.) If prealloc returns ENOSPC, the file range is still zero-valued by
1253  *	 virtue of the hole punch.
1254  */
1255 static int
1256 xfs_falloc_zero_range(
1257 	struct file		*file,
1258 	int			mode,
1259 	loff_t			offset,
1260 	loff_t			len,
1261 	struct xfs_zone_alloc_ctx *ac)
1262 {
1263 	struct inode		*inode = file_inode(file);
1264 	unsigned int		blksize = i_blocksize(inode);
1265 	loff_t			new_size = 0;
1266 	int			error;
1267 
1268 	trace_xfs_zero_file_space(XFS_I(inode));
1269 
1270 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1271 	if (error)
1272 		return error;
1273 
1274 	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
1275 	if (error)
1276 		return error;
1277 
1278 	len = round_up(offset + len, blksize) - round_down(offset, blksize);
1279 	offset = round_down(offset, blksize);
1280 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1281 	if (error)
1282 		return error;
1283 	return xfs_falloc_setsize(file, new_size);
1284 }
1285 
1286 static int
1287 xfs_falloc_unshare_range(
1288 	struct file		*file,
1289 	int			mode,
1290 	loff_t			offset,
1291 	loff_t			len)
1292 {
1293 	struct inode		*inode = file_inode(file);
1294 	loff_t			new_size = 0;
1295 	int			error;
1296 
1297 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1298 	if (error)
1299 		return error;
1300 
1301 	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
1302 	if (error)
1303 		return error;
1304 
1305 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1306 	if (error)
1307 		return error;
1308 	return xfs_falloc_setsize(file, new_size);
1309 }
1310 
1311 static int
1312 xfs_falloc_allocate_range(
1313 	struct file		*file,
1314 	int			mode,
1315 	loff_t			offset,
1316 	loff_t			len)
1317 {
1318 	struct inode		*inode = file_inode(file);
1319 	loff_t			new_size = 0;
1320 	int			error;
1321 
1322 	/*
1323 	 * If always_cow mode we can't use preallocations and thus should not
1324 	 * create them.
1325 	 */
1326 	if (xfs_is_always_cow_inode(XFS_I(inode)))
1327 		return -EOPNOTSUPP;
1328 
1329 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1330 	if (error)
1331 		return error;
1332 
1333 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1334 	if (error)
1335 		return error;
1336 	return xfs_falloc_setsize(file, new_size);
1337 }
1338 
1339 #define	XFS_FALLOC_FL_SUPPORTED						\
1340 		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
1341 		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
1342 		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
1343 
1344 STATIC long
1345 __xfs_file_fallocate(
1346 	struct file		*file,
1347 	int			mode,
1348 	loff_t			offset,
1349 	loff_t			len,
1350 	struct xfs_zone_alloc_ctx *ac)
1351 {
1352 	struct inode		*inode = file_inode(file);
1353 	struct xfs_inode	*ip = XFS_I(inode);
1354 	long			error;
1355 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1356 
1357 	xfs_ilock(ip, iolock);
1358 	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
1359 	if (error)
1360 		goto out_unlock;
1361 
1362 	/*
1363 	 * Must wait for all AIO to complete before we continue as AIO can
1364 	 * change the file size on completion without holding any locks we
1365 	 * currently hold. We must do this first because AIO can update both
1366 	 * the on disk and in memory inode sizes, and the operations that follow
1367 	 * require the in-memory size to be fully up-to-date.
1368 	 */
1369 	inode_dio_wait(inode);
1370 
1371 	error = file_modified(file);
1372 	if (error)
1373 		goto out_unlock;
1374 
1375 	switch (mode & FALLOC_FL_MODE_MASK) {
1376 	case FALLOC_FL_PUNCH_HOLE:
1377 		error = xfs_free_file_space(ip, offset, len, ac);
1378 		break;
1379 	case FALLOC_FL_COLLAPSE_RANGE:
1380 		error = xfs_falloc_collapse_range(file, offset, len, ac);
1381 		break;
1382 	case FALLOC_FL_INSERT_RANGE:
1383 		error = xfs_falloc_insert_range(file, offset, len);
1384 		break;
1385 	case FALLOC_FL_ZERO_RANGE:
1386 		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
1387 		break;
1388 	case FALLOC_FL_UNSHARE_RANGE:
1389 		error = xfs_falloc_unshare_range(file, mode, offset, len);
1390 		break;
1391 	case FALLOC_FL_ALLOCATE_RANGE:
1392 		error = xfs_falloc_allocate_range(file, mode, offset, len);
1393 		break;
1394 	default:
1395 		error = -EOPNOTSUPP;
1396 		break;
1397 	}
1398 
1399 	if (!error && xfs_file_sync_writes(file))
1400 		error = xfs_log_force_inode(ip);
1401 
1402 out_unlock:
1403 	xfs_iunlock(ip, iolock);
1404 	return error;
1405 }
1406 
1407 static long
1408 xfs_file_zoned_fallocate(
1409 	struct file		*file,
1410 	int			mode,
1411 	loff_t			offset,
1412 	loff_t			len)
1413 {
1414 	struct xfs_zone_alloc_ctx ac = { };
1415 	struct xfs_inode	*ip = XFS_I(file_inode(file));
1416 	int			error;
1417 
1418 	error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
1419 	if (error)
1420 		return error;
1421 	error = __xfs_file_fallocate(file, mode, offset, len, &ac);
1422 	xfs_zoned_space_unreserve(ip, &ac);
1423 	return error;
1424 }
1425 
1426 static long
1427 xfs_file_fallocate(
1428 	struct file		*file,
1429 	int			mode,
1430 	loff_t			offset,
1431 	loff_t			len)
1432 {
1433 	struct inode		*inode = file_inode(file);
1434 
1435 	if (!S_ISREG(inode->i_mode))
1436 		return -EINVAL;
1437 	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
1438 		return -EOPNOTSUPP;
1439 
1440 	/*
1441 	 * For zoned file systems, zeroing the first and last block of a hole
1442 	 * punch requires allocating a new block to rewrite the remaining data
1443 	 * and new zeroes out of place.  Get reservations for those before
1444 	 * taking the iolock.  Dip into the reserved pool because we are
1445 	 * expected to be able to punch a hole even on a completely full
1446 	 * file system.
1447 	 */
1448 	if (xfs_is_zoned_inode(XFS_I(inode)) &&
1449 	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
1450 		     FALLOC_FL_COLLAPSE_RANGE)))
1451 		return xfs_file_zoned_fallocate(file, mode, offset, len);
1452 	return __xfs_file_fallocate(file, mode, offset, len, NULL);
1453 }
1454 
1455 STATIC int
1456 xfs_file_fadvise(
1457 	struct file	*file,
1458 	loff_t		start,
1459 	loff_t		end,
1460 	int		advice)
1461 {
1462 	struct xfs_inode *ip = XFS_I(file_inode(file));
1463 	int ret;
1464 	int lockflags = 0;
1465 
1466 	/*
1467 	 * Operations creating pages in page cache need protection from hole
1468 	 * punching and similar ops
1469 	 */
1470 	if (advice == POSIX_FADV_WILLNEED) {
1471 		lockflags = XFS_IOLOCK_SHARED;
1472 		xfs_ilock(ip, lockflags);
1473 	}
1474 	ret = generic_fadvise(file, start, end, advice);
1475 	if (lockflags)
1476 		xfs_iunlock(ip, lockflags);
1477 	return ret;
1478 }
1479 
1480 STATIC loff_t
1481 xfs_file_remap_range(
1482 	struct file		*file_in,
1483 	loff_t			pos_in,
1484 	struct file		*file_out,
1485 	loff_t			pos_out,
1486 	loff_t			len,
1487 	unsigned int		remap_flags)
1488 {
1489 	struct inode		*inode_in = file_inode(file_in);
1490 	struct xfs_inode	*src = XFS_I(inode_in);
1491 	struct inode		*inode_out = file_inode(file_out);
1492 	struct xfs_inode	*dest = XFS_I(inode_out);
1493 	struct xfs_mount	*mp = src->i_mount;
1494 	loff_t			remapped = 0;
1495 	xfs_extlen_t		cowextsize;
1496 	int			ret;
1497 
1498 	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1499 		return -EINVAL;
1500 
1501 	if (!xfs_has_reflink(mp))
1502 		return -EOPNOTSUPP;
1503 
1504 	if (xfs_is_shutdown(mp))
1505 		return -EIO;
1506 
1507 	/* Prepare and then clone file data. */
1508 	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1509 			&len, remap_flags);
1510 	if (ret || len == 0)
1511 		return ret;
1512 
1513 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1514 
1515 	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1516 			&remapped);
1517 	if (ret)
1518 		goto out_unlock;
1519 
1520 	/*
1521 	 * Carry the cowextsize hint from src to dest if we're sharing the
1522 	 * entire source file to the entire destination file, the source file
1523 	 * has a cowextsize hint, and the destination file does not.
1524 	 */
1525 	cowextsize = 0;
1526 	if (pos_in == 0 && len == i_size_read(inode_in) &&
1527 	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1528 	    pos_out == 0 && len >= i_size_read(inode_out) &&
1529 	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1530 		cowextsize = src->i_cowextsize;
1531 
1532 	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1533 			remap_flags);
1534 	if (ret)
1535 		goto out_unlock;
1536 
1537 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1538 		xfs_log_force_inode(dest);
1539 out_unlock:
1540 	xfs_iunlock2_remapping(src, dest);
1541 	if (ret)
1542 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1543 	/*
1544 	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
1545 	 * handle partial results -- either the whole remap succeeds, or we
1546 	 * must say why it did not.  In this case, any error should be returned
1547 	 * to the caller.
1548 	 */
1549 	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
1550 		return ret;
1551 	return remapped > 0 ? remapped : ret;
1552 }
1553 
1554 STATIC int
1555 xfs_file_open(
1556 	struct inode	*inode,
1557 	struct file	*file)
1558 {
1559 	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1560 		return -EIO;
1561 	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
1562 	if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
1563 		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
1564 	return generic_file_open(inode, file);
1565 }
1566 
1567 STATIC int
1568 xfs_dir_open(
1569 	struct inode	*inode,
1570 	struct file	*file)
1571 {
1572 	struct xfs_inode *ip = XFS_I(inode);
1573 	unsigned int	mode;
1574 	int		error;
1575 
1576 	if (xfs_is_shutdown(ip->i_mount))
1577 		return -EIO;
1578 	error = generic_file_open(inode, file);
1579 	if (error)
1580 		return error;
1581 
1582 	/*
1583 	 * If there are any blocks, read-ahead block 0 as we're almost
1584 	 * certain to have the next operation be a read there.
1585 	 */
1586 	mode = xfs_ilock_data_map_shared(ip);
1587 	if (ip->i_df.if_nextents > 0)
1588 		error = xfs_dir3_data_readahead(ip, 0, 0);
1589 	xfs_iunlock(ip, mode);
1590 	return error;
1591 }
1592 
1593 /*
1594  * Don't bother propagating errors.  We're just doing cleanup, and the caller
1595  * ignores the return value anyway.
1596  */
1597 STATIC int
1598 xfs_file_release(
1599 	struct inode		*inode,
1600 	struct file		*file)
1601 {
1602 	struct xfs_inode	*ip = XFS_I(inode);
1603 	struct xfs_mount	*mp = ip->i_mount;
1604 
1605 	/*
1606 	 * If this is a read-only mount or the file system has been shut down,
1607 	 * don't generate I/O.
1608 	 */
1609 	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
1610 		return 0;
1611 
1612 	/*
1613 	 * If we previously truncated this file and removed old data in the
1614 	 * process, we want to initiate "early" writeout on the last close.
1615 	 * This is an attempt to combat the notorious NULL files problem which
1616 	 * is particularly noticeable from a truncate down, buffered (re-)write
1617 	 * (delalloc), followed by a crash.  What we are effectively doing here
1618 	 * is significantly reducing the time window where we'd otherwise be
1619 	 * exposed to that problem.
1620 	 */
1621 	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
1622 		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
1623 		if (ip->i_delayed_blks > 0)
1624 			filemap_flush(inode->i_mapping);
1625 	}
1626 
1627 	/*
1628 	 * XFS aggressively preallocates post-EOF space to generate contiguous
1629 	 * allocations for writers that append to the end of the file.
1630 	 *
1631 	 * To support workloads that close and reopen the file frequently, these
1632 	 * preallocations usually persist after a close unless it is the first
1633 	 * close for the inode.  This is a tradeoff to generate tightly packed
1634 	 * data layouts for unpacking tarballs or similar archives that write
1635 	 * one file after another without going back to it while keeping the
1636 	 * preallocation for files that have recurring open/write/close cycles.
1637 	 *
1638 	 * This heuristic is skipped for inodes with the append-only flag as
1639 	 * that flag is rather pointless for inodes written only once.
1640 	 *
1641 	 * There is no point in freeing blocks here for open but unlinked files
1642 	 * as they will be taken care of by the inactivation path soon.
1643 	 *
1644 	 * When releasing a read-only context, don't flush data or trim post-EOF
1645 	 * blocks.  This prevents open/read/close workloads from removing EOF
1646 	 * blocks that other writers depend upon to reduce fragmentation.
1647 	 *
1648 	 * Inodes on the zoned RT device never have preallocations, so skip
1649 	 * taking the locks below.
1650 	 */
1651 	if (!inode->i_nlink ||
1652 	    !(file->f_mode & FMODE_WRITE) ||
1653 	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
1654 	    xfs_is_zoned_inode(ip))
1655 		return 0;
1656 
1657 	/*
1658 	 * If we can't get the iolock just skip truncating the blocks past EOF
1659 	 * because we could deadlock with the mmap_lock otherwise. We'll get
1660 	 * another chance to drop them once the last reference to the inode is
1661 	 * dropped, so we'll never leak blocks permanently.
1662 	 */
1663 	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
1664 	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1665 		if (xfs_can_free_eofblocks(ip) &&
1666 		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
1667 			xfs_free_eofblocks(ip);
1668 		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1669 	}
1670 
1671 	return 0;
1672 }
1673 
1674 STATIC int
1675 xfs_file_readdir(
1676 	struct file	*file,
1677 	struct dir_context *ctx)
1678 {
1679 	struct inode	*inode = file_inode(file);
1680 	xfs_inode_t	*ip = XFS_I(inode);
1681 	size_t		bufsize;
1682 
1683 	/*
1684 	 * The Linux API doesn't pass the total size of the buffer
1685 	 * we read into down to the filesystem.  With the filldir concept
1686 	 * it's not needed for correct information, but the XFS dir2 leaf
1687 	 * code wants an estimate of the buffer size to calculate its
1688 	 * readahead window and size the buffers used for mapping to
1689 	 * physical blocks.
1690 	 *
1691 	 * Try to give it an estimate that's good enough, maybe at some
1692 	 * point we can change the ->readdir prototype to include the
1693 	 * buffer size.  For now we use the current glibc buffer size.
1694 	 */
1695 	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1696 
1697 	return xfs_readdir(NULL, ip, ctx, bufsize);
1698 }
1699 
1700 STATIC loff_t
1701 xfs_file_llseek(
1702 	struct file	*file,
1703 	loff_t		offset,
1704 	int		whence)
1705 {
1706 	struct inode		*inode = file->f_mapping->host;
1707 
1708 	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1709 		return -EIO;
1710 
1711 	switch (whence) {
1712 	default:
1713 		return generic_file_llseek(file, offset, whence);
1714 	case SEEK_HOLE:
1715 		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1716 		break;
1717 	case SEEK_DATA:
1718 		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1719 		break;
1720 	}
1721 
1722 	if (offset < 0)
1723 		return offset;
1724 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1725 }
1726 
1727 static inline vm_fault_t
1728 xfs_dax_fault_locked(
1729 	struct vm_fault		*vmf,
1730 	unsigned int		order,
1731 	bool			write_fault)
1732 {
1733 	vm_fault_t		ret;
1734 	pfn_t			pfn;
1735 
1736 	if (!IS_ENABLED(CONFIG_FS_DAX)) {
1737 		ASSERT(0);
1738 		return VM_FAULT_SIGBUS;
1739 	}
1740 	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
1741 			(write_fault && !vmf->cow_page) ?
1742 				&xfs_dax_write_iomap_ops :
1743 				&xfs_read_iomap_ops);
1744 	if (ret & VM_FAULT_NEEDDSYNC)
1745 		ret = dax_finish_sync_fault(vmf, order, pfn);
1746 	return ret;
1747 }
1748 
1749 static vm_fault_t
1750 xfs_dax_read_fault(
1751 	struct vm_fault		*vmf,
1752 	unsigned int		order)
1753 {
1754 	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
1755 	vm_fault_t		ret;
1756 
1757 	trace_xfs_read_fault(ip, order);
1758 
1759 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1760 	ret = xfs_dax_fault_locked(vmf, order, false);
1761 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1762 
1763 	return ret;
1764 }
1765 
1766 /*
1767  * Locking for serialisation of IO during page faults. This results in a lock
1768  * ordering of:
1769  *
1770  * mmap_lock (MM)
1771  *   sb_start_pagefault(vfs, freeze)
1772  *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1773  *       page_lock (MM)
1774  *         i_lock (XFS - extent map serialisation)
1775  */
1776 static vm_fault_t
1777 __xfs_write_fault(
1778 	struct vm_fault		*vmf,
1779 	unsigned int		order,
1780 	struct xfs_zone_alloc_ctx *ac)
1781 {
1782 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1783 	struct xfs_inode	*ip = XFS_I(inode);
1784 	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
1785 	vm_fault_t		ret;
1786 
1787 	trace_xfs_write_fault(ip, order);
1788 
1789 	sb_start_pagefault(inode->i_sb);
1790 	file_update_time(vmf->vma->vm_file);
1791 
1792 	/*
1793 	 * Normally we only need the shared mmaplock, but if a reflink remap is
1794 	 * in progress we take the exclusive lock to wait for the remap to
1795 	 * finish before taking a write fault.
1796 	 */
1797 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1798 	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
1799 		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1800 		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1801 		lock_mode = XFS_MMAPLOCK_EXCL;
1802 	}
1803 
1804 	if (IS_DAX(inode))
1805 		ret = xfs_dax_fault_locked(vmf, order, true);
1806 	else
1807 		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
1808 				ac);
1809 	xfs_iunlock(ip, lock_mode);
1810 
1811 	sb_end_pagefault(inode->i_sb);
1812 	return ret;
1813 }
1814 
1815 static vm_fault_t
1816 xfs_write_fault_zoned(
1817 	struct vm_fault		*vmf,
1818 	unsigned int		order)
1819 {
1820 	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
1821 	unsigned int		len = folio_size(page_folio(vmf->page));
1822 	struct xfs_zone_alloc_ctx ac = { };
1823 	int			error;
1824 	vm_fault_t		ret;
1825 
1826 	/*
1827 	 * This could over-allocate as it doesn't check for truncation.
1828 	 *
1829 	 * But as the overallocation is limited to less than a folio and will be
1830 	 * released instantly, that's just fine.
1831 	 */
1832 	error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
1833 			&ac);
1834 	if (error < 0)
1835 		return vmf_fs_error(error);
1836 	ret = __xfs_write_fault(vmf, order, &ac);
1837 	xfs_zoned_space_unreserve(ip, &ac);
1838 	return ret;
1839 }
1840 
1841 static vm_fault_t
1842 xfs_write_fault(
1843 	struct vm_fault		*vmf,
1844 	unsigned int		order)
1845 {
1846 	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
1847 		return xfs_write_fault_zoned(vmf, order);
1848 	return __xfs_write_fault(vmf, order, NULL);
1849 }
1850 
1851 static inline bool
1852 xfs_is_write_fault(
1853 	struct vm_fault		*vmf)
1854 {
1855 	return (vmf->flags & FAULT_FLAG_WRITE) &&
1856 	       (vmf->vma->vm_flags & VM_SHARED);
1857 }
1858 
1859 static vm_fault_t
1860 xfs_filemap_fault(
1861 	struct vm_fault		*vmf)
1862 {
1863 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1864 
1865 	/* DAX can shortcut the normal fault path on write faults! */
1866 	if (IS_DAX(inode)) {
1867 		if (xfs_is_write_fault(vmf))
1868 			return xfs_write_fault(vmf, 0);
1869 		return xfs_dax_read_fault(vmf, 0);
1870 	}
1871 
1872 	trace_xfs_read_fault(XFS_I(inode), 0);
1873 	return filemap_fault(vmf);
1874 }
1875 
1876 static vm_fault_t
1877 xfs_filemap_huge_fault(
1878 	struct vm_fault		*vmf,
1879 	unsigned int		order)
1880 {
1881 	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1882 		return VM_FAULT_FALLBACK;
1883 
1884 	/* DAX can shortcut the normal fault path on write faults! */
1885 	if (xfs_is_write_fault(vmf))
1886 		return xfs_write_fault(vmf, order);
1887 	return xfs_dax_read_fault(vmf, order);
1888 }
1889 
1890 static vm_fault_t
1891 xfs_filemap_page_mkwrite(
1892 	struct vm_fault		*vmf)
1893 {
1894 	return xfs_write_fault(vmf, 0);
1895 }
1896 
1897 /*
1898  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1899  * on write faults. In reality, it needs to serialise against truncate and
1900  * prepare memory for writing, so handle it as a standard write fault.
1901  */
1902 static vm_fault_t
1903 xfs_filemap_pfn_mkwrite(
1904 	struct vm_fault		*vmf)
1905 {
1906 	return xfs_write_fault(vmf, 0);
1907 }
1908 
1909 static const struct vm_operations_struct xfs_file_vm_ops = {
1910 	.fault		= xfs_filemap_fault,
1911 	.huge_fault	= xfs_filemap_huge_fault,
1912 	.map_pages	= filemap_map_pages,
1913 	.page_mkwrite	= xfs_filemap_page_mkwrite,
1914 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
1915 };
1916 
1917 STATIC int
1918 xfs_file_mmap(
1919 	struct file		*file,
1920 	struct vm_area_struct	*vma)
1921 {
1922 	struct inode		*inode = file_inode(file);
1923 	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));
1924 
1925 	/*
1926 	 * We don't support synchronous mappings for non-DAX files and
1927 	 * for DAX files if the underlying dax_device is not synchronous.
1928 	 */
1929 	if (!daxdev_mapping_supported(vma, target->bt_daxdev))
1930 		return -EOPNOTSUPP;
1931 
1932 	file_accessed(file);
1933 	vma->vm_ops = &xfs_file_vm_ops;
1934 	if (IS_DAX(inode))
1935 		vm_flags_set(vma, VM_HUGEPAGE);
1936 	return 0;
1937 }
1938 
1939 const struct file_operations xfs_file_operations = {
1940 	.llseek		= xfs_file_llseek,
1941 	.read_iter	= xfs_file_read_iter,
1942 	.write_iter	= xfs_file_write_iter,
1943 	.splice_read	= xfs_file_splice_read,
1944 	.splice_write	= iter_file_splice_write,
1945 	.iopoll		= iocb_bio_iopoll,
1946 	.unlocked_ioctl	= xfs_file_ioctl,
1947 #ifdef CONFIG_COMPAT
1948 	.compat_ioctl	= xfs_file_compat_ioctl,
1949 #endif
1950 	.mmap		= xfs_file_mmap,
1951 	.open		= xfs_file_open,
1952 	.release	= xfs_file_release,
1953 	.fsync		= xfs_file_fsync,
1954 	.get_unmapped_area = thp_get_unmapped_area,
1955 	.fallocate	= xfs_file_fallocate,
1956 	.fadvise	= xfs_file_fadvise,
1957 	.remap_file_range = xfs_file_remap_range,
1958 	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
1959 			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
1960 			  FOP_DONTCACHE,
1961 };
1962 
1963 const struct file_operations xfs_dir_file_operations = {
1964 	.open		= xfs_dir_open,
1965 	.read		= generic_read_dir,
1966 	.iterate_shared	= xfs_file_readdir,
1967 	.llseek		= generic_file_llseek,
1968 	.unlocked_ioctl	= xfs_file_ioctl,
1969 #ifdef CONFIG_COMPAT
1970 	.compat_ioctl	= xfs_file_compat_ioctl,
1971 #endif
1972 	.fsync		= xfs_dir_fsync,
1973 };
1974