xref: /linux/fs/xfs/xfs_file.c (revision af2d6148d2a159e1a0862bce5a2c88c1618a2b27)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_bmap.h"
17 #include "xfs_bmap_util.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ioctl.h"
21 #include "xfs_trace.h"
22 #include "xfs_log.h"
23 #include "xfs_icache.h"
24 #include "xfs_pnfs.h"
25 #include "xfs_iomap.h"
26 #include "xfs_reflink.h"
27 #include "xfs_file.h"
28 #include "xfs_aops.h"
29 #include "xfs_zone_alloc.h"
30 
31 #include <linux/dax.h>
32 #include <linux/falloc.h>
33 #include <linux/backing-dev.h>
34 #include <linux/mman.h>
35 #include <linux/fadvise.h>
36 #include <linux/mount.h>
37 
38 static const struct vm_operations_struct xfs_file_vm_ops;
39 
40 /*
41  * Decide if the given file range is aligned to the size of the fundamental
42  * allocation unit for the file.
43  */
44 bool
45 xfs_is_falloc_aligned(
46 	struct xfs_inode	*ip,
47 	loff_t			pos,
48 	long long int		len)
49 {
50 	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);
51 
52 	if (!is_power_of_2(alloc_unit))
53 		return isaligned_64(pos, alloc_unit) &&
54 		       isaligned_64(len, alloc_unit);
55 
56 	return !((pos | len) & (alloc_unit - 1));
57 }
58 
59 /*
60  * Fsync operations on directories are much simpler than on regular files,
61  * as there is no file data to flush, and thus also no need for explicit
62  * cache flush operations, and there are no non-transaction metadata updates
63  * on directories either.
64  */
65 STATIC int
66 xfs_dir_fsync(
67 	struct file		*file,
68 	loff_t			start,
69 	loff_t			end,
70 	int			datasync)
71 {
72 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
73 
74 	trace_xfs_dir_fsync(ip);
75 	return xfs_log_force_inode(ip);
76 }
77 
78 static xfs_csn_t
79 xfs_fsync_seq(
80 	struct xfs_inode	*ip,
81 	bool			datasync)
82 {
83 	if (!xfs_ipincount(ip))
84 		return 0;
85 	if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
86 		return 0;
87 	return ip->i_itemp->ili_commit_seq;
88 }
89 
90 /*
91  * All metadata updates are logged, which means that we just have to flush the
92  * log up to the latest LSN that touched the inode.
93  *
94  * If we have concurrent fsync/fdatasync() calls, we need them to all block on
95  * the log force before we clear the ili_fsync_fields field. This ensures that
96  * we don't get a racing sync operation that does not wait for the metadata to
97  * hit the journal before returning.  If we race with clearing ili_fsync_fields,
98  * then all that will happen is the log force will do nothing as the lsn will
99  * already be on disk.  We can't race with setting ili_fsync_fields because that
100  * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
101  * shared until after the ili_fsync_fields is cleared.
102  */
103 static  int
104 xfs_fsync_flush_log(
105 	struct xfs_inode	*ip,
106 	bool			datasync,
107 	int			*log_flushed)
108 {
109 	int			error = 0;
110 	xfs_csn_t		seq;
111 
112 	xfs_ilock(ip, XFS_ILOCK_SHARED);
113 	seq = xfs_fsync_seq(ip, datasync);
114 	if (seq) {
115 		error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
116 					  log_flushed);
117 
118 		spin_lock(&ip->i_itemp->ili_lock);
119 		ip->i_itemp->ili_fsync_fields = 0;
120 		spin_unlock(&ip->i_itemp->ili_lock);
121 	}
122 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
123 	return error;
124 }
125 
126 STATIC int
127 xfs_file_fsync(
128 	struct file		*file,
129 	loff_t			start,
130 	loff_t			end,
131 	int			datasync)
132 {
133 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
134 	struct xfs_mount	*mp = ip->i_mount;
135 	int			error, err2;
136 	int			log_flushed = 0;
137 
138 	trace_xfs_file_fsync(ip);
139 
140 	error = file_write_and_wait_range(file, start, end);
141 	if (error)
142 		return error;
143 
144 	if (xfs_is_shutdown(mp))
145 		return -EIO;
146 
147 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
148 
149 	/*
150 	 * If we have an RT and/or log subvolume we need to make sure to flush
151 	 * the write cache the device used for file data first.  This is to
152 	 * ensure newly written file data make it to disk before logging the new
153 	 * inode size in case of an extending write.
154 	 */
155 	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
156 		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
157 	else if (mp->m_logdev_targp != mp->m_ddev_targp)
158 		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
159 
160 	/*
161 	 * Any inode that has dirty modifications in the log is pinned.  The
162 	 * racy check here for a pinned inode will not catch modifications
163 	 * that happen concurrently to the fsync call, but fsync semantics
164 	 * only require to sync previously completed I/O.
165 	 */
166 	if (xfs_ipincount(ip)) {
167 		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
168 		if (err2 && !error)
169 			error = err2;
170 	}
171 
172 	/*
173 	 * If we only have a single device, and the log force about was
174 	 * a no-op we might have to flush the data device cache here.
175 	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
176 	 * an already allocated file and thus do not have any metadata to
177 	 * commit.
178 	 */
179 	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
180 	    mp->m_logdev_targp == mp->m_ddev_targp) {
181 		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
182 		if (err2 && !error)
183 			error = err2;
184 	}
185 
186 	return error;
187 }
188 
189 static int
190 xfs_ilock_iocb(
191 	struct kiocb		*iocb,
192 	unsigned int		lock_mode)
193 {
194 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
195 
196 	if (iocb->ki_flags & IOCB_NOWAIT) {
197 		if (!xfs_ilock_nowait(ip, lock_mode))
198 			return -EAGAIN;
199 	} else {
200 		xfs_ilock(ip, lock_mode);
201 	}
202 
203 	return 0;
204 }
205 
206 static int
207 xfs_ilock_iocb_for_write(
208 	struct kiocb		*iocb,
209 	unsigned int		*lock_mode)
210 {
211 	ssize_t			ret;
212 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
213 
214 	ret = xfs_ilock_iocb(iocb, *lock_mode);
215 	if (ret)
216 		return ret;
217 
218 	/*
219 	 * If a reflink remap is in progress we always need to take the iolock
220 	 * exclusively to wait for it to finish.
221 	 */
222 	if (*lock_mode == XFS_IOLOCK_SHARED &&
223 	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
224 		xfs_iunlock(ip, *lock_mode);
225 		*lock_mode = XFS_IOLOCK_EXCL;
226 		return xfs_ilock_iocb(iocb, *lock_mode);
227 	}
228 
229 	return 0;
230 }
231 
232 STATIC ssize_t
233 xfs_file_dio_read(
234 	struct kiocb		*iocb,
235 	struct iov_iter		*to)
236 {
237 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
238 	ssize_t			ret;
239 
240 	trace_xfs_file_direct_read(iocb, to);
241 
242 	if (!iov_iter_count(to))
243 		return 0; /* skip atime */
244 
245 	file_accessed(iocb->ki_filp);
246 
247 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
248 	if (ret)
249 		return ret;
250 	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
251 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
252 
253 	return ret;
254 }
255 
256 static noinline ssize_t
257 xfs_file_dax_read(
258 	struct kiocb		*iocb,
259 	struct iov_iter		*to)
260 {
261 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
262 	ssize_t			ret = 0;
263 
264 	trace_xfs_file_dax_read(iocb, to);
265 
266 	if (!iov_iter_count(to))
267 		return 0; /* skip atime */
268 
269 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
270 	if (ret)
271 		return ret;
272 	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
273 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
274 
275 	file_accessed(iocb->ki_filp);
276 	return ret;
277 }
278 
279 STATIC ssize_t
280 xfs_file_buffered_read(
281 	struct kiocb		*iocb,
282 	struct iov_iter		*to)
283 {
284 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
285 	ssize_t			ret;
286 
287 	trace_xfs_file_buffered_read(iocb, to);
288 
289 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
290 	if (ret)
291 		return ret;
292 	ret = generic_file_read_iter(iocb, to);
293 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
294 
295 	return ret;
296 }
297 
298 STATIC ssize_t
299 xfs_file_read_iter(
300 	struct kiocb		*iocb,
301 	struct iov_iter		*to)
302 {
303 	struct inode		*inode = file_inode(iocb->ki_filp);
304 	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
305 	ssize_t			ret = 0;
306 
307 	XFS_STATS_INC(mp, xs_read_calls);
308 
309 	if (xfs_is_shutdown(mp))
310 		return -EIO;
311 
312 	if (IS_DAX(inode))
313 		ret = xfs_file_dax_read(iocb, to);
314 	else if (iocb->ki_flags & IOCB_DIRECT)
315 		ret = xfs_file_dio_read(iocb, to);
316 	else
317 		ret = xfs_file_buffered_read(iocb, to);
318 
319 	if (ret > 0)
320 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
321 	return ret;
322 }
323 
324 STATIC ssize_t
325 xfs_file_splice_read(
326 	struct file		*in,
327 	loff_t			*ppos,
328 	struct pipe_inode_info	*pipe,
329 	size_t			len,
330 	unsigned int		flags)
331 {
332 	struct inode		*inode = file_inode(in);
333 	struct xfs_inode	*ip = XFS_I(inode);
334 	struct xfs_mount	*mp = ip->i_mount;
335 	ssize_t			ret = 0;
336 
337 	XFS_STATS_INC(mp, xs_read_calls);
338 
339 	if (xfs_is_shutdown(mp))
340 		return -EIO;
341 
342 	trace_xfs_file_splice_read(ip, *ppos, len);
343 
344 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
345 	ret = filemap_splice_read(in, ppos, pipe, len, flags);
346 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
347 	if (ret > 0)
348 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
349 	return ret;
350 }
351 
352 /*
353  * Take care of zeroing post-EOF blocks when they might exist.
354  *
355  * Returns 0 if successfully, a negative error for a failure, or 1 if this
356  * function dropped the iolock and reacquired it exclusively and the caller
357  * needs to restart the write sanity checks.
358  */
359 static ssize_t
360 xfs_file_write_zero_eof(
361 	struct kiocb		*iocb,
362 	struct iov_iter		*from,
363 	unsigned int		*iolock,
364 	size_t			count,
365 	bool			*drained_dio,
366 	struct xfs_zone_alloc_ctx *ac)
367 {
368 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
369 	loff_t			isize;
370 	int			error;
371 
372 	/*
373 	 * We need to serialise against EOF updates that occur in IO completions
374 	 * here. We want to make sure that nobody is changing the size while
375 	 * we do this check until we have placed an IO barrier (i.e. hold
376 	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
377 	 * spinlock effectively forms a memory barrier once we have
378 	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
379 	 * hence be able to correctly determine if we need to run zeroing.
380 	 */
381 	spin_lock(&ip->i_flags_lock);
382 	isize = i_size_read(VFS_I(ip));
383 	if (iocb->ki_pos <= isize) {
384 		spin_unlock(&ip->i_flags_lock);
385 		return 0;
386 	}
387 	spin_unlock(&ip->i_flags_lock);
388 
389 	if (iocb->ki_flags & IOCB_NOWAIT)
390 		return -EAGAIN;
391 
392 	if (!*drained_dio) {
393 		/*
394 		 * If zeroing is needed and we are currently holding the iolock
395 		 * shared, we need to update it to exclusive which implies
396 		 * having to redo all checks before.
397 		 */
398 		if (*iolock == XFS_IOLOCK_SHARED) {
399 			xfs_iunlock(ip, *iolock);
400 			*iolock = XFS_IOLOCK_EXCL;
401 			xfs_ilock(ip, *iolock);
402 			iov_iter_reexpand(from, count);
403 		}
404 
405 		/*
406 		 * We now have an IO submission barrier in place, but AIO can do
407 		 * EOF updates during IO completion and hence we now need to
408 		 * wait for all of them to drain.  Non-AIO DIO will have drained
409 		 * before we are given the XFS_IOLOCK_EXCL, and so for most
410 		 * cases this wait is a no-op.
411 		 */
412 		inode_dio_wait(VFS_I(ip));
413 		*drained_dio = true;
414 		return 1;
415 	}
416 
417 	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
418 
419 	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
420 	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
421 	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
422 
423 	return error;
424 }
425 
426 /*
427  * Common pre-write limit and setup checks.
428  *
429  * Called with the iolock held either shared and exclusive according to
430  * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
431  * if called for a direct write beyond i_size.
432  */
433 STATIC ssize_t
434 xfs_file_write_checks(
435 	struct kiocb		*iocb,
436 	struct iov_iter		*from,
437 	unsigned int		*iolock,
438 	struct xfs_zone_alloc_ctx *ac)
439 {
440 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
441 	size_t			count = iov_iter_count(from);
442 	bool			drained_dio = false;
443 	ssize_t			error;
444 
445 restart:
446 	error = generic_write_checks(iocb, from);
447 	if (error <= 0)
448 		return error;
449 
450 	if (iocb->ki_flags & IOCB_NOWAIT) {
451 		error = break_layout(inode, false);
452 		if (error == -EWOULDBLOCK)
453 			error = -EAGAIN;
454 	} else {
455 		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
456 	}
457 
458 	if (error)
459 		return error;
460 
461 	/*
462 	 * For changing security info in file_remove_privs() we need i_rwsem
463 	 * exclusively.
464 	 */
465 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
466 		xfs_iunlock(XFS_I(inode), *iolock);
467 		*iolock = XFS_IOLOCK_EXCL;
468 		error = xfs_ilock_iocb(iocb, *iolock);
469 		if (error) {
470 			*iolock = 0;
471 			return error;
472 		}
473 		goto restart;
474 	}
475 
476 	/*
477 	 * If the offset is beyond the size of the file, we need to zero all
478 	 * blocks that fall between the existing EOF and the start of this
479 	 * write.
480 	 *
481 	 * We can do an unlocked check for i_size here safely as I/O completion
482 	 * can only extend EOF.  Truncate is locked out at this point, so the
483 	 * EOF can not move backwards, only forwards. Hence we only need to take
484 	 * the slow path when we are at or beyond the current EOF.
485 	 */
486 	if (iocb->ki_pos > i_size_read(inode)) {
487 		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
488 				&drained_dio, ac);
489 		if (error == 1)
490 			goto restart;
491 		if (error)
492 			return error;
493 	}
494 
495 	return kiocb_modified(iocb);
496 }
497 
498 static ssize_t
499 xfs_zoned_write_space_reserve(
500 	struct xfs_inode		*ip,
501 	struct kiocb			*iocb,
502 	struct iov_iter			*from,
503 	unsigned int			flags,
504 	struct xfs_zone_alloc_ctx	*ac)
505 {
506 	loff_t				count = iov_iter_count(from);
507 	int				error;
508 
509 	if (iocb->ki_flags & IOCB_NOWAIT)
510 		flags |= XFS_ZR_NOWAIT;
511 
512 	/*
513 	 * Check the rlimit and LFS boundary first so that we don't over-reserve
514 	 * by possibly a lot.
515 	 *
516 	 * The generic write path will redo this check later, and it might have
517 	 * changed by then.  If it got expanded we'll stick to our earlier
518 	 * smaller limit, and if it is decreased the new smaller limit will be
519 	 * used and our extra space reservation will be returned after finishing
520 	 * the write.
521 	 */
522 	error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
523 	if (error)
524 		return error;
525 
526 	/*
527 	 * Sloppily round up count to file system blocks.
528 	 *
529 	 * This will often reserve an extra block, but that avoids having to look
530 	 * at the start offset, which isn't stable for O_APPEND until taking the
531 	 * iolock.  Also we need to reserve a block each for zeroing the old
532 	 * EOF block and the new start block if they are unaligned.
533 	 *
534 	 * Any remaining block will be returned after the write.
535 	 */
536 	return xfs_zoned_space_reserve(ip,
537 			XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
538 }
539 
540 static int
541 xfs_dio_write_end_io(
542 	struct kiocb		*iocb,
543 	ssize_t			size,
544 	int			error,
545 	unsigned		flags)
546 {
547 	struct inode		*inode = file_inode(iocb->ki_filp);
548 	struct xfs_inode	*ip = XFS_I(inode);
549 	loff_t			offset = iocb->ki_pos;
550 	unsigned int		nofs_flag;
551 
552 	ASSERT(!xfs_is_zoned_inode(ip) ||
553 	       !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
554 
555 	trace_xfs_end_io_direct_write(ip, offset, size);
556 
557 	if (xfs_is_shutdown(ip->i_mount))
558 		return -EIO;
559 
560 	if (error)
561 		return error;
562 	if (!size)
563 		return 0;
564 
565 	/*
566 	 * Capture amount written on completion as we can't reliably account
567 	 * for it on submission.
568 	 */
569 	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
570 
571 	/*
572 	 * We can allocate memory here while doing writeback on behalf of
573 	 * memory reclaim.  To avoid memory allocation deadlocks set the
574 	 * task-wide nofs context for the following operations.
575 	 */
576 	nofs_flag = memalloc_nofs_save();
577 
578 	if (flags & IOMAP_DIO_COW) {
579 		if (iocb->ki_flags & IOCB_ATOMIC)
580 			error = xfs_reflink_end_atomic_cow(ip, offset, size);
581 		else
582 			error = xfs_reflink_end_cow(ip, offset, size);
583 		if (error)
584 			goto out;
585 	}
586 
587 	/*
588 	 * Unwritten conversion updates the in-core isize after extent
589 	 * conversion but before updating the on-disk size. Updating isize any
590 	 * earlier allows a racing dio read to find unwritten extents before
591 	 * they are converted.
592 	 */
593 	if (flags & IOMAP_DIO_UNWRITTEN) {
594 		error = xfs_iomap_write_unwritten(ip, offset, size, true);
595 		goto out;
596 	}
597 
598 	/*
599 	 * We need to update the in-core inode size here so that we don't end up
600 	 * with the on-disk inode size being outside the in-core inode size. We
601 	 * have no other method of updating EOF for AIO, so always do it here
602 	 * if necessary.
603 	 *
604 	 * We need to lock the test/set EOF update as we can be racing with
605 	 * other IO completions here to update the EOF. Failing to serialise
606 	 * here can result in EOF moving backwards and Bad Things Happen when
607 	 * that occurs.
608 	 *
609 	 * As IO completion only ever extends EOF, we can do an unlocked check
610 	 * here to avoid taking the spinlock. If we land within the current EOF,
611 	 * then we do not need to do an extending update at all, and we don't
612 	 * need to take the lock to check this. If we race with an update moving
613 	 * EOF, then we'll either still be beyond EOF and need to take the lock,
614 	 * or we'll be within EOF and we don't need to take it at all.
615 	 */
616 	if (offset + size <= i_size_read(inode))
617 		goto out;
618 
619 	spin_lock(&ip->i_flags_lock);
620 	if (offset + size > i_size_read(inode)) {
621 		i_size_write(inode, offset + size);
622 		spin_unlock(&ip->i_flags_lock);
623 		error = xfs_setfilesize(ip, offset, size);
624 	} else {
625 		spin_unlock(&ip->i_flags_lock);
626 	}
627 
628 out:
629 	memalloc_nofs_restore(nofs_flag);
630 	return error;
631 }
632 
633 static const struct iomap_dio_ops xfs_dio_write_ops = {
634 	.end_io		= xfs_dio_write_end_io,
635 };
636 
637 static void
638 xfs_dio_zoned_submit_io(
639 	const struct iomap_iter	*iter,
640 	struct bio		*bio,
641 	loff_t			file_offset)
642 {
643 	struct xfs_mount	*mp = XFS_I(iter->inode)->i_mount;
644 	struct xfs_zone_alloc_ctx *ac = iter->private;
645 	xfs_filblks_t		count_fsb;
646 	struct iomap_ioend	*ioend;
647 
648 	count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
649 	if (count_fsb > ac->reserved_blocks) {
650 		xfs_err(mp,
651 "allocation (%lld) larger than reservation (%lld).",
652 			count_fsb, ac->reserved_blocks);
653 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
654 		bio_io_error(bio);
655 		return;
656 	}
657 	ac->reserved_blocks -= count_fsb;
658 
659 	bio->bi_end_io = xfs_end_bio;
660 	ioend = iomap_init_ioend(iter->inode, bio, file_offset,
661 			IOMAP_IOEND_DIRECT);
662 	xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
663 }
664 
665 static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
666 	.bio_set	= &iomap_ioend_bioset,
667 	.submit_io	= xfs_dio_zoned_submit_io,
668 	.end_io		= xfs_dio_write_end_io,
669 };
670 
671 /*
672  * Handle block aligned direct I/O writes.
673  */
674 static noinline ssize_t
675 xfs_file_dio_write_aligned(
676 	struct xfs_inode	*ip,
677 	struct kiocb		*iocb,
678 	struct iov_iter		*from,
679 	const struct iomap_ops	*ops,
680 	const struct iomap_dio_ops *dops,
681 	struct xfs_zone_alloc_ctx *ac)
682 {
683 	unsigned int		iolock = XFS_IOLOCK_SHARED;
684 	ssize_t			ret;
685 
686 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
687 	if (ret)
688 		return ret;
689 	ret = xfs_file_write_checks(iocb, from, &iolock, ac);
690 	if (ret)
691 		goto out_unlock;
692 
693 	/*
694 	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
695 	 * the iolock back to shared if we had to take the exclusive lock in
696 	 * xfs_file_write_checks() for other reasons.
697 	 */
698 	if (iolock == XFS_IOLOCK_EXCL) {
699 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
700 		iolock = XFS_IOLOCK_SHARED;
701 	}
702 	trace_xfs_file_direct_write(iocb, from);
703 	ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
704 out_unlock:
705 	xfs_iunlock(ip, iolock);
706 	return ret;
707 }
708 
709 /*
710  * Handle block aligned direct I/O writes to zoned devices.
711  */
712 static noinline ssize_t
713 xfs_file_dio_write_zoned(
714 	struct xfs_inode	*ip,
715 	struct kiocb		*iocb,
716 	struct iov_iter		*from)
717 {
718 	struct xfs_zone_alloc_ctx ac = { };
719 	ssize_t			ret;
720 
721 	ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
722 	if (ret < 0)
723 		return ret;
724 	ret = xfs_file_dio_write_aligned(ip, iocb, from,
725 			&xfs_zoned_direct_write_iomap_ops,
726 			&xfs_dio_zoned_write_ops, &ac);
727 	xfs_zoned_space_unreserve(ip, &ac);
728 	return ret;
729 }
730 
731 /*
732  * Handle block atomic writes
733  *
734  * Two methods of atomic writes are supported:
735  * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
736  *   disk
737  * - COW-based, which uses a COW fork as a staging extent for data updates
738  *   before atomically updating extent mappings for the range being written
739  *
740  */
741 static noinline ssize_t
742 xfs_file_dio_write_atomic(
743 	struct xfs_inode	*ip,
744 	struct kiocb		*iocb,
745 	struct iov_iter		*from)
746 {
747 	unsigned int		iolock = XFS_IOLOCK_SHARED;
748 	ssize_t			ret, ocount = iov_iter_count(from);
749 	const struct iomap_ops	*dops;
750 
751 	/*
752 	 * HW offload should be faster, so try that first if it is already
753 	 * known that the write length is not too large.
754 	 */
755 	if (ocount > xfs_inode_buftarg(ip)->bt_bdev_awu_max)
756 		dops = &xfs_atomic_write_cow_iomap_ops;
757 	else
758 		dops = &xfs_direct_write_iomap_ops;
759 
760 retry:
761 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
762 	if (ret)
763 		return ret;
764 
765 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
766 	if (ret)
767 		goto out_unlock;
768 
769 	/* Demote similar to xfs_file_dio_write_aligned() */
770 	if (iolock == XFS_IOLOCK_EXCL) {
771 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
772 		iolock = XFS_IOLOCK_SHARED;
773 	}
774 
775 	trace_xfs_file_direct_write(iocb, from);
776 	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
777 			0, NULL, 0);
778 
779 	/*
780 	 * The retry mechanism is based on the ->iomap_begin method returning
781 	 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
782 	 * possible. The REQ_ATOMIC-based method typically not be possible if
783 	 * the write spans multiple extents or the disk blocks are misaligned.
784 	 */
785 	if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
786 		xfs_iunlock(ip, iolock);
787 		dops = &xfs_atomic_write_cow_iomap_ops;
788 		goto retry;
789 	}
790 
791 out_unlock:
792 	if (iolock)
793 		xfs_iunlock(ip, iolock);
794 	return ret;
795 }
796 
797 /*
798  * Handle block unaligned direct I/O writes
799  *
800  * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
801  * them to be done in parallel with reads and other direct I/O writes.  However,
802  * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
803  * to do sub-block zeroing and that requires serialisation against other direct
804  * I/O to the same block.  In this case we need to serialise the submission of
805  * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
806  * In the case where sub-block zeroing is not required, we can do concurrent
807  * sub-block dios to the same block successfully.
808  *
809  * Optimistically submit the I/O using the shared lock first, but use the
810  * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
811  * if block allocation or partial block zeroing would be required.  In that case
812  * we try again with the exclusive lock.
813  */
814 static noinline ssize_t
815 xfs_file_dio_write_unaligned(
816 	struct xfs_inode	*ip,
817 	struct kiocb		*iocb,
818 	struct iov_iter		*from)
819 {
820 	size_t			isize = i_size_read(VFS_I(ip));
821 	size_t			count = iov_iter_count(from);
822 	unsigned int		iolock = XFS_IOLOCK_SHARED;
823 	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
824 	ssize_t			ret;
825 
826 	/*
827 	 * Extending writes need exclusivity because of the sub-block zeroing
828 	 * that the DIO code always does for partial tail blocks beyond EOF, so
829 	 * don't even bother trying the fast path in this case.
830 	 */
831 	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
832 		if (iocb->ki_flags & IOCB_NOWAIT)
833 			return -EAGAIN;
834 retry_exclusive:
835 		iolock = XFS_IOLOCK_EXCL;
836 		flags = IOMAP_DIO_FORCE_WAIT;
837 	}
838 
839 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
840 	if (ret)
841 		return ret;
842 
843 	/*
844 	 * We can't properly handle unaligned direct I/O to reflink files yet,
845 	 * as we can't unshare a partial block.
846 	 */
847 	if (xfs_is_cow_inode(ip)) {
848 		trace_xfs_reflink_bounce_dio_write(iocb, from);
849 		ret = -ENOTBLK;
850 		goto out_unlock;
851 	}
852 
853 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
854 	if (ret)
855 		goto out_unlock;
856 
857 	/*
858 	 * If we are doing exclusive unaligned I/O, this must be the only I/O
859 	 * in-flight.  Otherwise we risk data corruption due to unwritten extent
860 	 * conversions from the AIO end_io handler.  Wait for all other I/O to
861 	 * drain first.
862 	 */
863 	if (flags & IOMAP_DIO_FORCE_WAIT)
864 		inode_dio_wait(VFS_I(ip));
865 
866 	trace_xfs_file_direct_write(iocb, from);
867 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
868 			   &xfs_dio_write_ops, flags, NULL, 0);
869 
870 	/*
871 	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
872 	 * layer rejected it for mapping or locking reasons. If we are doing
873 	 * nonblocking user I/O, propagate the error.
874 	 */
875 	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
876 		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
877 		xfs_iunlock(ip, iolock);
878 		goto retry_exclusive;
879 	}
880 
881 out_unlock:
882 	if (iolock)
883 		xfs_iunlock(ip, iolock);
884 	return ret;
885 }
886 
887 static ssize_t
888 xfs_file_dio_write(
889 	struct kiocb		*iocb,
890 	struct iov_iter		*from)
891 {
892 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
893 	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
894 	size_t			count = iov_iter_count(from);
895 
896 	/* direct I/O must be aligned to device logical sector size */
897 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
898 		return -EINVAL;
899 
900 	/*
901 	 * For always COW inodes we also must check the alignment of each
902 	 * individual iovec segment, as they could end up with different
903 	 * I/Os due to the way bio_iov_iter_get_pages works, and we'd
904 	 * then overwrite an already written block.
905 	 */
906 	if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
907 	    (xfs_is_always_cow_inode(ip) &&
908 	     (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
909 		return xfs_file_dio_write_unaligned(ip, iocb, from);
910 	if (xfs_is_zoned_inode(ip))
911 		return xfs_file_dio_write_zoned(ip, iocb, from);
912 	if (iocb->ki_flags & IOCB_ATOMIC)
913 		return xfs_file_dio_write_atomic(ip, iocb, from);
914 	return xfs_file_dio_write_aligned(ip, iocb, from,
915 			&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
916 }
917 
918 static noinline ssize_t
919 xfs_file_dax_write(
920 	struct kiocb		*iocb,
921 	struct iov_iter		*from)
922 {
923 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
924 	struct xfs_inode	*ip = XFS_I(inode);
925 	unsigned int		iolock = XFS_IOLOCK_EXCL;
926 	ssize_t			ret, error = 0;
927 	loff_t			pos;
928 
929 	ret = xfs_ilock_iocb(iocb, iolock);
930 	if (ret)
931 		return ret;
932 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
933 	if (ret)
934 		goto out;
935 
936 	pos = iocb->ki_pos;
937 
938 	trace_xfs_file_dax_write(iocb, from);
939 	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
940 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
941 		i_size_write(inode, iocb->ki_pos);
942 		error = xfs_setfilesize(ip, pos, ret);
943 	}
944 out:
945 	if (iolock)
946 		xfs_iunlock(ip, iolock);
947 	if (error)
948 		return error;
949 
950 	if (ret > 0) {
951 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
952 
953 		/* Handle various SYNC-type writes */
954 		ret = generic_write_sync(iocb, ret);
955 	}
956 	return ret;
957 }
958 
959 STATIC ssize_t
960 xfs_file_buffered_write(
961 	struct kiocb		*iocb,
962 	struct iov_iter		*from)
963 {
964 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
965 	struct xfs_inode	*ip = XFS_I(inode);
966 	ssize_t			ret;
967 	bool			cleared_space = false;
968 	unsigned int		iolock;
969 
970 write_retry:
971 	iolock = XFS_IOLOCK_EXCL;
972 	ret = xfs_ilock_iocb(iocb, iolock);
973 	if (ret)
974 		return ret;
975 
976 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
977 	if (ret)
978 		goto out;
979 
980 	trace_xfs_file_buffered_write(iocb, from);
981 	ret = iomap_file_buffered_write(iocb, from,
982 			&xfs_buffered_write_iomap_ops, NULL);
983 
984 	/*
985 	 * If we hit a space limit, try to free up some lingering preallocated
986 	 * space before returning an error. In the case of ENOSPC, first try to
987 	 * write back all dirty inodes to free up some of the excess reserved
988 	 * metadata space. This reduces the chances that the eofblocks scan
989 	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
990 	 * also behaves as a filter to prevent too many eofblocks scans from
991 	 * running at the same time.  Use a synchronous scan to increase the
992 	 * effectiveness of the scan.
993 	 */
994 	if (ret == -EDQUOT && !cleared_space) {
995 		xfs_iunlock(ip, iolock);
996 		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
997 		cleared_space = true;
998 		goto write_retry;
999 	} else if (ret == -ENOSPC && !cleared_space) {
1000 		struct xfs_icwalk	icw = {0};
1001 
1002 		cleared_space = true;
1003 		xfs_flush_inodes(ip->i_mount);
1004 
1005 		xfs_iunlock(ip, iolock);
1006 		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
1007 		xfs_blockgc_free_space(ip->i_mount, &icw);
1008 		goto write_retry;
1009 	}
1010 
1011 out:
1012 	if (iolock)
1013 		xfs_iunlock(ip, iolock);
1014 
1015 	if (ret > 0) {
1016 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
1017 		/* Handle various SYNC-type writes */
1018 		ret = generic_write_sync(iocb, ret);
1019 	}
1020 	return ret;
1021 }
1022 
1023 STATIC ssize_t
1024 xfs_file_buffered_write_zoned(
1025 	struct kiocb		*iocb,
1026 	struct iov_iter		*from)
1027 {
1028 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
1029 	struct xfs_mount	*mp = ip->i_mount;
1030 	unsigned int		iolock = XFS_IOLOCK_EXCL;
1031 	bool			cleared_space = false;
1032 	struct xfs_zone_alloc_ctx ac = { };
1033 	ssize_t			ret;
1034 
1035 	ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
1036 	if (ret < 0)
1037 		return ret;
1038 
1039 	ret = xfs_ilock_iocb(iocb, iolock);
1040 	if (ret)
1041 		goto out_unreserve;
1042 
1043 	ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
1044 	if (ret)
1045 		goto out_unlock;
1046 
1047 	/*
1048 	 * Truncate the iter to the length that we were actually able to
1049 	 * allocate blocks for.  This needs to happen after
1050 	 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
1051 	 * writes.
1052 	 */
1053 	iov_iter_truncate(from,
1054 			XFS_FSB_TO_B(mp, ac.reserved_blocks) -
1055 			(iocb->ki_pos & mp->m_blockmask));
1056 	if (!iov_iter_count(from))
1057 		goto out_unlock;
1058 
1059 retry:
1060 	trace_xfs_file_buffered_write(iocb, from);
1061 	ret = iomap_file_buffered_write(iocb, from,
1062 			&xfs_buffered_write_iomap_ops, &ac);
1063 	if (ret == -ENOSPC && !cleared_space) {
1064 		/*
1065 		 * Kick off writeback to convert delalloc space and release the
1066 		 * usually too pessimistic indirect block reservations.
1067 		 */
1068 		xfs_flush_inodes(mp);
1069 		cleared_space = true;
1070 		goto retry;
1071 	}
1072 
1073 out_unlock:
1074 	xfs_iunlock(ip, iolock);
1075 out_unreserve:
1076 	xfs_zoned_space_unreserve(ip, &ac);
1077 	if (ret > 0) {
1078 		XFS_STATS_ADD(mp, xs_write_bytes, ret);
1079 		ret = generic_write_sync(iocb, ret);
1080 	}
1081 	return ret;
1082 }
1083 
1084 STATIC ssize_t
1085 xfs_file_write_iter(
1086 	struct kiocb		*iocb,
1087 	struct iov_iter		*from)
1088 {
1089 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
1090 	struct xfs_inode	*ip = XFS_I(inode);
1091 	ssize_t			ret;
1092 	size_t			ocount = iov_iter_count(from);
1093 
1094 	XFS_STATS_INC(ip->i_mount, xs_write_calls);
1095 
1096 	if (ocount == 0)
1097 		return 0;
1098 
1099 	if (xfs_is_shutdown(ip->i_mount))
1100 		return -EIO;
1101 
1102 	if (IS_DAX(inode))
1103 		return xfs_file_dax_write(iocb, from);
1104 
1105 	if (iocb->ki_flags & IOCB_ATOMIC) {
1106 		if (ocount < xfs_get_atomic_write_min(ip))
1107 			return -EINVAL;
1108 
1109 		if (ocount > xfs_get_atomic_write_max(ip))
1110 			return -EINVAL;
1111 
1112 		ret = generic_atomic_write_valid(iocb, from);
1113 		if (ret)
1114 			return ret;
1115 	}
1116 
1117 	if (iocb->ki_flags & IOCB_DIRECT) {
1118 		/*
1119 		 * Allow a directio write to fall back to a buffered
1120 		 * write *only* in the case that we're doing a reflink
1121 		 * CoW.  In all other directio scenarios we do not
1122 		 * allow an operation to fall back to buffered mode.
1123 		 */
1124 		ret = xfs_file_dio_write(iocb, from);
1125 		if (ret != -ENOTBLK)
1126 			return ret;
1127 	}
1128 
1129 	if (xfs_is_zoned_inode(ip))
1130 		return xfs_file_buffered_write_zoned(iocb, from);
1131 	return xfs_file_buffered_write(iocb, from);
1132 }
1133 
1134 /* Does this file, inode, or mount want synchronous writes? */
1135 static inline bool xfs_file_sync_writes(struct file *filp)
1136 {
1137 	struct xfs_inode	*ip = XFS_I(file_inode(filp));
1138 
1139 	if (xfs_has_wsync(ip->i_mount))
1140 		return true;
1141 	if (filp->f_flags & (__O_SYNC | O_DSYNC))
1142 		return true;
1143 	if (IS_SYNC(file_inode(filp)))
1144 		return true;
1145 
1146 	return false;
1147 }
1148 
1149 static int
1150 xfs_falloc_newsize(
1151 	struct file		*file,
1152 	int			mode,
1153 	loff_t			offset,
1154 	loff_t			len,
1155 	loff_t			*new_size)
1156 {
1157 	struct inode		*inode = file_inode(file);
1158 
1159 	if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
1160 		return 0;
1161 	*new_size = offset + len;
1162 	return inode_newsize_ok(inode, *new_size);
1163 }
1164 
1165 static int
1166 xfs_falloc_setsize(
1167 	struct file		*file,
1168 	loff_t			new_size)
1169 {
1170 	struct iattr iattr = {
1171 		.ia_valid	= ATTR_SIZE,
1172 		.ia_size	= new_size,
1173 	};
1174 
1175 	if (!new_size)
1176 		return 0;
1177 	return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
1178 			&iattr);
1179 }
1180 
1181 static int
1182 xfs_falloc_collapse_range(
1183 	struct file		*file,
1184 	loff_t			offset,
1185 	loff_t			len,
1186 	struct xfs_zone_alloc_ctx *ac)
1187 {
1188 	struct inode		*inode = file_inode(file);
1189 	loff_t			new_size = i_size_read(inode) - len;
1190 	int			error;
1191 
1192 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1193 		return -EINVAL;
1194 
1195 	/*
1196 	 * There is no need to overlap collapse range with EOF, in which case it
1197 	 * is effectively a truncate operation
1198 	 */
1199 	if (offset + len >= i_size_read(inode))
1200 		return -EINVAL;
1201 
1202 	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
1203 	if (error)
1204 		return error;
1205 	return xfs_falloc_setsize(file, new_size);
1206 }
1207 
1208 static int
1209 xfs_falloc_insert_range(
1210 	struct file		*file,
1211 	loff_t			offset,
1212 	loff_t			len)
1213 {
1214 	struct inode		*inode = file_inode(file);
1215 	loff_t			isize = i_size_read(inode);
1216 	int			error;
1217 
1218 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1219 		return -EINVAL;
1220 
1221 	/*
1222 	 * New inode size must not exceed ->s_maxbytes, accounting for
1223 	 * possible signed overflow.
1224 	 */
1225 	if (inode->i_sb->s_maxbytes - isize < len)
1226 		return -EFBIG;
1227 
1228 	/* Offset should be less than i_size */
1229 	if (offset >= isize)
1230 		return -EINVAL;
1231 
1232 	error = xfs_falloc_setsize(file, isize + len);
1233 	if (error)
1234 		return error;
1235 
1236 	/*
1237 	 * Perform hole insertion now that the file size has been updated so
1238 	 * that if we crash during the operation we don't leave shifted extents
1239 	 * past EOF and hence losing access to the data that is contained within
1240 	 * them.
1241 	 */
1242 	return xfs_insert_file_space(XFS_I(inode), offset, len);
1243 }
1244 
1245 /*
1246  * Punch a hole and prealloc the range.  We use a hole punch rather than
1247  * unwritten extent conversion for two reasons:
1248  *
1249  *   1.) Hole punch handles partial block zeroing for us.
1250  *   2.) If prealloc returns ENOSPC, the file range is still zero-valued by
1251  *	 virtue of the hole punch.
1252  */
1253 static int
1254 xfs_falloc_zero_range(
1255 	struct file		*file,
1256 	int			mode,
1257 	loff_t			offset,
1258 	loff_t			len,
1259 	struct xfs_zone_alloc_ctx *ac)
1260 {
1261 	struct inode		*inode = file_inode(file);
1262 	unsigned int		blksize = i_blocksize(inode);
1263 	loff_t			new_size = 0;
1264 	int			error;
1265 
1266 	trace_xfs_zero_file_space(XFS_I(inode));
1267 
1268 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1269 	if (error)
1270 		return error;
1271 
1272 	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
1273 	if (error)
1274 		return error;
1275 
1276 	len = round_up(offset + len, blksize) - round_down(offset, blksize);
1277 	offset = round_down(offset, blksize);
1278 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1279 	if (error)
1280 		return error;
1281 	return xfs_falloc_setsize(file, new_size);
1282 }
1283 
1284 static int
1285 xfs_falloc_unshare_range(
1286 	struct file		*file,
1287 	int			mode,
1288 	loff_t			offset,
1289 	loff_t			len)
1290 {
1291 	struct inode		*inode = file_inode(file);
1292 	loff_t			new_size = 0;
1293 	int			error;
1294 
1295 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1296 	if (error)
1297 		return error;
1298 
1299 	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
1300 	if (error)
1301 		return error;
1302 
1303 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1304 	if (error)
1305 		return error;
1306 	return xfs_falloc_setsize(file, new_size);
1307 }
1308 
1309 static int
1310 xfs_falloc_allocate_range(
1311 	struct file		*file,
1312 	int			mode,
1313 	loff_t			offset,
1314 	loff_t			len)
1315 {
1316 	struct inode		*inode = file_inode(file);
1317 	loff_t			new_size = 0;
1318 	int			error;
1319 
1320 	/*
1321 	 * If always_cow mode we can't use preallocations and thus should not
1322 	 * create them.
1323 	 */
1324 	if (xfs_is_always_cow_inode(XFS_I(inode)))
1325 		return -EOPNOTSUPP;
1326 
1327 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1328 	if (error)
1329 		return error;
1330 
1331 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1332 	if (error)
1333 		return error;
1334 	return xfs_falloc_setsize(file, new_size);
1335 }
1336 
1337 #define	XFS_FALLOC_FL_SUPPORTED						\
1338 		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
1339 		 FALLOC_FL_PUNCH_HOLE |	FALLOC_FL_COLLAPSE_RANGE |	\
1340 		 FALLOC_FL_ZERO_RANGE |	FALLOC_FL_INSERT_RANGE |	\
1341 		 FALLOC_FL_UNSHARE_RANGE)
1342 
1343 STATIC long
1344 __xfs_file_fallocate(
1345 	struct file		*file,
1346 	int			mode,
1347 	loff_t			offset,
1348 	loff_t			len,
1349 	struct xfs_zone_alloc_ctx *ac)
1350 {
1351 	struct inode		*inode = file_inode(file);
1352 	struct xfs_inode	*ip = XFS_I(inode);
1353 	long			error;
1354 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1355 
1356 	xfs_ilock(ip, iolock);
1357 	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
1358 	if (error)
1359 		goto out_unlock;
1360 
1361 	/*
1362 	 * Must wait for all AIO to complete before we continue as AIO can
1363 	 * change the file size on completion without holding any locks we
1364 	 * currently hold. We must do this first because AIO can update both
1365 	 * the on disk and in memory inode sizes, and the operations that follow
1366 	 * require the in-memory size to be fully up-to-date.
1367 	 */
1368 	inode_dio_wait(inode);
1369 
1370 	error = file_modified(file);
1371 	if (error)
1372 		goto out_unlock;
1373 
1374 	switch (mode & FALLOC_FL_MODE_MASK) {
1375 	case FALLOC_FL_PUNCH_HOLE:
1376 		error = xfs_free_file_space(ip, offset, len, ac);
1377 		break;
1378 	case FALLOC_FL_COLLAPSE_RANGE:
1379 		error = xfs_falloc_collapse_range(file, offset, len, ac);
1380 		break;
1381 	case FALLOC_FL_INSERT_RANGE:
1382 		error = xfs_falloc_insert_range(file, offset, len);
1383 		break;
1384 	case FALLOC_FL_ZERO_RANGE:
1385 		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
1386 		break;
1387 	case FALLOC_FL_UNSHARE_RANGE:
1388 		error = xfs_falloc_unshare_range(file, mode, offset, len);
1389 		break;
1390 	case FALLOC_FL_ALLOCATE_RANGE:
1391 		error = xfs_falloc_allocate_range(file, mode, offset, len);
1392 		break;
1393 	default:
1394 		error = -EOPNOTSUPP;
1395 		break;
1396 	}
1397 
1398 	if (!error && xfs_file_sync_writes(file))
1399 		error = xfs_log_force_inode(ip);
1400 
1401 out_unlock:
1402 	xfs_iunlock(ip, iolock);
1403 	return error;
1404 }
1405 
1406 static long
1407 xfs_file_zoned_fallocate(
1408 	struct file		*file,
1409 	int			mode,
1410 	loff_t			offset,
1411 	loff_t			len)
1412 {
1413 	struct xfs_zone_alloc_ctx ac = { };
1414 	struct xfs_inode	*ip = XFS_I(file_inode(file));
1415 	int			error;
1416 
1417 	error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
1418 	if (error)
1419 		return error;
1420 	error = __xfs_file_fallocate(file, mode, offset, len, &ac);
1421 	xfs_zoned_space_unreserve(ip, &ac);
1422 	return error;
1423 }
1424 
1425 static long
1426 xfs_file_fallocate(
1427 	struct file		*file,
1428 	int			mode,
1429 	loff_t			offset,
1430 	loff_t			len)
1431 {
1432 	struct inode		*inode = file_inode(file);
1433 
1434 	if (!S_ISREG(inode->i_mode))
1435 		return -EINVAL;
1436 	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
1437 		return -EOPNOTSUPP;
1438 
1439 	/*
1440 	 * For zoned file systems, zeroing the first and last block of a hole
1441 	 * punch requires allocating a new block to rewrite the remaining data
1442 	 * and new zeroes out of place.  Get a reservations for those before
1443 	 * taking the iolock.  Dip into the reserved pool because we are
1444 	 * expected to be able to punch a hole even on a completely full
1445 	 * file system.
1446 	 */
1447 	if (xfs_is_zoned_inode(XFS_I(inode)) &&
1448 	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
1449 		     FALLOC_FL_COLLAPSE_RANGE)))
1450 		return xfs_file_zoned_fallocate(file, mode, offset, len);
1451 	return __xfs_file_fallocate(file, mode, offset, len, NULL);
1452 }
1453 
1454 STATIC int
1455 xfs_file_fadvise(
1456 	struct file	*file,
1457 	loff_t		start,
1458 	loff_t		end,
1459 	int		advice)
1460 {
1461 	struct xfs_inode *ip = XFS_I(file_inode(file));
1462 	int ret;
1463 	int lockflags = 0;
1464 
1465 	/*
1466 	 * Operations creating pages in page cache need protection from hole
1467 	 * punching and similar ops
1468 	 */
1469 	if (advice == POSIX_FADV_WILLNEED) {
1470 		lockflags = XFS_IOLOCK_SHARED;
1471 		xfs_ilock(ip, lockflags);
1472 	}
1473 	ret = generic_fadvise(file, start, end, advice);
1474 	if (lockflags)
1475 		xfs_iunlock(ip, lockflags);
1476 	return ret;
1477 }
1478 
1479 STATIC loff_t
1480 xfs_file_remap_range(
1481 	struct file		*file_in,
1482 	loff_t			pos_in,
1483 	struct file		*file_out,
1484 	loff_t			pos_out,
1485 	loff_t			len,
1486 	unsigned int		remap_flags)
1487 {
1488 	struct inode		*inode_in = file_inode(file_in);
1489 	struct xfs_inode	*src = XFS_I(inode_in);
1490 	struct inode		*inode_out = file_inode(file_out);
1491 	struct xfs_inode	*dest = XFS_I(inode_out);
1492 	struct xfs_mount	*mp = src->i_mount;
1493 	loff_t			remapped = 0;
1494 	xfs_extlen_t		cowextsize;
1495 	int			ret;
1496 
1497 	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1498 		return -EINVAL;
1499 
1500 	if (!xfs_has_reflink(mp))
1501 		return -EOPNOTSUPP;
1502 
1503 	if (xfs_is_shutdown(mp))
1504 		return -EIO;
1505 
1506 	/* Prepare and then clone file data. */
1507 	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1508 			&len, remap_flags);
1509 	if (ret || len == 0)
1510 		return ret;
1511 
1512 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1513 
1514 	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1515 			&remapped);
1516 	if (ret)
1517 		goto out_unlock;
1518 
1519 	/*
1520 	 * Carry the cowextsize hint from src to dest if we're sharing the
1521 	 * entire source file to the entire destination file, the source file
1522 	 * has a cowextsize hint, and the destination file does not.
1523 	 */
1524 	cowextsize = 0;
1525 	if (pos_in == 0 && len == i_size_read(inode_in) &&
1526 	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1527 	    pos_out == 0 && len >= i_size_read(inode_out) &&
1528 	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1529 		cowextsize = src->i_cowextsize;
1530 
1531 	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1532 			remap_flags);
1533 	if (ret)
1534 		goto out_unlock;
1535 
1536 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1537 		xfs_log_force_inode(dest);
1538 out_unlock:
1539 	xfs_iunlock2_remapping(src, dest);
1540 	if (ret)
1541 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1542 	/*
1543 	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
1544 	 * handle partial results -- either the whole remap succeeds, or we
1545 	 * must say why it did not.  In this case, any error should be returned
1546 	 * to the caller.
1547 	 */
1548 	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
1549 		return ret;
1550 	return remapped > 0 ? remapped : ret;
1551 }
1552 
1553 STATIC int
1554 xfs_file_open(
1555 	struct inode	*inode,
1556 	struct file	*file)
1557 {
1558 	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1559 		return -EIO;
1560 	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
1561 	if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
1562 		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
1563 	return generic_file_open(inode, file);
1564 }
1565 
1566 STATIC int
1567 xfs_dir_open(
1568 	struct inode	*inode,
1569 	struct file	*file)
1570 {
1571 	struct xfs_inode *ip = XFS_I(inode);
1572 	unsigned int	mode;
1573 	int		error;
1574 
1575 	if (xfs_is_shutdown(ip->i_mount))
1576 		return -EIO;
1577 	error = generic_file_open(inode, file);
1578 	if (error)
1579 		return error;
1580 
1581 	/*
1582 	 * If there are any blocks, read-ahead block 0 as we're almost
1583 	 * certain to have the next operation be a read there.
1584 	 */
1585 	mode = xfs_ilock_data_map_shared(ip);
1586 	if (ip->i_df.if_nextents > 0)
1587 		error = xfs_dir3_data_readahead(ip, 0, 0);
1588 	xfs_iunlock(ip, mode);
1589 	return error;
1590 }
1591 
1592 /*
1593  * Don't bother propagating errors.  We're just doing cleanup, and the caller
1594  * ignores the return value anyway.
1595  */
1596 STATIC int
1597 xfs_file_release(
1598 	struct inode		*inode,
1599 	struct file		*file)
1600 {
1601 	struct xfs_inode	*ip = XFS_I(inode);
1602 	struct xfs_mount	*mp = ip->i_mount;
1603 
1604 	/*
1605 	 * If this is a read-only mount or the file system has been shut down,
1606 	 * don't generate I/O.
1607 	 */
1608 	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
1609 		return 0;
1610 
1611 	/*
1612 	 * If we previously truncated this file and removed old data in the
1613 	 * process, we want to initiate "early" writeout on the last close.
1614 	 * This is an attempt to combat the notorious NULL files problem which
1615 	 * is particularly noticeable from a truncate down, buffered (re-)write
1616 	 * (delalloc), followed by a crash.  What we are effectively doing here
1617 	 * is significantly reducing the time window where we'd otherwise be
1618 	 * exposed to that problem.
1619 	 */
1620 	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
1621 		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
1622 		if (ip->i_delayed_blks > 0)
1623 			filemap_flush(inode->i_mapping);
1624 	}
1625 
1626 	/*
1627 	 * XFS aggressively preallocates post-EOF space to generate contiguous
1628 	 * allocations for writers that append to the end of the file.
1629 	 *
1630 	 * To support workloads that close and reopen the file frequently, these
1631 	 * preallocations usually persist after a close unless it is the first
1632 	 * close for the inode.  This is a tradeoff to generate tightly packed
1633 	 * data layouts for unpacking tarballs or similar archives that write
1634 	 * one file after another without going back to it while keeping the
1635 	 * preallocation for files that have recurring open/write/close cycles.
1636 	 *
1637 	 * This heuristic is skipped for inodes with the append-only flag as
1638 	 * that flag is rather pointless for inodes written only once.
1639 	 *
1640 	 * There is no point in freeing blocks here for open but unlinked files
1641 	 * as they will be taken care of by the inactivation path soon.
1642 	 *
1643 	 * When releasing a read-only context, don't flush data or trim post-EOF
1644 	 * blocks.  This avoids open/read/close workloads from removing EOF
1645 	 * blocks that other writers depend upon to reduce fragmentation.
1646 	 *
1647 	 * Inodes on the zoned RT device never have preallocations, so skip
1648 	 * taking the locks below.
1649 	 */
1650 	if (!inode->i_nlink ||
1651 	    !(file->f_mode & FMODE_WRITE) ||
1652 	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
1653 	    xfs_is_zoned_inode(ip))
1654 		return 0;
1655 
1656 	/*
1657 	 * If we can't get the iolock just skip truncating the blocks past EOF
1658 	 * because we could deadlock with the mmap_lock otherwise. We'll get
1659 	 * another chance to drop them once the last reference to the inode is
1660 	 * dropped, so we'll never leak blocks permanently.
1661 	 */
1662 	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
1663 	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1664 		if (xfs_can_free_eofblocks(ip) &&
1665 		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
1666 			xfs_free_eofblocks(ip);
1667 		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1668 	}
1669 
1670 	return 0;
1671 }
1672 
1673 STATIC int
1674 xfs_file_readdir(
1675 	struct file	*file,
1676 	struct dir_context *ctx)
1677 {
1678 	struct inode	*inode = file_inode(file);
1679 	xfs_inode_t	*ip = XFS_I(inode);
1680 	size_t		bufsize;
1681 
1682 	/*
1683 	 * The Linux API doesn't pass down the total size of the buffer
1684 	 * we read into down to the filesystem.  With the filldir concept
1685 	 * it's not needed for correct information, but the XFS dir2 leaf
1686 	 * code wants an estimate of the buffer size to calculate it's
1687 	 * readahead window and size the buffers used for mapping to
1688 	 * physical blocks.
1689 	 *
1690 	 * Try to give it an estimate that's good enough, maybe at some
1691 	 * point we can change the ->readdir prototype to include the
1692 	 * buffer size.  For now we use the current glibc buffer size.
1693 	 */
1694 	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1695 
1696 	return xfs_readdir(NULL, ip, ctx, bufsize);
1697 }
1698 
1699 STATIC loff_t
1700 xfs_file_llseek(
1701 	struct file	*file,
1702 	loff_t		offset,
1703 	int		whence)
1704 {
1705 	struct inode		*inode = file->f_mapping->host;
1706 
1707 	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1708 		return -EIO;
1709 
1710 	switch (whence) {
1711 	default:
1712 		return generic_file_llseek(file, offset, whence);
1713 	case SEEK_HOLE:
1714 		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1715 		break;
1716 	case SEEK_DATA:
1717 		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1718 		break;
1719 	}
1720 
1721 	if (offset < 0)
1722 		return offset;
1723 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1724 }
1725 
1726 static inline vm_fault_t
1727 xfs_dax_fault_locked(
1728 	struct vm_fault		*vmf,
1729 	unsigned int		order,
1730 	bool			write_fault)
1731 {
1732 	vm_fault_t		ret;
1733 	pfn_t			pfn;
1734 
1735 	if (!IS_ENABLED(CONFIG_FS_DAX)) {
1736 		ASSERT(0);
1737 		return VM_FAULT_SIGBUS;
1738 	}
1739 	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
1740 			(write_fault && !vmf->cow_page) ?
1741 				&xfs_dax_write_iomap_ops :
1742 				&xfs_read_iomap_ops);
1743 	if (ret & VM_FAULT_NEEDDSYNC)
1744 		ret = dax_finish_sync_fault(vmf, order, pfn);
1745 	return ret;
1746 }
1747 
1748 static vm_fault_t
1749 xfs_dax_read_fault(
1750 	struct vm_fault		*vmf,
1751 	unsigned int		order)
1752 {
1753 	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
1754 	vm_fault_t		ret;
1755 
1756 	trace_xfs_read_fault(ip, order);
1757 
1758 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1759 	ret = xfs_dax_fault_locked(vmf, order, false);
1760 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1761 
1762 	return ret;
1763 }
1764 
1765 /*
1766  * Locking for serialisation of IO during page faults. This results in a lock
1767  * ordering of:
1768  *
1769  * mmap_lock (MM)
1770  *   sb_start_pagefault(vfs, freeze)
1771  *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1772  *       page_lock (MM)
1773  *         i_lock (XFS - extent map serialisation)
1774  */
1775 static vm_fault_t
1776 __xfs_write_fault(
1777 	struct vm_fault		*vmf,
1778 	unsigned int		order,
1779 	struct xfs_zone_alloc_ctx *ac)
1780 {
1781 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1782 	struct xfs_inode	*ip = XFS_I(inode);
1783 	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
1784 	vm_fault_t		ret;
1785 
1786 	trace_xfs_write_fault(ip, order);
1787 
1788 	sb_start_pagefault(inode->i_sb);
1789 	file_update_time(vmf->vma->vm_file);
1790 
1791 	/*
1792 	 * Normally we only need the shared mmaplock, but if a reflink remap is
1793 	 * in progress we take the exclusive lock to wait for the remap to
1794 	 * finish before taking a write fault.
1795 	 */
1796 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1797 	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
1798 		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1799 		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1800 		lock_mode = XFS_MMAPLOCK_EXCL;
1801 	}
1802 
1803 	if (IS_DAX(inode))
1804 		ret = xfs_dax_fault_locked(vmf, order, true);
1805 	else
1806 		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
1807 				ac);
1808 	xfs_iunlock(ip, lock_mode);
1809 
1810 	sb_end_pagefault(inode->i_sb);
1811 	return ret;
1812 }
1813 
1814 static vm_fault_t
1815 xfs_write_fault_zoned(
1816 	struct vm_fault		*vmf,
1817 	unsigned int		order)
1818 {
1819 	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
1820 	unsigned int		len = folio_size(page_folio(vmf->page));
1821 	struct xfs_zone_alloc_ctx ac = { };
1822 	int			error;
1823 	vm_fault_t		ret;
1824 
1825 	/*
1826 	 * This could over-allocate as it doesn't check for truncation.
1827 	 *
1828 	 * But as the overallocation is limited to less than a folio and will be
1829 	 * release instantly that's just fine.
1830 	 */
1831 	error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
1832 			&ac);
1833 	if (error < 0)
1834 		return vmf_fs_error(error);
1835 	ret = __xfs_write_fault(vmf, order, &ac);
1836 	xfs_zoned_space_unreserve(ip, &ac);
1837 	return ret;
1838 }
1839 
1840 static vm_fault_t
1841 xfs_write_fault(
1842 	struct vm_fault		*vmf,
1843 	unsigned int		order)
1844 {
1845 	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
1846 		return xfs_write_fault_zoned(vmf, order);
1847 	return __xfs_write_fault(vmf, order, NULL);
1848 }
1849 
1850 static inline bool
1851 xfs_is_write_fault(
1852 	struct vm_fault		*vmf)
1853 {
1854 	return (vmf->flags & FAULT_FLAG_WRITE) &&
1855 	       (vmf->vma->vm_flags & VM_SHARED);
1856 }
1857 
1858 static vm_fault_t
1859 xfs_filemap_fault(
1860 	struct vm_fault		*vmf)
1861 {
1862 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1863 
1864 	/* DAX can shortcut the normal fault path on write faults! */
1865 	if (IS_DAX(inode)) {
1866 		if (xfs_is_write_fault(vmf))
1867 			return xfs_write_fault(vmf, 0);
1868 		return xfs_dax_read_fault(vmf, 0);
1869 	}
1870 
1871 	trace_xfs_read_fault(XFS_I(inode), 0);
1872 	return filemap_fault(vmf);
1873 }
1874 
1875 static vm_fault_t
1876 xfs_filemap_huge_fault(
1877 	struct vm_fault		*vmf,
1878 	unsigned int		order)
1879 {
1880 	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1881 		return VM_FAULT_FALLBACK;
1882 
1883 	/* DAX can shortcut the normal fault path on write faults! */
1884 	if (xfs_is_write_fault(vmf))
1885 		return xfs_write_fault(vmf, order);
1886 	return xfs_dax_read_fault(vmf, order);
1887 }
1888 
1889 static vm_fault_t
1890 xfs_filemap_page_mkwrite(
1891 	struct vm_fault		*vmf)
1892 {
1893 	return xfs_write_fault(vmf, 0);
1894 }
1895 
1896 /*
1897  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1898  * on write faults. In reality, it needs to serialise against truncate and
1899  * prepare memory for writing so handle is as standard write fault.
1900  */
1901 static vm_fault_t
1902 xfs_filemap_pfn_mkwrite(
1903 	struct vm_fault		*vmf)
1904 {
1905 	return xfs_write_fault(vmf, 0);
1906 }
1907 
1908 static const struct vm_operations_struct xfs_file_vm_ops = {
1909 	.fault		= xfs_filemap_fault,
1910 	.huge_fault	= xfs_filemap_huge_fault,
1911 	.map_pages	= filemap_map_pages,
1912 	.page_mkwrite	= xfs_filemap_page_mkwrite,
1913 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
1914 };
1915 
1916 STATIC int
1917 xfs_file_mmap(
1918 	struct file		*file,
1919 	struct vm_area_struct	*vma)
1920 {
1921 	struct inode		*inode = file_inode(file);
1922 	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));
1923 
1924 	/*
1925 	 * We don't support synchronous mappings for non-DAX files and
1926 	 * for DAX files if underneath dax_device is not synchronous.
1927 	 */
1928 	if (!daxdev_mapping_supported(vma, target->bt_daxdev))
1929 		return -EOPNOTSUPP;
1930 
1931 	file_accessed(file);
1932 	vma->vm_ops = &xfs_file_vm_ops;
1933 	if (IS_DAX(inode))
1934 		vm_flags_set(vma, VM_HUGEPAGE);
1935 	return 0;
1936 }
1937 
1938 const struct file_operations xfs_file_operations = {
1939 	.llseek		= xfs_file_llseek,
1940 	.read_iter	= xfs_file_read_iter,
1941 	.write_iter	= xfs_file_write_iter,
1942 	.splice_read	= xfs_file_splice_read,
1943 	.splice_write	= iter_file_splice_write,
1944 	.iopoll		= iocb_bio_iopoll,
1945 	.unlocked_ioctl	= xfs_file_ioctl,
1946 #ifdef CONFIG_COMPAT
1947 	.compat_ioctl	= xfs_file_compat_ioctl,
1948 #endif
1949 	.mmap		= xfs_file_mmap,
1950 	.open		= xfs_file_open,
1951 	.release	= xfs_file_release,
1952 	.fsync		= xfs_file_fsync,
1953 	.get_unmapped_area = thp_get_unmapped_area,
1954 	.fallocate	= xfs_file_fallocate,
1955 	.fadvise	= xfs_file_fadvise,
1956 	.remap_file_range = xfs_file_remap_range,
1957 	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
1958 			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
1959 			  FOP_DONTCACHE,
1960 };
1961 
1962 const struct file_operations xfs_dir_file_operations = {
1963 	.open		= xfs_dir_open,
1964 	.read		= generic_read_dir,
1965 	.iterate_shared	= xfs_file_readdir,
1966 	.llseek		= generic_file_llseek,
1967 	.unlocked_ioctl	= xfs_file_ioctl,
1968 #ifdef CONFIG_COMPAT
1969 	.compat_ioctl	= xfs_file_compat_ioctl,
1970 #endif
1971 	.fsync		= xfs_dir_fsync,
1972 };
1973