xref: /linux/fs/xfs/xfs_file.c (revision 6f7e6393d1ce636bb7ec77a7fe7b77458fddf701)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs_platform.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_bmap.h"
17 #include "xfs_bmap_util.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ioctl.h"
21 #include "xfs_trace.h"
22 #include "xfs_log.h"
23 #include "xfs_icache.h"
24 #include "xfs_pnfs.h"
25 #include "xfs_iomap.h"
26 #include "xfs_reflink.h"
27 #include "xfs_file.h"
28 #include "xfs_aops.h"
29 #include "xfs_zone_alloc.h"
30 #include "xfs_error.h"
31 #include "xfs_errortag.h"
32 
33 #include <linux/dax.h>
34 #include <linux/falloc.h>
35 #include <linux/backing-dev.h>
36 #include <linux/mman.h>
37 #include <linux/fadvise.h>
38 #include <linux/mount.h>
39 #include <linux/filelock.h>
40 
41 static const struct vm_operations_struct xfs_file_vm_ops;
42 
43 /*
44  * Decide if the given file range is aligned to the size of the fundamental
45  * allocation unit for the file.
46  */
47 bool
48 xfs_is_falloc_aligned(
49 	struct xfs_inode	*ip,
50 	loff_t			pos,
51 	long long int		len)
52 {
53 	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);
54 
55 	if (!is_power_of_2(alloc_unit))
56 		return isaligned_64(pos, alloc_unit) &&
57 		       isaligned_64(len, alloc_unit);
58 
59 	return !((pos | len) & (alloc_unit - 1));
60 }
61 
62 /*
63  * Fsync operations on directories are much simpler than on regular files,
64  * as there is no file data to flush, and thus also no need for explicit
65  * cache flush operations, and there are no non-transaction metadata updates
66  * on directories either.
67  */
68 STATIC int
69 xfs_dir_fsync(
70 	struct file		*file,
71 	loff_t			start,
72 	loff_t			end,
73 	int			datasync)
74 {
75 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
76 
77 	trace_xfs_dir_fsync(ip);
78 	return xfs_log_force_inode(ip);
79 }
80 
81 /*
82  * All metadata updates are logged, which means that we just have to push the
83  * journal to the required sequence number than holds the updates. We track
84  * datasync commits separately to full sync commits, and hence only need to
85  * select the correct sequence number for the log force here.
86  *
87  * We don't have to serialise against concurrent modifications, as we do not
88  * have to wait for modifications that have not yet completed. We define a
89  * transaction commit as completing when the commit sequence number is updated,
90  * hence if the sequence number has not updated, the sync operation has been
91  * run before the commit completed and we don't have to wait for it.
92  *
93  * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
94  * set on the log item until - at least - the journal flush completes. In
95  * reality, they are only cleared when the inode is fully unpinned (i.e.
96  * persistent in the journal and not dirty in the CIL), and so we rely on
97  * xfs_log_force_seq() either skipping sequences that have been persisted or
98  * waiting on sequences that are still in flight to correctly order concurrent
99  * sync operations.
100  */
101 static int
102 xfs_fsync_flush_log(
103 	struct xfs_inode	*ip,
104 	bool			datasync,
105 	int			*log_flushed)
106 {
107 	struct xfs_inode_log_item *iip = ip->i_itemp;
108 	xfs_csn_t		seq = 0;
109 
110 	spin_lock(&iip->ili_lock);
111 	if (datasync)
112 		seq = iip->ili_datasync_seq;
113 	else
114 		seq = iip->ili_commit_seq;
115 	spin_unlock(&iip->ili_lock);
116 
117 	if (!seq)
118 		return 0;
119 
120 	return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
121 					  log_flushed);
122 }
123 
/*
 * Fsync for regular files: write back and wait on the data range, flush the
 * relevant device write caches, and force the journal as needed so both data
 * and logged inode changes are persistent on return.
 */
STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	int			error, err2;
	int			log_flushed = 0;

	trace_xfs_file_fsync(ip);

	/* Push dirty pagecache data and wait for writeback to finish. */
	error = file_write_and_wait_range(file, start, end);
	if (error)
		return error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache the device used for file data first.  This is to
	 * ensure newly written file data make it to disk before logging the new
	 * inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);

	/*
	 * If the inode has an inode log item attached, it may need the journal
	 * flushed to persist any changes the log item might be tracking.
	 */
	if (ip->i_itemp) {
		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
		if (err2 && !error)
			error = err2;
	}

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp) {
		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
		if (err2 && !error)
			error = err2;
	}

	return error;
}
184 
185 static int
186 xfs_ilock_iocb(
187 	struct kiocb		*iocb,
188 	unsigned int		lock_mode)
189 {
190 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
191 
192 	if (iocb->ki_flags & IOCB_NOWAIT) {
193 		if (!xfs_ilock_nowait(ip, lock_mode))
194 			return -EAGAIN;
195 	} else {
196 		xfs_ilock(ip, lock_mode);
197 	}
198 
199 	return 0;
200 }
201 
202 static int
203 xfs_ilock_iocb_for_write(
204 	struct kiocb		*iocb,
205 	unsigned int		*lock_mode)
206 {
207 	ssize_t			ret;
208 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
209 
210 	ret = xfs_ilock_iocb(iocb, *lock_mode);
211 	if (ret)
212 		return ret;
213 
214 	/*
215 	 * If a reflink remap is in progress we always need to take the iolock
216 	 * exclusively to wait for it to finish.
217 	 */
218 	if (*lock_mode == XFS_IOLOCK_SHARED &&
219 	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
220 		xfs_iunlock(ip, *lock_mode);
221 		*lock_mode = XFS_IOLOCK_EXCL;
222 		return xfs_ilock_iocb(iocb, *lock_mode);
223 	}
224 
225 	return 0;
226 }
227 
/*
 * Bounce buffering dio reads need a user context to copy back the data.
 * Use an ioend to provide that.
 */
static void
xfs_dio_read_bounce_submit_io(
	const struct iomap_iter	*iter,
	struct bio		*bio,
	loff_t			file_offset)
{
	/* Route completion through xfs_end_bio so copy-back has a context. */
	iomap_init_ioend(iter->inode, bio, file_offset, IOMAP_IOEND_DIRECT);
	bio->bi_end_io = xfs_end_bio;
	submit_bio(bio);
}

/* dio ops used for bounce-buffered direct reads. */
static const struct iomap_dio_ops xfs_dio_read_bounce_ops = {
	.submit_io	= xfs_dio_read_bounce_submit_io,
	.bio_set	= &iomap_ioend_bioset,
};
247 
248 STATIC ssize_t
249 xfs_file_dio_read(
250 	struct kiocb		*iocb,
251 	struct iov_iter		*to)
252 {
253 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
254 	unsigned int		dio_flags = 0;
255 	const struct iomap_dio_ops *dio_ops = NULL;
256 	ssize_t			ret;
257 
258 	trace_xfs_file_direct_read(iocb, to);
259 
260 	if (!iov_iter_count(to))
261 		return 0; /* skip atime */
262 
263 	file_accessed(iocb->ki_filp);
264 
265 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
266 	if (ret)
267 		return ret;
268 	if (mapping_stable_writes(iocb->ki_filp->f_mapping)) {
269 		dio_ops = &xfs_dio_read_bounce_ops;
270 		dio_flags |= IOMAP_DIO_BOUNCE;
271 	}
272 	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, dio_ops, dio_flags,
273 			NULL, 0);
274 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
275 
276 	return ret;
277 }
278 
279 static noinline ssize_t
280 xfs_file_dax_read(
281 	struct kiocb		*iocb,
282 	struct iov_iter		*to)
283 {
284 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
285 	ssize_t			ret = 0;
286 
287 	trace_xfs_file_dax_read(iocb, to);
288 
289 	if (!iov_iter_count(to))
290 		return 0; /* skip atime */
291 
292 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
293 	if (ret)
294 		return ret;
295 	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
296 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
297 
298 	file_accessed(iocb->ki_filp);
299 	return ret;
300 }
301 
302 STATIC ssize_t
303 xfs_file_buffered_read(
304 	struct kiocb		*iocb,
305 	struct iov_iter		*to)
306 {
307 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
308 	ssize_t			ret;
309 
310 	trace_xfs_file_buffered_read(iocb, to);
311 
312 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
313 	if (ret)
314 		return ret;
315 	ret = generic_file_read_iter(iocb, to);
316 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
317 
318 	return ret;
319 }
320 
/*
 * Top-level ->read_iter: dispatch to the DAX, direct, or buffered read path
 * and account bytes read in the filesystem stats.
 */
STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* DAX takes precedence over IOCB_DIRECT for DAX-mode inodes. */
	if (IS_DAX(inode))
		ret = xfs_file_dax_read(iocb, to);
	else if (iocb->ki_flags & IOCB_DIRECT)
		ret = xfs_file_dio_read(iocb, to);
	else
		ret = xfs_file_buffered_read(iocb, to);

	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}
346 
347 STATIC ssize_t
348 xfs_file_splice_read(
349 	struct file		*in,
350 	loff_t			*ppos,
351 	struct pipe_inode_info	*pipe,
352 	size_t			len,
353 	unsigned int		flags)
354 {
355 	struct inode		*inode = file_inode(in);
356 	struct xfs_inode	*ip = XFS_I(inode);
357 	struct xfs_mount	*mp = ip->i_mount;
358 	ssize_t			ret = 0;
359 
360 	XFS_STATS_INC(mp, xs_read_calls);
361 
362 	if (xfs_is_shutdown(mp))
363 		return -EIO;
364 
365 	trace_xfs_file_splice_read(ip, *ppos, len);
366 
367 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
368 	ret = filemap_splice_read(in, ppos, pipe, len, flags);
369 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
370 	if (ret > 0)
371 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
372 	return ret;
373 }
374 
/*
 * Take care of zeroing post-EOF blocks when they might exist.
 *
 * Returns 0 on success, a negative error for a failure, or 1 if this
 * function dropped the iolock and reacquired it exclusively and the caller
 * needs to restart the write sanity checks.
 */
static ssize_t
xfs_file_write_zero_eof(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock,
	size_t			count,
	bool			*drained_dio,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	loff_t			isize;
	int			error;

	/*
	 * We need to serialise against EOF updates that occur in IO completions
	 * here. We want to make sure that nobody is changing the size while
	 * we do this check until we have placed an IO barrier (i.e. hold
	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
	 * spinlock effectively forms a memory barrier once we have
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
	 * hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	isize = i_size_read(VFS_I(ip));
	if (iocb->ki_pos <= isize) {
		/* Write starts at or before EOF: no post-EOF zeroing needed. */
		spin_unlock(&ip->i_flags_lock);
		return 0;
	}
	spin_unlock(&ip->i_flags_lock);

	/* Zeroing can block on locks and I/O; punt non-blocking writes. */
	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EAGAIN;

	if (!*drained_dio) {
		/*
		 * If zeroing is needed and we are currently holding the iolock
		 * shared, we need to update it to exclusive which implies
		 * having to redo all checks before.
		 */
		if (*iolock == XFS_IOLOCK_SHARED) {
			xfs_iunlock(ip, *iolock);
			*iolock = XFS_IOLOCK_EXCL;
			xfs_ilock(ip, *iolock);
			iov_iter_reexpand(from, count);
		}

		/*
		 * We now have an IO submission barrier in place, but AIO can do
		 * EOF updates during IO completion and hence we now need to
		 * wait for all of them to drain.  Non-AIO DIO will have drained
		 * before we are given the XFS_IOLOCK_EXCL, and so for most
		 * cases this wait is a no-op.
		 */
		inode_dio_wait(VFS_I(ip));
		*drained_dio = true;
		/* Tell the caller to restart its write checks. */
		return 1;
	}

	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);

	/* Zero from the old EOF up to the start of this write. */
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);

	return error;
}
448 
/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;
	ssize_t			error;

restart:
	/* Generic VFS checks; <= 0 covers both errors and zero-length writes. */
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	/* Break pNFS layouts; non-blocking I/O must not sleep waiting. */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		error = break_layout(inode, false);
		if (error == -EWOULDBLOCK)
			error = -EAGAIN;
	} else {
		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
	}

	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(XFS_I(inode), *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		error = xfs_ilock_iocb(iocb, *iolock);
		if (error) {
			/* Signal to the caller that no lock is held. */
			*iolock = 0;
			return error;
		}
		goto restart;
	}

	/*
	 * If the offset is beyond the size of the file, we need to zero all
	 * blocks that fall between the existing EOF and the start of this
	 * write.
	 *
	 * We can do an unlocked check for i_size here safely as I/O completion
	 * can only extend EOF.  Truncate is locked out at this point, so the
	 * EOF can not move backwards, only forwards. Hence we only need to take
	 * the slow path when we are at or beyond the current EOF.
	 */
	if (iocb->ki_pos > i_size_read(inode)) {
		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
				&drained_dio, ac);
		/* 1 means the iolock was cycled: redo all checks. */
		if (error == 1)
			goto restart;
		if (error)
			return error;
	}

	return kiocb_modified(iocb);
}
520 
/*
 * Reserve zoned-device space for a write before taking the iolock.
 *
 * Reserves enough blocks to cover the (limit-clamped) write length plus
 * slack for unaligned start/EOF blocks; surplus reservation is returned
 * after the write completes.
 */
static ssize_t
xfs_zoned_write_space_reserve(
	struct xfs_mount		*mp,
	struct kiocb			*iocb,
	struct iov_iter			*from,
	unsigned int			flags,
	struct xfs_zone_alloc_ctx	*ac)
{
	loff_t				count = iov_iter_count(from);
	int				error;

	if (iocb->ki_flags & IOCB_NOWAIT)
		flags |= XFS_ZR_NOWAIT;

	/*
	 * Check the rlimit and LFS boundary first so that we don't over-reserve
	 * by possibly a lot.
	 *
	 * The generic write path will redo this check later, and it might have
	 * changed by then.  If it got expanded we'll stick to our earlier
	 * smaller limit, and if it is decreased the new smaller limit will be
	 * used and our extra space reservation will be returned after finishing
	 * the write.
	 */
	error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
	if (error)
		return error;

	/*
	 * Sloppily round up count to file system blocks.
	 *
	 * This will often reserve an extra block, but that avoids having to look
	 * at the start offset, which isn't stable for O_APPEND until taking the
	 * iolock.  Also we need to reserve a block each for zeroing the old
	 * EOF block and the new start block if they are unaligned.
	 *
	 * Any remaining block will be returned after the write.
	 */
	return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2,
			flags, ac);
}
562 
/*
 * Direct write I/O completion handler.
 *
 * Performs the updates that can only happen once the data is on disk:
 * ending COW remaps, converting unwritten extents, and extending the
 * in-core and on-disk file size for writes beyond EOF.
 */
static int
xfs_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,
	int			error,
	unsigned		flags)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			offset = iocb->ki_pos;
	unsigned int		nofs_flag;

	/* Zoned inodes never use unwritten extents or the COW fork here. */
	ASSERT(!xfs_is_zoned_inode(ip) ||
	       !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (error)
		return error;
	if (!size)
		return 0;

	/*
	 * Capture amount written on completion as we can't reliably account
	 * for it on submission.
	 */
	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim.  To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	if (flags & IOMAP_DIO_COW) {
		/* Atomic writes commit the whole COW range atomically. */
		if (iocb->ki_flags & IOCB_ATOMIC)
			error = xfs_reflink_end_atomic_cow(ip, offset, size);
		else
			error = xfs_reflink_end_cow(ip, offset, size);
		if (error)
			goto out;
	}

	/*
	 * Unwritten conversion updates the in-core isize after extent
	 * conversion but before updating the on-disk size. Updating isize any
	 * earlier allows a racing dio read to find unwritten extents before
	 * they are converted.
	 */
	if (flags & IOMAP_DIO_UNWRITTEN) {
		error = xfs_iomap_write_unwritten(ip, offset, size, true);
		goto out;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size. We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 *
	 * As IO completion only ever extends EOF, we can do an unlocked check
	 * here to avoid taking the spinlock. If we land within the current EOF,
	 * then we do not need to do an extending update at all, and we don't
	 * need to take the lock to check this. If we race with an update moving
	 * EOF, then we'll either still be beyond EOF and need to take the lock,
	 * or we'll be within EOF and we don't need to take it at all.
	 */
	if (offset + size <= i_size_read(inode))
		goto out;

	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		spin_unlock(&ip->i_flags_lock);
		error = xfs_setfilesize(ip, offset, size);
	} else {
		spin_unlock(&ip->i_flags_lock);
	}

out:
	memalloc_nofs_restore(nofs_flag);
	return error;
}
655 
/* dio ops for non-zoned direct writes: completion handling only. */
static const struct iomap_dio_ops xfs_dio_write_ops = {
	.end_io		= xfs_dio_write_end_io,
};
659 
/*
 * Submission hook for zoned direct writes: charge the bio against the
 * caller's space reservation, then hand it to the zone allocator for
 * placement and submission.
 */
static void
xfs_dio_zoned_submit_io(
	const struct iomap_iter	*iter,
	struct bio		*bio,
	loff_t			file_offset)
{
	struct xfs_mount	*mp = XFS_I(iter->inode)->i_mount;
	struct xfs_zone_alloc_ctx *ac = iter->private;
	xfs_filblks_t		count_fsb;
	struct iomap_ioend	*ioend;

	count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
	if (count_fsb > ac->reserved_blocks) {
		/*
		 * An allocation exceeding the reservation indicates in-core
		 * accounting corruption: shut down and fail the bio.
		 */
		xfs_err(mp,
"allocation (%lld) larger than reservation (%lld).",
			count_fsb, ac->reserved_blocks);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		bio_io_error(bio);
		return;
	}
	ac->reserved_blocks -= count_fsb;

	bio->bi_end_io = xfs_end_bio;
	ioend = iomap_init_ioend(iter->inode, bio, file_offset,
			IOMAP_IOEND_DIRECT);
	xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
}

/* dio ops for zoned direct writes: allocator-driven submission + end_io. */
static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
	.bio_set	= &iomap_ioend_bioset,
	.submit_io	= xfs_dio_zoned_submit_io,
	.end_io		= xfs_dio_write_end_io,
};
693 
/*
 * Handle block aligned direct I/O writes.
 *
 * @ops/@dops select the iomap and dio ops (regular vs zoned); @ac carries
 * the zoned space reservation, or is NULL for regular devices.
 */
static noinline ssize_t
xfs_file_dio_write_aligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from,
	const struct iomap_ops	*ops,
	const struct iomap_dio_ops *dops,
	struct xfs_zone_alloc_ctx *ac)
{
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	unsigned int		dio_flags = 0;
	ssize_t			ret;

	/*
	 * For always COW inodes, each bio must be aligned to the file system
	 * block size and not just the device sector size because we need to
	 * allocate a block-aligned amount of space for each write.
	 */
	if (xfs_is_always_cow_inode(ip))
		dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED;

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock, ac);
	if (ret)
		goto out_unlock;

	/*
	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
	 * the iolock back to shared if we had to take the exclusive lock in
	 * xfs_file_write_checks() for other reasons.
	 */
	if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}
	/* Stable-write mappings require bounce buffering of the data. */
	if (mapping_stable_writes(iocb->ki_filp->f_mapping))
		dio_flags |= IOMAP_DIO_BOUNCE;
	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0);
out_unlock:
	xfs_iunlock(ip, iolock);
	return ret;
}
742 
743 /*
744  * Handle block aligned direct I/O writes to zoned devices.
745  */
746 static noinline ssize_t
747 xfs_file_dio_write_zoned(
748 	struct xfs_inode	*ip,
749 	struct kiocb		*iocb,
750 	struct iov_iter		*from)
751 {
752 	struct xfs_zone_alloc_ctx ac = { };
753 	ssize_t			ret;
754 
755 	ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac);
756 	if (ret < 0)
757 		return ret;
758 	ret = xfs_file_dio_write_aligned(ip, iocb, from,
759 			&xfs_zoned_direct_write_iomap_ops,
760 			&xfs_dio_zoned_write_ops, &ac);
761 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
762 	return ret;
763 }
764 
/*
 * Handle block atomic writes
 *
 * Two methods of atomic writes are supported:
 * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
 *   disk
 * - COW-based, which uses a COW fork as a staging extent for data updates
 *   before atomically updating extent mappings for the range being written
 *
 */
static noinline ssize_t
xfs_file_dio_write_atomic(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	ssize_t			ret, ocount = iov_iter_count(from);
	unsigned int		dio_flags = 0;
	const struct iomap_ops	*dops;

	/*
	 * HW offload should be faster, so try that first if it is already
	 * known that the write length is not too large.
	 */
	if (ocount > xfs_inode_buftarg(ip)->bt_awu_max)
		dops = &xfs_atomic_write_cow_iomap_ops;
	else
		dops = &xfs_direct_write_iomap_ops;

retry:
	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out_unlock;

	/* Demote similar to xfs_file_dio_write_aligned() */
	if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(iocb, from);
	/* Stable-write mappings require bounce buffering of the data. */
	if (mapping_stable_writes(iocb->ki_filp->f_mapping))
		dio_flags |= IOMAP_DIO_BOUNCE;
	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, dio_flags,
			NULL, 0);

	/*
	 * The retry mechanism is based on the ->iomap_begin method returning
	 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
	 * possible. The REQ_ATOMIC-based method will typically not be possible
	 * if the write spans multiple extents or the disk blocks are
	 * misaligned.
	 */
	if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
		xfs_iunlock(ip, iolock);
		dops = &xfs_atomic_write_cow_iomap_ops;
		goto retry;
	}

out_unlock:
	/* iolock may be 0 if xfs_file_write_checks() dropped it on error. */
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}
833 
/*
 * Handle block unaligned direct I/O writes
 *
 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
 * them to be done in parallel with reads and other direct I/O writes.  However,
 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
 * to do sub-block zeroing and that requires serialisation against other direct
 * I/O to the same block.  In this case we need to serialise the submission of
 * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
 * In the case where sub-block zeroing is not required, we can do concurrent
 * sub-block dios to the same block successfully.
 *
 * Optimistically submit the I/O using the shared lock first, but use the
 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
 * if block allocation or partial block zeroing would be required.  In that case
 * we try again with the exclusive lock.
 */
static noinline ssize_t
xfs_file_dio_write_unaligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	size_t			isize = i_size_read(VFS_I(ip));
	size_t			count = iov_iter_count(from);
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
	ssize_t			ret;

	/*
	 * Extending writes need exclusivity because of the sub-block zeroing
	 * that the DIO code always does for partial tail blocks beyond EOF, so
	 * don't even bother trying the fast path in this case.
	 */
	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
retry_exclusive:
		/* Reached with no lock held, both initially and on retry. */
		iolock = XFS_IOLOCK_EXCL;
		flags = IOMAP_DIO_FORCE_WAIT;
	}

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;

	/*
	 * We can't properly handle unaligned direct I/O to reflink files yet,
	 * as we can't unshare a partial block.
	 */
	if (xfs_is_cow_inode(ip)) {
		trace_xfs_reflink_bounce_dio_write(iocb, from);
		/* -ENOTBLK tells the caller to fall back to buffered I/O. */
		ret = -ENOTBLK;
		goto out_unlock;
	}

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out_unlock;

	/*
	 * If we are doing exclusive unaligned I/O, this must be the only I/O
	 * in-flight.  Otherwise we risk data corruption due to unwritten extent
	 * conversions from the AIO end_io handler.  Wait for all other I/O to
	 * drain first.
	 */
	if (flags & IOMAP_DIO_FORCE_WAIT)
		inode_dio_wait(VFS_I(ip));

	/* Stable-write mappings require bounce buffering of the data. */
	if (mapping_stable_writes(iocb->ki_filp->f_mapping))
		flags |= IOMAP_DIO_BOUNCE;

	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			   &xfs_dio_write_ops, flags, NULL, 0);

	/*
	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
	 * layer rejected it for mapping or locking reasons. If we are doing
	 * nonblocking user I/O, propagate the error.
	 */
	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
		xfs_iunlock(ip, iolock);
		goto retry_exclusive;
	}

out_unlock:
	/* iolock may be 0 if xfs_file_write_checks() dropped it on error. */
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}
926 
/*
 * Top-level direct write dispatcher: validate sector alignment, then route
 * to the unaligned, zoned, atomic, or plain aligned write path.
 */
static ssize_t
xfs_file_dio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
	size_t			count = iov_iter_count(from);

	/* direct I/O must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
		return -EINVAL;

	/* Sub-fsblock writes take the serialising unaligned path. */
	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
		return xfs_file_dio_write_unaligned(ip, iocb, from);
	if (xfs_is_zoned_inode(ip))
		return xfs_file_dio_write_zoned(ip, iocb, from);
	if (iocb->ki_flags & IOCB_ATOMIC)
		return xfs_file_dio_write_atomic(ip, iocb, from);
	return xfs_file_dio_write_aligned(ip, iocb, from,
			&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}
949 
/*
 * DAX write path: synchronous copy to the DAX mapping under the exclusive
 * iolock, extending the in-core and on-disk size for writes beyond EOF.
 */
static noinline ssize_t
xfs_file_dax_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	ssize_t			ret, error = 0;
	loff_t			pos;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

	/* ki_pos may only be final after xfs_file_write_checks() (O_APPEND). */
	pos = iocb->ki_pos;

	trace_xfs_file_dax_write(iocb, from);
	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		/* Extending write: update in-core then on-disk file size. */
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
	}
out:
	/* iolock may be 0 if xfs_file_write_checks() dropped it on error. */
	if (iolock)
		xfs_iunlock(ip, iolock);
	if (error)
		return error;

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}
990 
/*
 * Buffered write path: write through the page cache under the exclusive
 * iolock, retrying once after freeing preallocated space if the write hits
 * a quota or space limit.
 */
STATIC ssize_t
xfs_file_buffered_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	bool			cleared_space = false;
	unsigned int		iolock;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
			NULL);

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error. In the case of ENOSPC, first try to
	 * write back all dirty inodes to free up some of the excess reserved
	 * metadata space. This reduces the chances that the eofblocks scan
	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
	 * also behaves as a filter to prevent too many eofblocks scans from
	 * running at the same time.  Use a synchronous scan to increase the
	 * effectiveness of the scan.
	 */
	if (ret == -EDQUOT && !cleared_space) {
		xfs_iunlock(ip, iolock);
		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
		cleared_space = true;
		goto write_retry;
	} else if (ret == -ENOSPC && !cleared_space) {
		struct xfs_icwalk	icw = {0};

		/* cleared_space caps this recovery attempt to one retry. */
		cleared_space = true;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);
		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
		xfs_blockgc_free_space(ip->i_mount, &icw);
		goto write_retry;
	}

out:
	/* iolock may be 0 if xfs_file_write_checks() dropped it on error. */
	if (iolock)
		xfs_iunlock(ip, iolock);

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}
1055 
/*
 * Buffered write path for zoned files.
 *
 * Space on a zoned file system must be reserved before dirtying the page
 * cache, so reserve (greedily, possibly less than requested) up front,
 * truncate the iter to what was actually reserved, and release whatever is
 * left of the reservation when done.
 */
STATIC ssize_t
xfs_file_buffered_write_zoned(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	bool			cleared_space = false;	/* one-shot retry latch */
	struct xfs_zone_alloc_ctx ac = { };
	ssize_t			ret;

	ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
	if (ret < 0)
		return ret;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		goto out_unreserve;

	ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
	if (ret)
		goto out_unlock;

	/*
	 * Truncate the iter to the length that we were actually able to
	 * allocate blocks for.  This needs to happen after
	 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
	 * writes.
	 */
	iov_iter_truncate(from,
			XFS_FSB_TO_B(mp, ac.reserved_blocks) -
			(iocb->ki_pos & mp->m_blockmask));
	if (!iov_iter_count(from))
		goto out_unlock;

retry:
	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
			&ac);
	if (ret == -ENOSPC && !cleared_space) {
		/*
		 * Kick off writeback to convert delalloc space and release the
		 * usually too pessimistic indirect block reservations.
		 */
		xfs_flush_inodes(mp);
		cleared_space = true;
		goto retry;
	}

out_unlock:
	xfs_iunlock(ip, iolock);
out_unreserve:
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	if (ret > 0) {
		XFS_STATS_ADD(mp, xs_write_bytes, ret);
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}
1117 
/*
 * ->write_iter: top level write dispatcher.
 *
 * Validates IOCB_ATOMIC size constraints, then hands the write off to the
 * DAX, direct I/O, zoned buffered, or plain buffered path.  A direct write
 * may return -ENOTBLK to request a buffered fallback (reflink CoW only).
 */
STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (iocb->ki_flags & IOCB_ATOMIC) {
		/* Atomic writes must fit within the supported min/max sizes. */
		if (ocount < xfs_get_atomic_write_min(ip))
			return -EINVAL;

		if (ocount > xfs_get_atomic_write_max(ip))
			return -EINVAL;

		ret = generic_atomic_write_valid(iocb, from);
		if (ret)
			return ret;
	}

	if (IS_DAX(inode))
		return xfs_file_dax_write(iocb, from);

	if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW.  In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_write(iocb, from);
		if (ret != -ENOTBLK)
			return ret;
	}

	if (xfs_is_zoned_inode(ip))
		return xfs_file_buffered_write_zoned(iocb, from);
	return xfs_file_buffered_write(iocb, from);
}
1167 
1168 /* Does this file, inode, or mount want synchronous writes? */
1169 static inline bool xfs_file_sync_writes(struct file *filp)
1170 {
1171 	struct xfs_inode	*ip = XFS_I(file_inode(filp));
1172 
1173 	if (xfs_has_wsync(ip->i_mount))
1174 		return true;
1175 	if (filp->f_flags & (__O_SYNC | O_DSYNC))
1176 		return true;
1177 	if (IS_SYNC(file_inode(filp)))
1178 		return true;
1179 
1180 	return false;
1181 }
1182 
1183 static int
1184 xfs_falloc_newsize(
1185 	struct file		*file,
1186 	int			mode,
1187 	loff_t			offset,
1188 	loff_t			len,
1189 	loff_t			*new_size)
1190 {
1191 	struct inode		*inode = file_inode(file);
1192 
1193 	if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
1194 		return 0;
1195 	*new_size = offset + len;
1196 	return inode_newsize_ok(inode, *new_size);
1197 }
1198 
1199 static int
1200 xfs_falloc_setsize(
1201 	struct file		*file,
1202 	loff_t			new_size)
1203 {
1204 	struct iattr iattr = {
1205 		.ia_valid	= ATTR_SIZE,
1206 		.ia_size	= new_size,
1207 	};
1208 
1209 	if (!new_size)
1210 		return 0;
1211 	return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
1212 			&iattr);
1213 }
1214 
1215 static int
1216 xfs_falloc_collapse_range(
1217 	struct file		*file,
1218 	loff_t			offset,
1219 	loff_t			len,
1220 	struct xfs_zone_alloc_ctx *ac)
1221 {
1222 	struct inode		*inode = file_inode(file);
1223 	loff_t			new_size = i_size_read(inode) - len;
1224 	int			error;
1225 
1226 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1227 		return -EINVAL;
1228 
1229 	/*
1230 	 * There is no need to overlap collapse range with EOF, in which case it
1231 	 * is effectively a truncate operation
1232 	 */
1233 	if (offset + len >= i_size_read(inode))
1234 		return -EINVAL;
1235 
1236 	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
1237 	if (error)
1238 		return error;
1239 	return xfs_falloc_setsize(file, new_size);
1240 }
1241 
1242 static int
1243 xfs_falloc_insert_range(
1244 	struct file		*file,
1245 	loff_t			offset,
1246 	loff_t			len)
1247 {
1248 	struct inode		*inode = file_inode(file);
1249 	loff_t			isize = i_size_read(inode);
1250 	int			error;
1251 
1252 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1253 		return -EINVAL;
1254 
1255 	/*
1256 	 * New inode size must not exceed ->s_maxbytes, accounting for
1257 	 * possible signed overflow.
1258 	 */
1259 	if (inode->i_sb->s_maxbytes - isize < len)
1260 		return -EFBIG;
1261 
1262 	/* Offset should be less than i_size */
1263 	if (offset >= isize)
1264 		return -EINVAL;
1265 
1266 	error = xfs_falloc_setsize(file, isize + len);
1267 	if (error)
1268 		return error;
1269 
1270 	/*
1271 	 * Perform hole insertion now that the file size has been updated so
1272 	 * that if we crash during the operation we don't leave shifted extents
1273 	 * past EOF and hence losing access to the data that is contained within
1274 	 * them.
1275 	 */
1276 	return xfs_insert_file_space(XFS_I(inode), offset, len);
1277 }
1278 
1279 /*
1280  * For various operations we need to zero up to one block at each end of
1281  * the affected range.  For zoned file systems this will require a space
1282  * allocation, for which we need a reservation ahead of time.
1283  */
1284 #define XFS_ZONED_ZERO_EDGE_SPACE_RES		2
1285 
1286 /*
1287  * Zero range implements a full zeroing mechanism but is only used in limited
1288  * situations. It is more efficient to allocate unwritten extents than to
1289  * perform zeroing here, so use an errortag to randomly force zeroing on DEBUG
1290  * kernels for added test coverage.
1291  *
1292  * On zoned file systems, the error is already injected by
1293  * xfs_file_zoned_fallocate, which then reserves the additional space needed.
1294  * We only check for this extra space reservation here.
1295  */
1296 static inline bool
1297 xfs_falloc_force_zero(
1298 	struct xfs_inode		*ip,
1299 	struct xfs_zone_alloc_ctx	*ac)
1300 {
1301 	if (xfs_is_zoned_inode(ip)) {
1302 		if (ac->reserved_blocks > XFS_ZONED_ZERO_EDGE_SPACE_RES) {
1303 			ASSERT(IS_ENABLED(CONFIG_XFS_DEBUG));
1304 			return true;
1305 		}
1306 		return false;
1307 	}
1308 	return XFS_TEST_ERROR(ip->i_mount, XFS_ERRTAG_FORCE_ZERO_RANGE);
1309 }
1310 
1311 /*
1312  * Punch a hole and prealloc the range.  We use a hole punch rather than
1313  * unwritten extent conversion for two reasons:
1314  *
1315  *   1.) Hole punch handles partial block zeroing for us.
1316  *   2.) If prealloc returns ENOSPC, the file range is still zero-valued by
1317  *	 virtue of the hole punch.
1318  */
static int
xfs_falloc_zero_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		blksize = i_blocksize(inode);
	loff_t			new_size = 0;
	int			error;

	trace_xfs_zero_file_space(ip);

	/* Compute and validate a possible new EOF before touching anything. */
	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	if (xfs_falloc_force_zero(ip, ac)) {
		/* Error injection path: physically zero for test coverage. */
		error = xfs_zero_range(ip, offset, len, ac, NULL);
	} else {
		error = xfs_free_file_space(ip, offset, len, ac);
		if (error)
			return error;

		/*
		 * Round the preallocation out to block granularity; the hole
		 * punch above already zeroed the partial blocks at each end.
		 */
		len = round_up(offset + len, blksize) -
			round_down(offset, blksize);
		offset = round_down(offset, blksize);
		error = xfs_alloc_file_space(ip, offset, len);
	}
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}
1355 
1356 static int
1357 xfs_falloc_unshare_range(
1358 	struct file		*file,
1359 	int			mode,
1360 	loff_t			offset,
1361 	loff_t			len)
1362 {
1363 	struct inode		*inode = file_inode(file);
1364 	loff_t			new_size = 0;
1365 	int			error;
1366 
1367 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1368 	if (error)
1369 		return error;
1370 
1371 	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
1372 	if (error)
1373 		return error;
1374 
1375 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1376 	if (error)
1377 		return error;
1378 	return xfs_falloc_setsize(file, new_size);
1379 }
1380 
1381 static int
1382 xfs_falloc_allocate_range(
1383 	struct file		*file,
1384 	int			mode,
1385 	loff_t			offset,
1386 	loff_t			len)
1387 {
1388 	struct inode		*inode = file_inode(file);
1389 	loff_t			new_size = 0;
1390 	int			error;
1391 
1392 	/*
1393 	 * If always_cow mode we can't use preallocations and thus should not
1394 	 * create them.
1395 	 */
1396 	if (xfs_is_always_cow_inode(XFS_I(inode)))
1397 		return -EOPNOTSUPP;
1398 
1399 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1400 	if (error)
1401 		return error;
1402 
1403 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1404 	if (error)
1405 		return error;
1406 	return xfs_falloc_setsize(file, new_size);
1407 }
1408 
1409 #define	XFS_FALLOC_FL_SUPPORTED						\
1410 		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
1411 		 FALLOC_FL_PUNCH_HOLE |	FALLOC_FL_COLLAPSE_RANGE |	\
1412 		 FALLOC_FL_ZERO_RANGE |	FALLOC_FL_INSERT_RANGE |	\
1413 		 FALLOC_FL_UNSHARE_RANGE)
1414 
/*
 * Common fallocate implementation.
 *
 * Called with no locks held.  Takes the iolock and mmaplock exclusively,
 * breaks file layouts (BREAK_UNMAP) and drains in-flight direct I/O before
 * dispatching on the requested mode.  @ac carries the zoned space
 * reservation made by xfs_file_zoned_fallocate, or NULL on non-zoned
 * file systems.
 */
STATIC long
__xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
	if (error)
		goto out_unlock;

	/*
	 * Must wait for all AIO to complete before we continue as AIO can
	 * change the file size on completion without holding any locks we
	 * currently hold. We must do this first because AIO can update both
	 * the on disk and in memory inode sizes, and the operations that follow
	 * require the in-memory size to be fully up-to-date.
	 */
	inode_dio_wait(inode);

	error = file_modified(file);
	if (error)
		goto out_unlock;

	switch (mode & FALLOC_FL_MODE_MASK) {
	case FALLOC_FL_PUNCH_HOLE:
		error = xfs_free_file_space(ip, offset, len, ac);
		break;
	case FALLOC_FL_COLLAPSE_RANGE:
		error = xfs_falloc_collapse_range(file, offset, len, ac);
		break;
	case FALLOC_FL_INSERT_RANGE:
		error = xfs_falloc_insert_range(file, offset, len);
		break;
	case FALLOC_FL_ZERO_RANGE:
		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
		break;
	case FALLOC_FL_UNSHARE_RANGE:
		error = xfs_falloc_unshare_range(file, mode, offset, len);
		break;
	case FALLOC_FL_ALLOCATE_RANGE:
		error = xfs_falloc_allocate_range(file, mode, offset, len);
		break;
	default:
		error = -EOPNOTSUPP;
		break;
	}

	/* Honor O_SYNC/O_DSYNC/wsync semantics for the metadata changes. */
	if (!error && xfs_file_sync_writes(file))
		error = xfs_log_force_inode(ip);

out_unlock:
	xfs_iunlock(ip, iolock);
	return error;
}
1477 
/*
 * Fallocate wrapper for zoned file systems: acquire the out-of-place-write
 * space reservation needed for zeroing around the affected range before
 * calling into the common fallocate code.
 */
static long
xfs_file_zoned_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct xfs_zone_alloc_ctx ac = { };
	struct xfs_inode	*ip = XFS_I(file_inode(file));
	struct xfs_mount	*mp = ip->i_mount;
	xfs_filblks_t		count_fsb;
	int			error;

	/*
	 * If full zeroing is forced by the error injection knob, we need a
	 * space reservation that covers the entire range.  See the comment in
	 * xfs_zoned_write_space_reserve for the rationale for the calculation.
	 * Otherwise just reserve space for the two boundary blocks.
	 */
	count_fsb = XFS_ZONED_ZERO_EDGE_SPACE_RES;
	if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ZERO_RANGE &&
	    XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_ZERO_RANGE))
		count_fsb += XFS_B_TO_FSB(mp, len) + 1;

	/* XFS_ZR_RESERVED: dip into the reserved pool, see xfs_file_fallocate. */
	error = xfs_zoned_space_reserve(mp, count_fsb, XFS_ZR_RESERVED, &ac);
	if (error)
		return error;
	error = __xfs_file_fallocate(file, mode, offset, len, &ac);
	xfs_zoned_space_unreserve(mp, &ac);
	return error;
}
1509 
1510 static long
1511 xfs_file_fallocate(
1512 	struct file		*file,
1513 	int			mode,
1514 	loff_t			offset,
1515 	loff_t			len)
1516 {
1517 	struct inode		*inode = file_inode(file);
1518 
1519 	if (!S_ISREG(inode->i_mode))
1520 		return -EINVAL;
1521 	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
1522 		return -EOPNOTSUPP;
1523 
1524 	/*
1525 	 * For zoned file systems, zeroing the first and last block of a hole
1526 	 * punch requires allocating a new block to rewrite the remaining data
1527 	 * and new zeroes out of place.  Get a reservations for those before
1528 	 * taking the iolock.  Dip into the reserved pool because we are
1529 	 * expected to be able to punch a hole even on a completely full
1530 	 * file system.
1531 	 */
1532 	if (xfs_is_zoned_inode(XFS_I(inode)) &&
1533 	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
1534 		     FALLOC_FL_COLLAPSE_RANGE)))
1535 		return xfs_file_zoned_fallocate(file, mode, offset, len);
1536 	return __xfs_file_fallocate(file, mode, offset, len, NULL);
1537 }
1538 
1539 STATIC int
1540 xfs_file_fadvise(
1541 	struct file	*file,
1542 	loff_t		start,
1543 	loff_t		end,
1544 	int		advice)
1545 {
1546 	struct xfs_inode *ip = XFS_I(file_inode(file));
1547 	int ret;
1548 	int lockflags = 0;
1549 
1550 	/*
1551 	 * Operations creating pages in page cache need protection from hole
1552 	 * punching and similar ops
1553 	 */
1554 	if (advice == POSIX_FADV_WILLNEED) {
1555 		lockflags = XFS_IOLOCK_SHARED;
1556 		xfs_ilock(ip, lockflags);
1557 	}
1558 	ret = generic_fadvise(file, start, end, advice);
1559 	if (lockflags)
1560 		xfs_iunlock(ip, lockflags);
1561 	return ret;
1562 }
1563 
/*
 * ->remap_file_range: clone (reflink) or deduplicate blocks from @file_in
 * to @file_out.  Returns the number of bytes remapped, or a negative errno.
 */
STATIC loff_t
xfs_file_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	loff_t			remapped = 0;
	xfs_extlen_t		cowextsize;
	int			ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (!xfs_has_reflink(mp))
		return -EOPNOTSUPP;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* Prepare and then clone file data. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, remap_flags);
	if (ret || len == 0)
		return ret;

	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
			&remapped);
	if (ret)
		goto out_unlock;

	/*
	 * Carry the cowextsize hint from src to dest if we're sharing the
	 * entire source file to the entire destination file, the source file
	 * has a cowextsize hint, and the destination file does not.
	 */
	cowextsize = 0;
	if (pos_in == 0 && len == i_size_read(inode_in) &&
	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
	    pos_out == 0 && len >= i_size_read(inode_out) &&
	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
		cowextsize = src->i_cowextsize;

	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
			remap_flags);
	if (ret)
		goto out_unlock;

	/* Honor sync semantics if either file requires them. */
	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
		xfs_log_force_inode(dest);
out_unlock:
	/* Drops the locks taken by xfs_reflink_remap_prep. */
	xfs_iunlock2_remapping(src, dest);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	/*
	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
	 * handle partial results -- either the whole remap succeeds, or we
	 * must say why it did not.  In this case, any error should be returned
	 * to the caller.
	 */
	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
		return ret;
	return remapped > 0 ? remapped : ret;
}
1637 
1638 STATIC int
1639 xfs_file_open(
1640 	struct inode	*inode,
1641 	struct file	*file)
1642 {
1643 	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1644 		return -EIO;
1645 	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
1646 	if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
1647 		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
1648 	return generic_file_open(inode, file);
1649 }
1650 
1651 STATIC int
1652 xfs_dir_open(
1653 	struct inode	*inode,
1654 	struct file	*file)
1655 {
1656 	struct xfs_inode *ip = XFS_I(inode);
1657 	unsigned int	mode;
1658 	int		error;
1659 
1660 	if (xfs_is_shutdown(ip->i_mount))
1661 		return -EIO;
1662 	error = generic_file_open(inode, file);
1663 	if (error)
1664 		return error;
1665 
1666 	/*
1667 	 * If there are any blocks, read-ahead block 0 as we're almost
1668 	 * certain to have the next operation be a read there.
1669 	 */
1670 	mode = xfs_ilock_data_map_shared(ip);
1671 	if (ip->i_df.if_nextents > 0)
1672 		error = xfs_dir3_data_readahead(ip, 0, 0);
1673 	xfs_iunlock(ip, mode);
1674 	return error;
1675 }
1676 
1677 /*
1678  * Don't bother propagating errors.  We're just doing cleanup, and the caller
1679  * ignores the return value anyway.
1680  */
STATIC int
xfs_file_release(
	struct inode		*inode,
	struct file		*file)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	/*
	 * If this is a read-only mount or the file system has been shut down,
	 * don't generate I/O.
	 */
	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
		return 0;

	/*
	 * If we previously truncated this file and removed old data in the
	 * process, we want to initiate "early" writeout on the last close.
	 * This is an attempt to combat the notorious NULL files problem which
	 * is particularly noticeable from a truncate down, buffered (re-)write
	 * (delalloc), followed by a crash.  What we are effectively doing here
	 * is significantly reducing the time window where we'd otherwise be
	 * exposed to that problem.
	 */
	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
		/* Re-arm the one-shot eofblocks trim below after a truncate. */
		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
		if (ip->i_delayed_blks > 0)
			filemap_flush(inode->i_mapping);
	}

	/*
	 * XFS aggressively preallocates post-EOF space to generate contiguous
	 * allocations for writers that append to the end of the file.
	 *
	 * To support workloads that close and reopen the file frequently, these
	 * preallocations usually persist after a close unless it is the first
	 * close for the inode.  This is a tradeoff to generate tightly packed
	 * data layouts for unpacking tarballs or similar archives that write
	 * one file after another without going back to it while keeping the
	 * preallocation for files that have recurring open/write/close cycles.
	 *
	 * This heuristic is skipped for inodes with the append-only flag as
	 * that flag is rather pointless for inodes written only once.
	 *
	 * There is no point in freeing blocks here for open but unlinked files
	 * as they will be taken care of by the inactivation path soon.
	 *
	 * When releasing a read-only context, don't flush data or trim post-EOF
	 * blocks.  This avoids open/read/close workloads from removing EOF
	 * blocks that other writers depend upon to reduce fragmentation.
	 *
	 * Inodes on the zoned RT device never have preallocations, so skip
	 * taking the locks below.
	 */
	if (!inode->i_nlink ||
	    !(file->f_mode & FMODE_WRITE) ||
	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
	    xfs_is_zoned_inode(ip))
		return 0;

	/*
	 * If we can't get the iolock just skip truncating the blocks past EOF
	 * because we could deadlock with the mmap_lock otherwise. We'll get
	 * another chance to drop them once the last reference to the inode is
	 * dropped, so we'll never leak blocks permanently.
	 */
	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		/* test_and_set makes the trim one-shot under concurrent closes. */
		if (xfs_can_free_eofblocks(ip) &&
		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
			xfs_free_eofblocks(ip);
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	}

	return 0;
}
1757 
1758 STATIC int
1759 xfs_file_readdir(
1760 	struct file	*file,
1761 	struct dir_context *ctx)
1762 {
1763 	struct inode	*inode = file_inode(file);
1764 	xfs_inode_t	*ip = XFS_I(inode);
1765 	size_t		bufsize;
1766 
1767 	/*
1768 	 * The Linux API doesn't pass down the total size of the buffer
1769 	 * we read into down to the filesystem.  With the filldir concept
1770 	 * it's not needed for correct information, but the XFS dir2 leaf
1771 	 * code wants an estimate of the buffer size to calculate it's
1772 	 * readahead window and size the buffers used for mapping to
1773 	 * physical blocks.
1774 	 *
1775 	 * Try to give it an estimate that's good enough, maybe at some
1776 	 * point we can change the ->readdir prototype to include the
1777 	 * buffer size.  For now we use the current glibc buffer size.
1778 	 */
1779 	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1780 
1781 	return xfs_readdir(NULL, ip, ctx, bufsize);
1782 }
1783 
1784 STATIC loff_t
1785 xfs_file_llseek(
1786 	struct file	*file,
1787 	loff_t		offset,
1788 	int		whence)
1789 {
1790 	struct inode		*inode = file->f_mapping->host;
1791 
1792 	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1793 		return -EIO;
1794 
1795 	switch (whence) {
1796 	default:
1797 		return generic_file_llseek(file, offset, whence);
1798 	case SEEK_HOLE:
1799 		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1800 		break;
1801 	case SEEK_DATA:
1802 		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1803 		break;
1804 	}
1805 
1806 	if (offset < 0)
1807 		return offset;
1808 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1809 }
1810 
/*
 * Handle a DAX fault with the mmaplock already held.
 *
 * Write faults that are not CoW faults (no ->cow_page) may need block
 * allocation and hence use the write iomap ops; everything else only needs
 * the read ops.  A VM_FAULT_NEEDDSYNC result means the PFN may only be
 * mapped once metadata has been made durable, which dax_finish_sync_fault
 * takes care of.
 */
static inline vm_fault_t
xfs_dax_fault_locked(
	struct vm_fault		*vmf,
	unsigned int		order,
	bool			write_fault)
{
	vm_fault_t		ret;
	unsigned long		pfn;

	/* Callers should never get here without CONFIG_FS_DAX. */
	if (!IS_ENABLED(CONFIG_FS_DAX)) {
		ASSERT(0);
		return VM_FAULT_SIGBUS;
	}
	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
			(write_fault && !vmf->cow_page) ?
				&xfs_dax_write_iomap_ops :
				&xfs_read_iomap_ops);
	if (ret & VM_FAULT_NEEDDSYNC)
		ret = dax_finish_sync_fault(vmf, order, pfn);
	return ret;
}
1832 
/*
 * Handle a read fault on a DAX file.  Hold the mmaplock shared over the
 * fault to serialise against truncate and similar operations.
 */
static vm_fault_t
xfs_dax_read_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	vm_fault_t		ret;

	trace_xfs_read_fault(ip, order);

	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	ret = xfs_dax_fault_locked(vmf, order, false);
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);

	return ret;
}
1849 
1850 /*
1851  * Locking for serialisation of IO during page faults. This results in a lock
1852  * ordering of:
1853  *
1854  * mmap_lock (MM)
1855  *   sb_start_pagefault(vfs, freeze)
1856  *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1857  *       page_lock (MM)
1858  *         i_lock (XFS - extent map serialisation)
1859  */
/*
 * Common write fault handler; @ac carries the zoned space reservation, or
 * NULL for non-zoned file systems.
 */
static vm_fault_t
__xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
	vm_fault_t		ret;

	trace_xfs_write_fault(ip, order);

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/*
	 * Normally we only need the shared mmaplock, but if a reflink remap is
	 * in progress we take the exclusive lock to wait for the remap to
	 * finish before taking a write fault.
	 */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
		lock_mode = XFS_MMAPLOCK_EXCL;
	}

	if (IS_DAX(inode))
		ret = xfs_dax_fault_locked(vmf, order, true);
	else
		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
				ac);
	xfs_iunlock(ip, lock_mode);

	sb_end_pagefault(inode->i_sb);
	return ret;
}
1898 
/*
 * Write fault handler for zoned files: reserve enough zoned space for the
 * whole faulting folio before dirtying it, then release what is left.
 */
static vm_fault_t
xfs_write_fault_zoned(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	unsigned int		len = folio_size(page_folio(vmf->page));
	struct xfs_zone_alloc_ctx ac = { };
	int			error;
	vm_fault_t		ret;

	/*
	 * This could over-allocate as it doesn't check for truncation.
	 *
	 * But as the overallocation is limited to less than a folio and will
	 * be released instantly that's just fine.
	 */
	error = xfs_zoned_space_reserve(ip->i_mount,
			XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
	if (error < 0)
		return vmf_fs_error(error);
	ret = __xfs_write_fault(vmf, order, &ac);
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	return ret;
}
1924 
1925 static vm_fault_t
1926 xfs_write_fault(
1927 	struct vm_fault		*vmf,
1928 	unsigned int		order)
1929 {
1930 	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
1931 		return xfs_write_fault_zoned(vmf, order);
1932 	return __xfs_write_fault(vmf, order, NULL);
1933 }
1934 
1935 static inline bool
1936 xfs_is_write_fault(
1937 	struct vm_fault		*vmf)
1938 {
1939 	return (vmf->flags & FAULT_FLAG_WRITE) &&
1940 	       (vmf->vma->vm_flags & VM_SHARED);
1941 }
1942 
1943 static vm_fault_t
1944 xfs_filemap_fault(
1945 	struct vm_fault		*vmf)
1946 {
1947 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1948 
1949 	/* DAX can shortcut the normal fault path on write faults! */
1950 	if (IS_DAX(inode)) {
1951 		if (xfs_is_write_fault(vmf))
1952 			return xfs_write_fault(vmf, 0);
1953 		return xfs_dax_read_fault(vmf, 0);
1954 	}
1955 
1956 	trace_xfs_read_fault(XFS_I(inode), 0);
1957 	return filemap_fault(vmf);
1958 }
1959 
1960 static vm_fault_t
1961 xfs_filemap_huge_fault(
1962 	struct vm_fault		*vmf,
1963 	unsigned int		order)
1964 {
1965 	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1966 		return VM_FAULT_FALLBACK;
1967 
1968 	/* DAX can shortcut the normal fault path on write faults! */
1969 	if (xfs_is_write_fault(vmf))
1970 		return xfs_write_fault(vmf, order);
1971 	return xfs_dax_read_fault(vmf, order);
1972 }
1973 
/*
 * ->page_mkwrite: making an existing read-only page writable is handled
 * exactly like a regular write fault.
 */
static vm_fault_t
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}
1980 
/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults. In reality, it needs to serialise against truncate and
 * prepare memory for writing, so handle it as a standard write fault.
 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}
1992 
/* Fault handlers for XFS file mappings, installed by xfs_file_mmap_prepare. */
static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.huge_fault	= xfs_filemap_huge_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};
2000 
2001 STATIC int
2002 xfs_file_mmap_prepare(
2003 	struct vm_area_desc	*desc)
2004 {
2005 	struct file		*file = desc->file;
2006 	struct inode		*inode = file_inode(file);
2007 	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));
2008 
2009 	/*
2010 	 * We don't support synchronous mappings for non-DAX files and
2011 	 * for DAX files if underneath dax_device is not synchronous.
2012 	 */
2013 	if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
2014 				      target->bt_daxdev))
2015 		return -EOPNOTSUPP;
2016 
2017 	file_accessed(file);
2018 	desc->vm_ops = &xfs_file_vm_ops;
2019 	if (IS_DAX(inode))
2020 		desc->vm_flags |= VM_HUGEPAGE;
2021 	return 0;
2022 }
2023 
/* Operations vector for regular XFS files. */
const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= xfs_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap_prepare	= xfs_file_mmap_prepare,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.fadvise	= xfs_file_fadvise,
	.remap_file_range = xfs_file_remap_range,
	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
			  FOP_DONTCACHE,
	.setlease	= generic_setlease,
};
2048 
/* Operations vector for XFS directories. */
const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
	.setlease	= generic_setlease,
};
2061