xref: /linux/fs/xfs/xfs_file.c (revision e445fba2d76369d72b497ecadf6b9787930693d9)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_bmap.h"
17 #include "xfs_bmap_util.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ioctl.h"
21 #include "xfs_trace.h"
22 #include "xfs_log.h"
23 #include "xfs_icache.h"
24 #include "xfs_pnfs.h"
25 #include "xfs_iomap.h"
26 #include "xfs_reflink.h"
27 #include "xfs_file.h"
28 #include "xfs_aops.h"
29 #include "xfs_zone_alloc.h"
30 
31 #include <linux/dax.h>
32 #include <linux/falloc.h>
33 #include <linux/backing-dev.h>
34 #include <linux/mman.h>
35 #include <linux/fadvise.h>
36 #include <linux/mount.h>
37 
38 static const struct vm_operations_struct xfs_file_vm_ops;
39 
40 /*
41  * Decide if the given file range is aligned to the size of the fundamental
42  * allocation unit for the file.
43  */
44 bool
45 xfs_is_falloc_aligned(
46 	struct xfs_inode	*ip,
47 	loff_t			pos,
48 	long long int		len)
49 {
50 	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);
51 
52 	if (!is_power_of_2(alloc_unit))
53 		return isaligned_64(pos, alloc_unit) &&
54 		       isaligned_64(len, alloc_unit);
55 
56 	return !((pos | len) & (alloc_unit - 1));
57 }
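/*
 * For example, with a 4096 byte allocation unit the power-of-two path checks
 * (pos | len) & 4095: pos = 8192, len = 4096 is aligned, while pos = 4096,
 * len = 6144 leaves a remainder and is rejected.
 */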
58 
59 /*
60  * Fsync operations on directories are much simpler than on regular files,
61  * as there is no file data to flush, and thus also no need for explicit
62  * cache flush operations, and there are no non-transaction metadata updates
63  * on directories either.
64  */
65 STATIC int
66 xfs_dir_fsync(
67 	struct file		*file,
68 	loff_t			start,
69 	loff_t			end,
70 	int			datasync)
71 {
72 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
73 
74 	trace_xfs_dir_fsync(ip);
75 	return xfs_log_force_inode(ip);
76 }
77 
78 /*
79  * All metadata updates are logged, which means that we just have to push the
80  * journal to the required sequence number that holds the updates. We track
81  * datasync commits separately to full sync commits, and hence only need to
82  * select the correct sequence number for the log force here.
83  *
84  * We don't have to serialise against concurrent modifications, as we do not
85  * have to wait for modifications that have not yet completed. We define a
86  * transaction commit as completing when the commit sequence number is updated,
87  * hence if the sequence number has not been updated, the sync operation has been
88  * run before the commit completed and we don't have to wait for it.
89  *
90  * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
91  * set on the log item until - at least - the journal flush completes. In
92  * reality, they are only cleared when the inode is fully unpinned (i.e.
93  * persistent in the journal and not dirty in the CIL), and so we rely on
94  * xfs_log_force_seq() either skipping sequences that have been persisted or
95  * waiting on sequences that are still in flight to correctly order concurrent
96  * sync operations.
97  */
98 static int
99 xfs_fsync_flush_log(
100 	struct xfs_inode	*ip,
101 	bool			datasync,
102 	int			*log_flushed)
103 {
104 	struct xfs_inode_log_item *iip = ip->i_itemp;
105 	xfs_csn_t		seq = 0;
106 
107 	spin_lock(&iip->ili_lock);
108 	if (datasync)
109 		seq = iip->ili_datasync_seq;
110 	else
111 		seq = iip->ili_commit_seq;
112 	spin_unlock(&iip->ili_lock);
113 
114 	if (!seq)
115 		return 0;
116 
117 	return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
118 					  log_flushed);
119 }
120 
121 STATIC int
122 xfs_file_fsync(
123 	struct file		*file,
124 	loff_t			start,
125 	loff_t			end,
126 	int			datasync)
127 {
128 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
129 	struct xfs_mount	*mp = ip->i_mount;
130 	int			error, err2;
131 	int			log_flushed = 0;
132 
133 	trace_xfs_file_fsync(ip);
134 
135 	error = file_write_and_wait_range(file, start, end);
136 	if (error)
137 		return error;
138 
139 	if (xfs_is_shutdown(mp))
140 		return -EIO;
141 
142 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
143 
144 	/*
145 	 * If we have an RT and/or log subvolume we need to make sure to flush
146 	 * the write cache of the device used for file data first.  This is to
147 	 * ensure newly written file data makes it to disk before logging the new
148 	 * inode size in case of an extending write.
149 	 */
150 	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
151 		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
152 	else if (mp->m_logdev_targp != mp->m_ddev_targp)
153 		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
154 
155 	/*
156 	 * If the inode has an inode log item attached, it may need the journal
157 	 * flushed to persist any changes the log item might be tracking.
158 	 */
159 	if (ip->i_itemp) {
160 		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
161 		if (err2 && !error)
162 			error = err2;
163 	}
164 
165 	/*
166 	 * If we only have a single device, and the log force above was
167 	 * a no-op, we might have to flush the data device cache here.
168 	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
169 	 * an already allocated file and thus do not have any metadata to
170 	 * commit.
171 	 */
172 	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
173 	    mp->m_logdev_targp == mp->m_ddev_targp) {
174 		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
175 		if (err2 && !error)
176 			error = err2;
177 	}
178 
179 	return error;
180 }
181 
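/*
 * Take the inode iolock in the given mode.  IOCB_NOWAIT callers only trylock
 * and get -EAGAIN back if the lock is contended.
 */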
182 static int
183 xfs_ilock_iocb(
184 	struct kiocb		*iocb,
185 	unsigned int		lock_mode)
186 {
187 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
188 
189 	if (iocb->ki_flags & IOCB_NOWAIT) {
190 		if (!xfs_ilock_nowait(ip, lock_mode))
191 			return -EAGAIN;
192 	} else {
193 		xfs_ilock(ip, lock_mode);
194 	}
195 
196 	return 0;
197 }
198 
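/*
 * Take the iolock for a write, upgrading a shared lock to exclusive if a
 * reflink remap is in progress so the write waits for the remap to finish.
 */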
199 static int
200 xfs_ilock_iocb_for_write(
201 	struct kiocb		*iocb,
202 	unsigned int		*lock_mode)
203 {
204 	ssize_t			ret;
205 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
206 
207 	ret = xfs_ilock_iocb(iocb, *lock_mode);
208 	if (ret)
209 		return ret;
210 
211 	/*
212 	 * If a reflink remap is in progress we always need to take the iolock
213 	 * exclusively to wait for it to finish.
214 	 */
215 	if (*lock_mode == XFS_IOLOCK_SHARED &&
216 	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
217 		xfs_iunlock(ip, *lock_mode);
218 		*lock_mode = XFS_IOLOCK_EXCL;
219 		return xfs_ilock_iocb(iocb, *lock_mode);
220 	}
221 
222 	return 0;
223 }
224 
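/*
 * Direct I/O reads are issued through iomap_dio_rw() under the shared iolock.
 */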
225 STATIC ssize_t
226 xfs_file_dio_read(
227 	struct kiocb		*iocb,
228 	struct iov_iter		*to)
229 {
230 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
231 	ssize_t			ret;
232 
233 	trace_xfs_file_direct_read(iocb, to);
234 
235 	if (!iov_iter_count(to))
236 		return 0; /* skip atime */
237 
238 	file_accessed(iocb->ki_filp);
239 
240 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
241 	if (ret)
242 		return ret;
243 	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
244 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
245 
246 	return ret;
247 }
248 
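/*
 * DAX reads access the backing store directly via dax_iomap_rw() under the
 * shared iolock.
 */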
249 static noinline ssize_t
250 xfs_file_dax_read(
251 	struct kiocb		*iocb,
252 	struct iov_iter		*to)
253 {
254 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
255 	ssize_t			ret = 0;
256 
257 	trace_xfs_file_dax_read(iocb, to);
258 
259 	if (!iov_iter_count(to))
260 		return 0; /* skip atime */
261 
262 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
263 	if (ret)
264 		return ret;
265 	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
266 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
267 
268 	file_accessed(iocb->ki_filp);
269 	return ret;
270 }
271 
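/*
 * Buffered reads go through the generic page cache read path, with the shared
 * iolock providing the usual XFS I/O serialisation.
 */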
272 STATIC ssize_t
273 xfs_file_buffered_read(
274 	struct kiocb		*iocb,
275 	struct iov_iter		*to)
276 {
277 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
278 	ssize_t			ret;
279 
280 	trace_xfs_file_buffered_read(iocb, to);
281 
282 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
283 	if (ret)
284 		return ret;
285 	ret = generic_file_read_iter(iocb, to);
286 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
287 
288 	return ret;
289 }
290 
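/*
 * ->read_iter: dispatch to the DAX, direct or buffered read path and account
 * the bytes read.
 */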
291 STATIC ssize_t
292 xfs_file_read_iter(
293 	struct kiocb		*iocb,
294 	struct iov_iter		*to)
295 {
296 	struct inode		*inode = file_inode(iocb->ki_filp);
297 	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
298 	ssize_t			ret = 0;
299 
300 	XFS_STATS_INC(mp, xs_read_calls);
301 
302 	if (xfs_is_shutdown(mp))
303 		return -EIO;
304 
305 	if (IS_DAX(inode))
306 		ret = xfs_file_dax_read(iocb, to);
307 	else if (iocb->ki_flags & IOCB_DIRECT)
308 		ret = xfs_file_dio_read(iocb, to);
309 	else
310 		ret = xfs_file_buffered_read(iocb, to);
311 
312 	if (ret > 0)
313 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
314 	return ret;
315 }
316 
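/*
 * Splice reads are served from the page cache, so take the shared iolock
 * around filemap_splice_read() just like the buffered read path.
 */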
317 STATIC ssize_t
318 xfs_file_splice_read(
319 	struct file		*in,
320 	loff_t			*ppos,
321 	struct pipe_inode_info	*pipe,
322 	size_t			len,
323 	unsigned int		flags)
324 {
325 	struct inode		*inode = file_inode(in);
326 	struct xfs_inode	*ip = XFS_I(inode);
327 	struct xfs_mount	*mp = ip->i_mount;
328 	ssize_t			ret = 0;
329 
330 	XFS_STATS_INC(mp, xs_read_calls);
331 
332 	if (xfs_is_shutdown(mp))
333 		return -EIO;
334 
335 	trace_xfs_file_splice_read(ip, *ppos, len);
336 
337 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
338 	ret = filemap_splice_read(in, ppos, pipe, len, flags);
339 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
340 	if (ret > 0)
341 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
342 	return ret;
343 }
344 
345 /*
346  * Take care of zeroing post-EOF blocks when they might exist.
347  *
348  * Returns 0 if successful, a negative error on failure, or 1 if this
349  * function dropped the iolock and reacquired it exclusively and the caller
350  * needs to restart the write sanity checks.
351  */
352 static ssize_t
353 xfs_file_write_zero_eof(
354 	struct kiocb		*iocb,
355 	struct iov_iter		*from,
356 	unsigned int		*iolock,
357 	size_t			count,
358 	bool			*drained_dio,
359 	struct xfs_zone_alloc_ctx *ac)
360 {
361 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
362 	loff_t			isize;
363 	int			error;
364 
365 	/*
366 	 * We need to serialise against EOF updates that occur in IO completions
367 	 * here. We want to make sure that nobody is changing the size while
368 	 * we do this check until we have placed an IO barrier (i.e. hold
369 	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
370 	 * spinlock effectively forms a memory barrier once we have
371 	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
372 	 * hence be able to correctly determine if we need to run zeroing.
373 	 */
374 	spin_lock(&ip->i_flags_lock);
375 	isize = i_size_read(VFS_I(ip));
376 	if (iocb->ki_pos <= isize) {
377 		spin_unlock(&ip->i_flags_lock);
378 		return 0;
379 	}
380 	spin_unlock(&ip->i_flags_lock);
381 
382 	if (iocb->ki_flags & IOCB_NOWAIT)
383 		return -EAGAIN;
384 
385 	if (!*drained_dio) {
386 		/*
387 		 * If zeroing is needed and we are currently holding the iolock
388 		 * shared, we need to update it to exclusive which implies
389 		 * having to redo all checks before.
390 		 */
391 		if (*iolock == XFS_IOLOCK_SHARED) {
392 			xfs_iunlock(ip, *iolock);
393 			*iolock = XFS_IOLOCK_EXCL;
394 			xfs_ilock(ip, *iolock);
395 			iov_iter_reexpand(from, count);
396 		}
397 
398 		/*
399 		 * We now have an IO submission barrier in place, but AIO can do
400 		 * EOF updates during IO completion and hence we now need to
401 		 * wait for all of them to drain.  Non-AIO DIO will have drained
402 		 * before we are given the XFS_IOLOCK_EXCL, and so for most
403 		 * cases this wait is a no-op.
404 		 */
405 		inode_dio_wait(VFS_I(ip));
406 		*drained_dio = true;
407 		return 1;
408 	}
409 
410 	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
411 
412 	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
413 	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
414 	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
415 
416 	return error;
417 }
418 
419 /*
420  * Common pre-write limit and setup checks.
421  *
422  * Called with the iolock held either shared or exclusive according to
423  * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
424  * if called for a direct write beyond i_size.
425  */
426 STATIC ssize_t
427 xfs_file_write_checks(
428 	struct kiocb		*iocb,
429 	struct iov_iter		*from,
430 	unsigned int		*iolock,
431 	struct xfs_zone_alloc_ctx *ac)
432 {
433 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
434 	size_t			count = iov_iter_count(from);
435 	bool			drained_dio = false;
436 	ssize_t			error;
437 
438 restart:
439 	error = generic_write_checks(iocb, from);
440 	if (error <= 0)
441 		return error;
442 
443 	if (iocb->ki_flags & IOCB_NOWAIT) {
444 		error = break_layout(inode, false);
445 		if (error == -EWOULDBLOCK)
446 			error = -EAGAIN;
447 	} else {
448 		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
449 	}
450 
451 	if (error)
452 		return error;
453 
454 	/*
455 	 * For changing security info in file_remove_privs() we need i_rwsem
456 	 * exclusively.
457 	 */
458 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
459 		xfs_iunlock(XFS_I(inode), *iolock);
460 		*iolock = XFS_IOLOCK_EXCL;
461 		error = xfs_ilock_iocb(iocb, *iolock);
462 		if (error) {
463 			*iolock = 0;
464 			return error;
465 		}
466 		goto restart;
467 	}
468 
469 	/*
470 	 * If the offset is beyond the size of the file, we need to zero all
471 	 * blocks that fall between the existing EOF and the start of this
472 	 * write.
473 	 *
474 	 * We can do an unlocked check for i_size here safely as I/O completion
475 	 * can only extend EOF.  Truncate is locked out at this point, so the
476 	 * EOF can not move backwards, only forwards. Hence we only need to take
477 	 * the slow path when we are at or beyond the current EOF.
478 	 */
479 	if (iocb->ki_pos > i_size_read(inode)) {
480 		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
481 				&drained_dio, ac);
482 		if (error == 1)
483 			goto restart;
484 		if (error)
485 			return error;
486 	}
487 
488 	return kiocb_modified(iocb);
489 }
490 
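/*
 * Reserve zoned device space for a write before taking the iolock.  The
 * reservation is deliberately generous; any excess is returned after the
 * write completes.
 */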
491 static ssize_t
492 xfs_zoned_write_space_reserve(
493 	struct xfs_mount		*mp,
494 	struct kiocb			*iocb,
495 	struct iov_iter			*from,
496 	unsigned int			flags,
497 	struct xfs_zone_alloc_ctx	*ac)
498 {
499 	loff_t				count = iov_iter_count(from);
500 	int				error;
501 
502 	if (iocb->ki_flags & IOCB_NOWAIT)
503 		flags |= XFS_ZR_NOWAIT;
504 
505 	/*
506 	 * Check the rlimit and LFS boundary first so that we don't over-reserve
507 	 * by possibly a lot.
508 	 *
509 	 * The generic write path will redo this check later, and it might have
510 	 * changed by then.  If it got expanded we'll stick to our earlier
511 	 * smaller limit, and if it is decreased the new smaller limit will be
512 	 * used and our extra space reservation will be returned after finishing
513 	 * the write.
514 	 */
515 	error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
516 	if (error)
517 		return error;
518 
519 	/*
520 	 * Sloppily round up count to file system blocks.
521 	 *
522 	 * This will often reserve an extra block, but that avoids having to look
523 	 * at the start offset, which isn't stable for O_APPEND until taking the
524 	 * iolock.  Also we need to reserve a block each for zeroing the old
525 	 * EOF block and the new start block if they are unaligned.
526 	 *
527 	 * Any remaining block will be returned after the write.
528 	 */
529 	return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2,
530 			flags, ac);
531 }
532 
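/*
 * Direct write completion: finish any COW remapping, convert unwritten
 * extents, and update the in-core and on-disk inode size for extending
 * writes.
 */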
533 static int
534 xfs_dio_write_end_io(
535 	struct kiocb		*iocb,
536 	ssize_t			size,
537 	int			error,
538 	unsigned		flags)
539 {
540 	struct inode		*inode = file_inode(iocb->ki_filp);
541 	struct xfs_inode	*ip = XFS_I(inode);
542 	loff_t			offset = iocb->ki_pos;
543 	unsigned int		nofs_flag;
544 
545 	ASSERT(!xfs_is_zoned_inode(ip) ||
546 	       !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
547 
548 	trace_xfs_end_io_direct_write(ip, offset, size);
549 
550 	if (xfs_is_shutdown(ip->i_mount))
551 		return -EIO;
552 
553 	if (error)
554 		return error;
555 	if (!size)
556 		return 0;
557 
558 	/*
559 	 * Capture amount written on completion as we can't reliably account
560 	 * for it on submission.
561 	 */
562 	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
563 
564 	/*
565 	 * We can allocate memory here while doing writeback on behalf of
566 	 * memory reclaim.  To avoid memory allocation deadlocks set the
567 	 * task-wide nofs context for the following operations.
568 	 */
569 	nofs_flag = memalloc_nofs_save();
570 
571 	if (flags & IOMAP_DIO_COW) {
572 		if (iocb->ki_flags & IOCB_ATOMIC)
573 			error = xfs_reflink_end_atomic_cow(ip, offset, size);
574 		else
575 			error = xfs_reflink_end_cow(ip, offset, size);
576 		if (error)
577 			goto out;
578 	}
579 
580 	/*
581 	 * Unwritten conversion updates the in-core isize after extent
582 	 * conversion but before updating the on-disk size. Updating isize any
583 	 * earlier allows a racing dio read to find unwritten extents before
584 	 * they are converted.
585 	 */
586 	if (flags & IOMAP_DIO_UNWRITTEN) {
587 		error = xfs_iomap_write_unwritten(ip, offset, size, true);
588 		goto out;
589 	}
590 
591 	/*
592 	 * We need to update the in-core inode size here so that we don't end up
593 	 * with the on-disk inode size being outside the in-core inode size. We
594 	 * have no other method of updating EOF for AIO, so always do it here
595 	 * if necessary.
596 	 *
597 	 * We need to lock the test/set EOF update as we can be racing with
598 	 * other IO completions here to update the EOF. Failing to serialise
599 	 * here can result in EOF moving backwards and Bad Things Happen when
600 	 * that occurs.
601 	 *
602 	 * As IO completion only ever extends EOF, we can do an unlocked check
603 	 * here to avoid taking the spinlock. If we land within the current EOF,
604 	 * then we do not need to do an extending update at all, and we don't
605 	 * need to take the lock to check this. If we race with an update moving
606 	 * EOF, then we'll either still be beyond EOF and need to take the lock,
607 	 * or we'll be within EOF and we don't need to take it at all.
608 	 */
609 	if (offset + size <= i_size_read(inode))
610 		goto out;
611 
612 	spin_lock(&ip->i_flags_lock);
613 	if (offset + size > i_size_read(inode)) {
614 		i_size_write(inode, offset + size);
615 		spin_unlock(&ip->i_flags_lock);
616 		error = xfs_setfilesize(ip, offset, size);
617 	} else {
618 		spin_unlock(&ip->i_flags_lock);
619 	}
620 
621 out:
622 	memalloc_nofs_restore(nofs_flag);
623 	return error;
624 }
625 
626 static const struct iomap_dio_ops xfs_dio_write_ops = {
627 	.end_io		= xfs_dio_write_end_io,
628 };
629 
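/*
 * Submit a direct write bio to a zoned device, consuming blocks from the
 * caller's space reservation and leaving zone selection to the zone
 * allocator.
 */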
630 static void
631 xfs_dio_zoned_submit_io(
632 	const struct iomap_iter	*iter,
633 	struct bio		*bio,
634 	loff_t			file_offset)
635 {
636 	struct xfs_mount	*mp = XFS_I(iter->inode)->i_mount;
637 	struct xfs_zone_alloc_ctx *ac = iter->private;
638 	xfs_filblks_t		count_fsb;
639 	struct iomap_ioend	*ioend;
640 
641 	count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
642 	if (count_fsb > ac->reserved_blocks) {
643 		xfs_err(mp,
644 "allocation (%lld) larger than reservation (%lld).",
645 			count_fsb, ac->reserved_blocks);
646 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
647 		bio_io_error(bio);
648 		return;
649 	}
650 	ac->reserved_blocks -= count_fsb;
651 
652 	bio->bi_end_io = xfs_end_bio;
653 	ioend = iomap_init_ioend(iter->inode, bio, file_offset,
654 			IOMAP_IOEND_DIRECT);
655 	xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
656 }
657 
658 static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
659 	.bio_set	= &iomap_ioend_bioset,
660 	.submit_io	= xfs_dio_zoned_submit_io,
661 	.end_io		= xfs_dio_write_end_io,
662 };
663 
664 /*
665  * Handle block aligned direct I/O writes.
666  */
667 static noinline ssize_t
668 xfs_file_dio_write_aligned(
669 	struct xfs_inode	*ip,
670 	struct kiocb		*iocb,
671 	struct iov_iter		*from,
672 	const struct iomap_ops	*ops,
673 	const struct iomap_dio_ops *dops,
674 	struct xfs_zone_alloc_ctx *ac)
675 {
676 	unsigned int		iolock = XFS_IOLOCK_SHARED;
677 	ssize_t			ret;
678 
679 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
680 	if (ret)
681 		return ret;
682 	ret = xfs_file_write_checks(iocb, from, &iolock, ac);
683 	if (ret)
684 		goto out_unlock;
685 
686 	/*
687 	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
688 	 * the iolock back to shared if we had to take the exclusive lock in
689 	 * xfs_file_write_checks() for other reasons.
690 	 */
691 	if (iolock == XFS_IOLOCK_EXCL) {
692 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
693 		iolock = XFS_IOLOCK_SHARED;
694 	}
695 	trace_xfs_file_direct_write(iocb, from);
696 	ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
697 out_unlock:
698 	xfs_iunlock(ip, iolock);
699 	return ret;
700 }
701 
702 /*
703  * Handle block aligned direct I/O writes to zoned devices.
704  */
705 static noinline ssize_t
706 xfs_file_dio_write_zoned(
707 	struct xfs_inode	*ip,
708 	struct kiocb		*iocb,
709 	struct iov_iter		*from)
710 {
711 	struct xfs_zone_alloc_ctx ac = { };
712 	ssize_t			ret;
713 
714 	ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac);
715 	if (ret < 0)
716 		return ret;
717 	ret = xfs_file_dio_write_aligned(ip, iocb, from,
718 			&xfs_zoned_direct_write_iomap_ops,
719 			&xfs_dio_zoned_write_ops, &ac);
720 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
721 	return ret;
722 }
723 
724 /*
725  * Handle block atomic writes
726  *
727  * Two methods of atomic writes are supported:
728  * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
729  *   disk
730  * - COW-based, which uses a COW fork as a staging extent for data updates
731  *   before atomically updating extent mappings for the range being written
732  *
733  */
734 static noinline ssize_t
735 xfs_file_dio_write_atomic(
736 	struct xfs_inode	*ip,
737 	struct kiocb		*iocb,
738 	struct iov_iter		*from)
739 {
740 	unsigned int		iolock = XFS_IOLOCK_SHARED;
741 	ssize_t			ret, ocount = iov_iter_count(from);
742 	const struct iomap_ops	*dops;
743 
744 	/*
745 	 * HW offload should be faster, so try that first unless the write
746 	 * length is already known to be too large for it.
747 	 */
748 	if (ocount > xfs_inode_buftarg(ip)->bt_awu_max)
749 		dops = &xfs_atomic_write_cow_iomap_ops;
750 	else
751 		dops = &xfs_direct_write_iomap_ops;
752 
753 retry:
754 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
755 	if (ret)
756 		return ret;
757 
758 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
759 	if (ret)
760 		goto out_unlock;
761 
762 	/* Demote similar to xfs_file_dio_write_aligned() */
763 	if (iolock == XFS_IOLOCK_EXCL) {
764 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
765 		iolock = XFS_IOLOCK_SHARED;
766 	}
767 
768 	trace_xfs_file_direct_write(iocb, from);
769 	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
770 			0, NULL, 0);
771 
772 	/*
773 	 * The retry mechanism is based on the ->iomap_begin method returning
774 	 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
775 	 * possible. The REQ_ATOMIC-based method will typically not be possible if
776 	 * the write spans multiple extents or the disk blocks are misaligned.
777 	 */
778 	if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
779 		xfs_iunlock(ip, iolock);
780 		dops = &xfs_atomic_write_cow_iomap_ops;
781 		goto retry;
782 	}
783 
784 out_unlock:
785 	if (iolock)
786 		xfs_iunlock(ip, iolock);
787 	return ret;
788 }
789 
790 /*
791  * Handle block unaligned direct I/O writes
792  *
793  * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
794  * them to be done in parallel with reads and other direct I/O writes.  However,
795  * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
796  * to do sub-block zeroing and that requires serialisation against other direct
797  * I/O to the same block.  In this case we need to serialise the submission of
798  * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
799  * In the case where sub-block zeroing is not required, we can do concurrent
800  * sub-block dios to the same block successfully.
801  *
802  * Optimistically submit the I/O using the shared lock first, but use the
803  * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
804  * if block allocation or partial block zeroing would be required.  In that case
805  * we try again with the exclusive lock.
806  */
807 static noinline ssize_t
808 xfs_file_dio_write_unaligned(
809 	struct xfs_inode	*ip,
810 	struct kiocb		*iocb,
811 	struct iov_iter		*from)
812 {
813 	size_t			isize = i_size_read(VFS_I(ip));
814 	size_t			count = iov_iter_count(from);
815 	unsigned int		iolock = XFS_IOLOCK_SHARED;
816 	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
817 	ssize_t			ret;
818 
819 	/*
820 	 * Extending writes need exclusivity because of the sub-block zeroing
821 	 * that the DIO code always does for partial tail blocks beyond EOF, so
822 	 * don't even bother trying the fast path in this case.
823 	 */
824 	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
825 		if (iocb->ki_flags & IOCB_NOWAIT)
826 			return -EAGAIN;
827 retry_exclusive:
828 		iolock = XFS_IOLOCK_EXCL;
829 		flags = IOMAP_DIO_FORCE_WAIT;
830 	}
831 
832 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
833 	if (ret)
834 		return ret;
835 
836 	/*
837 	 * We can't properly handle unaligned direct I/O to reflink files yet,
838 	 * as we can't unshare a partial block.
839 	 */
840 	if (xfs_is_cow_inode(ip)) {
841 		trace_xfs_reflink_bounce_dio_write(iocb, from);
842 		ret = -ENOTBLK;
843 		goto out_unlock;
844 	}
845 
846 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
847 	if (ret)
848 		goto out_unlock;
849 
850 	/*
851 	 * If we are doing exclusive unaligned I/O, this must be the only I/O
852 	 * in-flight.  Otherwise we risk data corruption due to unwritten extent
853 	 * conversions from the AIO end_io handler.  Wait for all other I/O to
854 	 * drain first.
855 	 */
856 	if (flags & IOMAP_DIO_FORCE_WAIT)
857 		inode_dio_wait(VFS_I(ip));
858 
859 	trace_xfs_file_direct_write(iocb, from);
860 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
861 			   &xfs_dio_write_ops, flags, NULL, 0);
862 
863 	/*
864 	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
865 	 * layer rejected it for mapping or locking reasons. If we are doing
866 	 * nonblocking user I/O, propagate the error.
867 	 */
868 	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
869 		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
870 		xfs_iunlock(ip, iolock);
871 		goto retry_exclusive;
872 	}
873 
874 out_unlock:
875 	if (iolock)
876 		xfs_iunlock(ip, iolock);
877 	return ret;
878 }
879 
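/*
 * Direct write dispatcher: reject I/O that is not logical sector aligned,
 * then hand off to the unaligned, zoned, atomic or block-aligned write path.
 */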
880 static ssize_t
881 xfs_file_dio_write(
882 	struct kiocb		*iocb,
883 	struct iov_iter		*from)
884 {
885 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
886 	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
887 	size_t			count = iov_iter_count(from);
888 
889 	/* direct I/O must be aligned to device logical sector size */
890 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
891 		return -EINVAL;
892 
893 	/*
894 	 * For always COW inodes we also must check the alignment of each
895 	 * individual iovec segment, as they could end up with different
896 	 * I/Os due to the way bio_iov_iter_get_pages works, and we'd
897 	 * then overwrite an already written block.
898 	 */
899 	if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
900 	    (xfs_is_always_cow_inode(ip) &&
901 	     (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
902 		return xfs_file_dio_write_unaligned(ip, iocb, from);
903 	if (xfs_is_zoned_inode(ip))
904 		return xfs_file_dio_write_zoned(ip, iocb, from);
905 	if (iocb->ki_flags & IOCB_ATOMIC)
906 		return xfs_file_dio_write_atomic(ip, iocb, from);
907 	return xfs_file_dio_write_aligned(ip, iocb, from,
908 			&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
909 }
910 
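/*
 * DAX writes are done under the exclusive iolock and update the on-disk file
 * size directly if the write extended the file.
 */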
911 static noinline ssize_t
912 xfs_file_dax_write(
913 	struct kiocb		*iocb,
914 	struct iov_iter		*from)
915 {
916 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
917 	struct xfs_inode	*ip = XFS_I(inode);
918 	unsigned int		iolock = XFS_IOLOCK_EXCL;
919 	ssize_t			ret, error = 0;
920 	loff_t			pos;
921 
922 	ret = xfs_ilock_iocb(iocb, iolock);
923 	if (ret)
924 		return ret;
925 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
926 	if (ret)
927 		goto out;
928 
929 	pos = iocb->ki_pos;
930 
931 	trace_xfs_file_dax_write(iocb, from);
932 	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
933 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
934 		i_size_write(inode, iocb->ki_pos);
935 		error = xfs_setfilesize(ip, pos, ret);
936 	}
937 out:
938 	if (iolock)
939 		xfs_iunlock(ip, iolock);
940 	if (error)
941 		return error;
942 
943 	if (ret > 0) {
944 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
945 
946 		/* Handle various SYNC-type writes */
947 		ret = generic_write_sync(iocb, ret);
948 	}
949 	return ret;
950 }
951 
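/*
 * Buffered writes take the exclusive iolock.  On -EDQUOT or -ENOSPC we try
 * once to free up preallocated and speculative space, then retry the write.
 */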
952 STATIC ssize_t
953 xfs_file_buffered_write(
954 	struct kiocb		*iocb,
955 	struct iov_iter		*from)
956 {
957 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
958 	struct xfs_inode	*ip = XFS_I(inode);
959 	ssize_t			ret;
960 	bool			cleared_space = false;
961 	unsigned int		iolock;
962 
963 write_retry:
964 	iolock = XFS_IOLOCK_EXCL;
965 	ret = xfs_ilock_iocb(iocb, iolock);
966 	if (ret)
967 		return ret;
968 
969 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
970 	if (ret)
971 		goto out;
972 
973 	trace_xfs_file_buffered_write(iocb, from);
974 	ret = iomap_file_buffered_write(iocb, from,
975 			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
976 			NULL);
977 
978 	/*
979 	 * If we hit a space limit, try to free up some lingering preallocated
980 	 * space before returning an error. In the case of ENOSPC, first try to
981 	 * write back all dirty inodes to free up some of the excess reserved
982 	 * metadata space. This reduces the chances that the eofblocks scan
983 	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
984 	 * also behaves as a filter to prevent too many eofblocks scans from
985 	 * running at the same time.  Use a synchronous scan to increase the
986 	 * effectiveness of the scan.
987 	 */
988 	if (ret == -EDQUOT && !cleared_space) {
989 		xfs_iunlock(ip, iolock);
990 		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
991 		cleared_space = true;
992 		goto write_retry;
993 	} else if (ret == -ENOSPC && !cleared_space) {
994 		struct xfs_icwalk	icw = {0};
995 
996 		cleared_space = true;
997 		xfs_flush_inodes(ip->i_mount);
998 
999 		xfs_iunlock(ip, iolock);
1000 		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
1001 		xfs_blockgc_free_space(ip->i_mount, &icw);
1002 		goto write_retry;
1003 	}
1004 
1005 out:
1006 	if (iolock)
1007 		xfs_iunlock(ip, iolock);
1008 
1009 	if (ret > 0) {
1010 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
1011 		/* Handle various SYNC-type writes */
1012 		ret = generic_write_sync(iocb, ret);
1013 	}
1014 	return ret;
1015 }
1016 
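/*
 * Buffered writes to zoned devices additionally need an up front space
 * reservation; the iov_iter is truncated to the blocks actually reserved.
 */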
1017 STATIC ssize_t
1018 xfs_file_buffered_write_zoned(
1019 	struct kiocb		*iocb,
1020 	struct iov_iter		*from)
1021 {
1022 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
1023 	struct xfs_mount	*mp = ip->i_mount;
1024 	unsigned int		iolock = XFS_IOLOCK_EXCL;
1025 	bool			cleared_space = false;
1026 	struct xfs_zone_alloc_ctx ac = { };
1027 	ssize_t			ret;
1028 
1029 	ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
1030 	if (ret < 0)
1031 		return ret;
1032 
1033 	ret = xfs_ilock_iocb(iocb, iolock);
1034 	if (ret)
1035 		goto out_unreserve;
1036 
1037 	ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
1038 	if (ret)
1039 		goto out_unlock;
1040 
1041 	/*
1042 	 * Truncate the iter to the length that we were actually able to
1043 	 * allocate blocks for.  This needs to happen after
1044 	 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
1045 	 * writes.
1046 	 */
1047 	iov_iter_truncate(from,
1048 			XFS_FSB_TO_B(mp, ac.reserved_blocks) -
1049 			(iocb->ki_pos & mp->m_blockmask));
1050 	if (!iov_iter_count(from))
1051 		goto out_unlock;
1052 
1053 retry:
1054 	trace_xfs_file_buffered_write(iocb, from);
1055 	ret = iomap_file_buffered_write(iocb, from,
1056 			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
1057 			&ac);
1058 	if (ret == -ENOSPC && !cleared_space) {
1059 		/*
1060 		 * Kick off writeback to convert delalloc space and release the
1061 		 * usually too pessimistic indirect block reservations.
1062 		 */
1063 		xfs_flush_inodes(mp);
1064 		cleared_space = true;
1065 		goto retry;
1066 	}
1067 
1068 out_unlock:
1069 	xfs_iunlock(ip, iolock);
1070 out_unreserve:
1071 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
1072 	if (ret > 0) {
1073 		XFS_STATS_ADD(mp, xs_write_bytes, ret);
1074 		ret = generic_write_sync(iocb, ret);
1075 	}
1076 	return ret;
1077 }
1078 
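/*
 * ->write_iter: validate atomic write constraints and dispatch to the DAX,
 * direct or buffered write path.  A direct write returning -ENOTBLK (reflink
 * CoW) falls back to buffered I/O.
 */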
1079 STATIC ssize_t
1080 xfs_file_write_iter(
1081 	struct kiocb		*iocb,
1082 	struct iov_iter		*from)
1083 {
1084 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
1085 	struct xfs_inode	*ip = XFS_I(inode);
1086 	ssize_t			ret;
1087 	size_t			ocount = iov_iter_count(from);
1088 
1089 	XFS_STATS_INC(ip->i_mount, xs_write_calls);
1090 
1091 	if (ocount == 0)
1092 		return 0;
1093 
1094 	if (xfs_is_shutdown(ip->i_mount))
1095 		return -EIO;
1096 
1097 	if (iocb->ki_flags & IOCB_ATOMIC) {
1098 		if (ocount < xfs_get_atomic_write_min(ip))
1099 			return -EINVAL;
1100 
1101 		if (ocount > xfs_get_atomic_write_max(ip))
1102 			return -EINVAL;
1103 
1104 		ret = generic_atomic_write_valid(iocb, from);
1105 		if (ret)
1106 			return ret;
1107 	}
1108 
1109 	if (IS_DAX(inode))
1110 		return xfs_file_dax_write(iocb, from);
1111 
1112 	if (iocb->ki_flags & IOCB_DIRECT) {
1113 		/*
1114 		 * Allow a directio write to fall back to a buffered
1115 		 * write *only* in the case that we're doing a reflink
1116 		 * CoW.  In all other directio scenarios we do not
1117 		 * allow an operation to fall back to buffered mode.
1118 		 */
1119 		ret = xfs_file_dio_write(iocb, from);
1120 		if (ret != -ENOTBLK)
1121 			return ret;
1122 	}
1123 
1124 	if (xfs_is_zoned_inode(ip))
1125 		return xfs_file_buffered_write_zoned(iocb, from);
1126 	return xfs_file_buffered_write(iocb, from);
1127 }
1128 
1129 /* Does this file, inode, or mount want synchronous writes? */
1130 static inline bool xfs_file_sync_writes(struct file *filp)
1131 {
1132 	struct xfs_inode	*ip = XFS_I(file_inode(filp));
1133 
1134 	if (xfs_has_wsync(ip->i_mount))
1135 		return true;
1136 	if (filp->f_flags & (__O_SYNC | O_DSYNC))
1137 		return true;
1138 	if (IS_SYNC(file_inode(filp)))
1139 		return true;
1140 
1141 	return false;
1142 }
1143 
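/*
 * Work out whether a fallocate request extends i_size and, if so, validate
 * the new size against the configured limits.
 */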
1144 static int
1145 xfs_falloc_newsize(
1146 	struct file		*file,
1147 	int			mode,
1148 	loff_t			offset,
1149 	loff_t			len,
1150 	loff_t			*new_size)
1151 {
1152 	struct inode		*inode = file_inode(file);
1153 
1154 	if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
1155 		return 0;
1156 	*new_size = offset + len;
1157 	return inode_newsize_ok(inode, *new_size);
1158 }
1159 
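/* Update the file size after a size-changing fallocate operation. */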
1160 static int
1161 xfs_falloc_setsize(
1162 	struct file		*file,
1163 	loff_t			new_size)
1164 {
1165 	struct iattr iattr = {
1166 		.ia_valid	= ATTR_SIZE,
1167 		.ia_size	= new_size,
1168 	};
1169 
1170 	if (!new_size)
1171 		return 0;
1172 	return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
1173 			&iattr);
1174 }
1175 
1176 static int
1177 xfs_falloc_collapse_range(
1178 	struct file		*file,
1179 	loff_t			offset,
1180 	loff_t			len,
1181 	struct xfs_zone_alloc_ctx *ac)
1182 {
1183 	struct inode		*inode = file_inode(file);
1184 	loff_t			new_size = i_size_read(inode) - len;
1185 	int			error;
1186 
1187 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1188 		return -EINVAL;
1189 
1190 	/*
1191 	 * The collapse range must end before EOF; collapsing up to or past
1192 	 * EOF would effectively be a truncate operation.
1193 	 */
1194 	if (offset + len >= i_size_read(inode))
1195 		return -EINVAL;
1196 
1197 	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
1198 	if (error)
1199 		return error;
1200 	return xfs_falloc_setsize(file, new_size);
1201 }
1202 
1203 static int
1204 xfs_falloc_insert_range(
1205 	struct file		*file,
1206 	loff_t			offset,
1207 	loff_t			len)
1208 {
1209 	struct inode		*inode = file_inode(file);
1210 	loff_t			isize = i_size_read(inode);
1211 	int			error;
1212 
1213 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1214 		return -EINVAL;
1215 
1216 	/*
1217 	 * New inode size must not exceed ->s_maxbytes, accounting for
1218 	 * possible signed overflow.
1219 	 */
1220 	if (inode->i_sb->s_maxbytes - isize < len)
1221 		return -EFBIG;
1222 
1223 	/* Offset should be less than i_size */
1224 	if (offset >= isize)
1225 		return -EINVAL;
1226 
1227 	error = xfs_falloc_setsize(file, isize + len);
1228 	if (error)
1229 		return error;
1230 
1231 	/*
1232 	 * Perform hole insertion now that the file size has been updated so
1233 	 * that if we crash during the operation we don't leave shifted extents
1234 	 * past EOF and hence lose access to the data that is contained within
1235 	 * them.
1236 	 */
1237 	return xfs_insert_file_space(XFS_I(inode), offset, len);
1238 }
1239 
1240 /*
1241  * Punch a hole and prealloc the range.  We use a hole punch rather than
1242  * unwritten extent conversion for two reasons:
1243  *
1244  *   1.) Hole punch handles partial block zeroing for us.
1245  *   2.) If prealloc returns ENOSPC, the file range is still zero-valued by
1246  *	 virtue of the hole punch.
1247  */
1248 static int
1249 xfs_falloc_zero_range(
1250 	struct file		*file,
1251 	int			mode,
1252 	loff_t			offset,
1253 	loff_t			len,
1254 	struct xfs_zone_alloc_ctx *ac)
1255 {
1256 	struct inode		*inode = file_inode(file);
1257 	unsigned int		blksize = i_blocksize(inode);
1258 	loff_t			new_size = 0;
1259 	int			error;
1260 
1261 	trace_xfs_zero_file_space(XFS_I(inode));
1262 
1263 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1264 	if (error)
1265 		return error;
1266 
1267 	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
1268 	if (error)
1269 		return error;
1270 
1271 	len = round_up(offset + len, blksize) - round_down(offset, blksize);
1272 	offset = round_down(offset, blksize);
1273 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1274 	if (error)
1275 		return error;
1276 	return xfs_falloc_setsize(file, new_size);
1277 }
1278 
1279 static int
1280 xfs_falloc_unshare_range(
1281 	struct file		*file,
1282 	int			mode,
1283 	loff_t			offset,
1284 	loff_t			len)
1285 {
1286 	struct inode		*inode = file_inode(file);
1287 	loff_t			new_size = 0;
1288 	int			error;
1289 
1290 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1291 	if (error)
1292 		return error;
1293 
1294 	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
1295 	if (error)
1296 		return error;
1297 
1298 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1299 	if (error)
1300 		return error;
1301 	return xfs_falloc_setsize(file, new_size);
1302 }
1303 
1304 static int
1305 xfs_falloc_allocate_range(
1306 	struct file		*file,
1307 	int			mode,
1308 	loff_t			offset,
1309 	loff_t			len)
1310 {
1311 	struct inode		*inode = file_inode(file);
1312 	loff_t			new_size = 0;
1313 	int			error;
1314 
1315 	/*
1316 	 * If always_cow mode we can't use preallocations and thus should not
1317 	 * create them.
1318 	 */
1319 	if (xfs_is_always_cow_inode(XFS_I(inode)))
1320 		return -EOPNOTSUPP;
1321 
1322 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1323 	if (error)
1324 		return error;
1325 
1326 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1327 	if (error)
1328 		return error;
1329 	return xfs_falloc_setsize(file, new_size);
1330 }
1331 
1332 #define	XFS_FALLOC_FL_SUPPORTED						\
1333 		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
1334 		 FALLOC_FL_PUNCH_HOLE |	FALLOC_FL_COLLAPSE_RANGE |	\
1335 		 FALLOC_FL_ZERO_RANGE |	FALLOC_FL_INSERT_RANGE |	\
1336 		 FALLOC_FL_UNSHARE_RANGE)
1337 
1338 STATIC long
1339 __xfs_file_fallocate(
1340 	struct file		*file,
1341 	int			mode,
1342 	loff_t			offset,
1343 	loff_t			len,
1344 	struct xfs_zone_alloc_ctx *ac)
1345 {
1346 	struct inode		*inode = file_inode(file);
1347 	struct xfs_inode	*ip = XFS_I(inode);
1348 	long			error;
1349 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1350 
1351 	xfs_ilock(ip, iolock);
1352 	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
1353 	if (error)
1354 		goto out_unlock;
1355 
1356 	/*
1357 	 * Must wait for all AIO to complete before we continue as AIO can
1358 	 * change the file size on completion without holding any locks we
1359 	 * currently hold. We must do this first because AIO can update both
1360 	 * the on disk and in memory inode sizes, and the operations that follow
1361 	 * require the in-memory size to be fully up-to-date.
1362 	 */
1363 	inode_dio_wait(inode);
1364 
1365 	error = file_modified(file);
1366 	if (error)
1367 		goto out_unlock;
1368 
1369 	switch (mode & FALLOC_FL_MODE_MASK) {
1370 	case FALLOC_FL_PUNCH_HOLE:
1371 		error = xfs_free_file_space(ip, offset, len, ac);
1372 		break;
1373 	case FALLOC_FL_COLLAPSE_RANGE:
1374 		error = xfs_falloc_collapse_range(file, offset, len, ac);
1375 		break;
1376 	case FALLOC_FL_INSERT_RANGE:
1377 		error = xfs_falloc_insert_range(file, offset, len);
1378 		break;
1379 	case FALLOC_FL_ZERO_RANGE:
1380 		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
1381 		break;
1382 	case FALLOC_FL_UNSHARE_RANGE:
1383 		error = xfs_falloc_unshare_range(file, mode, offset, len);
1384 		break;
1385 	case FALLOC_FL_ALLOCATE_RANGE:
1386 		error = xfs_falloc_allocate_range(file, mode, offset, len);
1387 		break;
1388 	default:
1389 		error = -EOPNOTSUPP;
1390 		break;
1391 	}
1392 
1393 	if (!error && xfs_file_sync_writes(file))
1394 		error = xfs_log_force_inode(ip);
1395 
1396 out_unlock:
1397 	xfs_iunlock(ip, iolock);
1398 	return error;
1399 }
1400 
1401 static long
1402 xfs_file_zoned_fallocate(
1403 	struct file		*file,
1404 	int			mode,
1405 	loff_t			offset,
1406 	loff_t			len)
1407 {
1408 	struct xfs_zone_alloc_ctx ac = { };
1409 	struct xfs_inode	*ip = XFS_I(file_inode(file));
1410 	int			error;
1411 
1412 	error = xfs_zoned_space_reserve(ip->i_mount, 2, XFS_ZR_RESERVED, &ac);
1413 	if (error)
1414 		return error;
1415 	error = __xfs_file_fallocate(file, mode, offset, len, &ac);
1416 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
1417 	return error;
1418 }
1419 
1420 static long
1421 xfs_file_fallocate(
1422 	struct file		*file,
1423 	int			mode,
1424 	loff_t			offset,
1425 	loff_t			len)
1426 {
1427 	struct inode		*inode = file_inode(file);
1428 
1429 	if (!S_ISREG(inode->i_mode))
1430 		return -EINVAL;
1431 	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
1432 		return -EOPNOTSUPP;
1433 
1434 	/*
1435 	 * For zoned file systems, zeroing the first and last block of a hole
1436 	 * punch requires allocating a new block to rewrite the remaining data
1437 	 * and new zeroes out of place.  Get a reservation for those before
1438 	 * taking the iolock.  Dip into the reserved pool because we are
1439 	 * expected to be able to punch a hole even on a completely full
1440 	 * file system.
1441 	 */
1442 	if (xfs_is_zoned_inode(XFS_I(inode)) &&
1443 	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
1444 		     FALLOC_FL_COLLAPSE_RANGE)))
1445 		return xfs_file_zoned_fallocate(file, mode, offset, len);
1446 	return __xfs_file_fallocate(file, mode, offset, len, NULL);
1447 }
1448 
1449 STATIC int
1450 xfs_file_fadvise(
1451 	struct file	*file,
1452 	loff_t		start,
1453 	loff_t		end,
1454 	int		advice)
1455 {
1456 	struct xfs_inode *ip = XFS_I(file_inode(file));
1457 	int ret;
1458 	int lockflags = 0;
1459 
1460 	/*
1461 	 * Operations creating pages in page cache need protection from hole
1462 	 * punching and similar ops
1463 	 */
1464 	if (advice == POSIX_FADV_WILLNEED) {
1465 		lockflags = XFS_IOLOCK_SHARED;
1466 		xfs_ilock(ip, lockflags);
1467 	}
1468 	ret = generic_fadvise(file, start, end, advice);
1469 	if (lockflags)
1470 		xfs_iunlock(ip, lockflags);
1471 	return ret;
1472 }
1473 
1474 STATIC loff_t
1475 xfs_file_remap_range(
1476 	struct file		*file_in,
1477 	loff_t			pos_in,
1478 	struct file		*file_out,
1479 	loff_t			pos_out,
1480 	loff_t			len,
1481 	unsigned int		remap_flags)
1482 {
1483 	struct inode		*inode_in = file_inode(file_in);
1484 	struct xfs_inode	*src = XFS_I(inode_in);
1485 	struct inode		*inode_out = file_inode(file_out);
1486 	struct xfs_inode	*dest = XFS_I(inode_out);
1487 	struct xfs_mount	*mp = src->i_mount;
1488 	loff_t			remapped = 0;
1489 	xfs_extlen_t		cowextsize;
1490 	int			ret;
1491 
1492 	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1493 		return -EINVAL;
1494 
1495 	if (!xfs_has_reflink(mp))
1496 		return -EOPNOTSUPP;
1497 
1498 	if (xfs_is_shutdown(mp))
1499 		return -EIO;
1500 
1501 	/* Prepare and then clone file data. */
1502 	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1503 			&len, remap_flags);
1504 	if (ret || len == 0)
1505 		return ret;
1506 
1507 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1508 
1509 	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1510 			&remapped);
1511 	if (ret)
1512 		goto out_unlock;
1513 
1514 	/*
1515 	 * Carry the cowextsize hint from src to dest if we're sharing the
1516 	 * entire source file to the entire destination file, the source file
1517 	 * has a cowextsize hint, and the destination file does not.
1518 	 */
1519 	cowextsize = 0;
1520 	if (pos_in == 0 && len == i_size_read(inode_in) &&
1521 	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1522 	    pos_out == 0 && len >= i_size_read(inode_out) &&
1523 	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1524 		cowextsize = src->i_cowextsize;
1525 
1526 	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1527 			remap_flags);
1528 	if (ret)
1529 		goto out_unlock;
1530 
1531 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1532 		xfs_log_force_inode(dest);
1533 out_unlock:
1534 	xfs_iunlock2_remapping(src, dest);
1535 	if (ret)
1536 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1537 	/*
1538 	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
1539 	 * handle partial results -- either the whole remap succeeds, or we
1540 	 * must say why it did not.  In this case, any error should be returned
1541 	 * to the caller.
1542 	 */
1543 	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
1544 		return ret;
1545 	return remapped > 0 ? remapped : ret;
1546 }
1547 
1548 STATIC int
1549 xfs_file_open(
1550 	struct inode	*inode,
1551 	struct file	*file)
1552 {
1553 	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1554 		return -EIO;
1555 	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
1556 	if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
1557 		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
1558 	return generic_file_open(inode, file);
1559 }
1560 
1561 STATIC int
1562 xfs_dir_open(
1563 	struct inode	*inode,
1564 	struct file	*file)
1565 {
1566 	struct xfs_inode *ip = XFS_I(inode);
1567 	unsigned int	mode;
1568 	int		error;
1569 
1570 	if (xfs_is_shutdown(ip->i_mount))
1571 		return -EIO;
1572 	error = generic_file_open(inode, file);
1573 	if (error)
1574 		return error;
1575 
1576 	/*
1577 	 * If there are any blocks, read-ahead block 0 as we're almost
1578 	 * certain to have the next operation be a read there.
1579 	 */
1580 	mode = xfs_ilock_data_map_shared(ip);
1581 	if (ip->i_df.if_nextents > 0)
1582 		error = xfs_dir3_data_readahead(ip, 0, 0);
1583 	xfs_iunlock(ip, mode);
1584 	return error;
1585 }
1586 
1587 /*
1588  * Don't bother propagating errors.  We're just doing cleanup, and the caller
1589  * ignores the return value anyway.
1590  */
1591 STATIC int
1592 xfs_file_release(
1593 	struct inode		*inode,
1594 	struct file		*file)
1595 {
1596 	struct xfs_inode	*ip = XFS_I(inode);
1597 	struct xfs_mount	*mp = ip->i_mount;
1598 
1599 	/*
1600 	 * If this is a read-only mount or the file system has been shut down,
1601 	 * don't generate I/O.
1602 	 */
1603 	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
1604 		return 0;
1605 
1606 	/*
1607 	 * If we previously truncated this file and removed old data in the
1608 	 * process, we want to initiate "early" writeout on the last close.
1609 	 * This is an attempt to combat the notorious NULL files problem which
1610 	 * is particularly noticeable from a truncate down, buffered (re-)write
1611 	 * (delalloc), followed by a crash.  What we are effectively doing here
1612 	 * is significantly reducing the time window where we'd otherwise be
1613 	 * exposed to that problem.
1614 	 */
1615 	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
1616 		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
1617 		if (ip->i_delayed_blks > 0)
1618 			filemap_flush(inode->i_mapping);
1619 	}
1620 
1621 	/*
1622 	 * XFS aggressively preallocates post-EOF space to generate contiguous
1623 	 * allocations for writers that append to the end of the file.
1624 	 *
1625 	 * To support workloads that close and reopen the file frequently, these
1626 	 * preallocations usually persist after a close unless it is the first
1627 	 * close for the inode.  This is a tradeoff to generate tightly packed
1628 	 * data layouts for unpacking tarballs or similar archives that write
1629 	 * one file after another without going back to it while keeping the
1630 	 * preallocation for files that have recurring open/write/close cycles.
1631 	 *
1632 	 * This heuristic is skipped for inodes with the append-only flag as
1633 	 * that flag is rather pointless for inodes written only once.
1634 	 *
1635 	 * There is no point in freeing blocks here for open but unlinked files
1636 	 * as they will be taken care of by the inactivation path soon.
1637 	 *
1638 	 * When releasing a read-only context, don't flush data or trim post-EOF
1639 	 * blocks.  This avoids open/read/close workloads from removing EOF
1640 	 * blocks that other writers depend upon to reduce fragmentation.
1641 	 *
1642 	 * Inodes on the zoned RT device never have preallocations, so skip
1643 	 * taking the locks below.
1644 	 */
1645 	if (!inode->i_nlink ||
1646 	    !(file->f_mode & FMODE_WRITE) ||
1647 	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
1648 	    xfs_is_zoned_inode(ip))
1649 		return 0;
1650 
1651 	/*
1652 	 * If we can't get the iolock just skip truncating the blocks past EOF
1653 	 * because we could deadlock with the mmap_lock otherwise. We'll get
1654 	 * another chance to drop them once the last reference to the inode is
1655 	 * dropped, so we'll never leak blocks permanently.
1656 	 */
1657 	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
1658 	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1659 		if (xfs_can_free_eofblocks(ip) &&
1660 		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
1661 			xfs_free_eofblocks(ip);
1662 		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1663 	}
1664 
1665 	return 0;
1666 }
1667 
1668 STATIC int
1669 xfs_file_readdir(
1670 	struct file	*file,
1671 	struct dir_context *ctx)
1672 {
1673 	struct inode	*inode = file_inode(file);
1674 	xfs_inode_t	*ip = XFS_I(inode);
1675 	size_t		bufsize;
1676 
1677 	/*
1678 	 * The Linux API doesn't pass the total size of the buffer
1679 	 * we read into down to the filesystem.  With the filldir concept
1680 	 * it's not needed for correct information, but the XFS dir2 leaf
1681 	 * code wants an estimate of the buffer size to calculate its
1682 	 * readahead window and size the buffers used for mapping to
1683 	 * physical blocks.
1684 	 *
1685 	 * Try to give it an estimate that's good enough, maybe at some
1686 	 * point we can change the ->readdir prototype to include the
1687 	 * buffer size.  For now we use the current glibc buffer size.
1688 	 */
1689 	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1690 
1691 	return xfs_readdir(NULL, ip, ctx, bufsize);
1692 }
1693 
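/*
 * ->llseek: SEEK_HOLE and SEEK_DATA are implemented on top of iomap extent
 * lookups, everything else uses the generic llseek code.
 */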
1694 STATIC loff_t
1695 xfs_file_llseek(
1696 	struct file	*file,
1697 	loff_t		offset,
1698 	int		whence)
1699 {
1700 	struct inode		*inode = file->f_mapping->host;
1701 
1702 	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1703 		return -EIO;
1704 
1705 	switch (whence) {
1706 	default:
1707 		return generic_file_llseek(file, offset, whence);
1708 	case SEEK_HOLE:
1709 		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1710 		break;
1711 	case SEEK_DATA:
1712 		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1713 		break;
1714 	}
1715 
1716 	if (offset < 0)
1717 		return offset;
1718 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1719 }
1720 
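/*
 * Common DAX fault handling, called with the mmap lock already held in the
 * appropriate mode.
 */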
1721 static inline vm_fault_t
1722 xfs_dax_fault_locked(
1723 	struct vm_fault		*vmf,
1724 	unsigned int		order,
1725 	bool			write_fault)
1726 {
1727 	vm_fault_t		ret;
1728 	unsigned long		pfn;
1729 
1730 	if (!IS_ENABLED(CONFIG_FS_DAX)) {
1731 		ASSERT(0);
1732 		return VM_FAULT_SIGBUS;
1733 	}
1734 	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
1735 			(write_fault && !vmf->cow_page) ?
1736 				&xfs_dax_write_iomap_ops :
1737 				&xfs_read_iomap_ops);
1738 	if (ret & VM_FAULT_NEEDDSYNC)
1739 		ret = dax_finish_sync_fault(vmf, order, pfn);
1740 	return ret;
1741 }
1742 
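/*
 * Handle a read fault on a DAX file with the MMAPLOCK held shared.
 */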
1743 static vm_fault_t
1744 xfs_dax_read_fault(
1745 	struct vm_fault		*vmf,
1746 	unsigned int		order)
1747 {
1748 	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
1749 	vm_fault_t		ret;
1750 
1751 	trace_xfs_read_fault(ip, order);
1752 
1753 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1754 	ret = xfs_dax_fault_locked(vmf, order, false);
1755 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1756 
1757 	return ret;
1758 }
1759 
1760 /*
1761  * Locking for serialisation of IO during page faults. This results in a lock
1762  * ordering of:
1763  *
1764  * mmap_lock (MM)
1765  *   sb_start_pagefault(vfs, freeze)
1766  *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1767  *       page_lock (MM)
1768  *         i_lock (XFS - extent map serialisation)
1769  */
1770 static vm_fault_t
1771 __xfs_write_fault(
1772 	struct vm_fault		*vmf,
1773 	unsigned int		order,
1774 	struct xfs_zone_alloc_ctx *ac)
1775 {
1776 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1777 	struct xfs_inode	*ip = XFS_I(inode);
1778 	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
1779 	vm_fault_t		ret;
1780 
1781 	trace_xfs_write_fault(ip, order);
1782 
1783 	sb_start_pagefault(inode->i_sb);
1784 	file_update_time(vmf->vma->vm_file);
1785 
1786 	/*
1787 	 * Normally we only need the shared mmaplock, but if a reflink remap is
1788 	 * in progress we take the exclusive lock to wait for the remap to
1789 	 * finish before taking a write fault.
1790 	 */
1791 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1792 	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
1793 		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1794 		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1795 		lock_mode = XFS_MMAPLOCK_EXCL;
1796 	}
1797 
1798 	if (IS_DAX(inode))
1799 		ret = xfs_dax_fault_locked(vmf, order, true);
1800 	else
1801 		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
1802 				ac);
1803 	xfs_iunlock(ip, lock_mode);
1804 
1805 	sb_end_pagefault(inode->i_sb);
1806 	return ret;
1807 }
1808 
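/*
 * Write faults on zoned inodes reserve space for the zone allocator before
 * entering the common write fault path, and release it again afterwards.
 */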
1809 static vm_fault_t
1810 xfs_write_fault_zoned(
1811 	struct vm_fault		*vmf,
1812 	unsigned int		order)
1813 {
1814 	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
1815 	unsigned int		len = folio_size(page_folio(vmf->page));
1816 	struct xfs_zone_alloc_ctx ac = { };
1817 	int			error;
1818 	vm_fault_t		ret;
1819 
1820 	/*
1821 	 * This could over-allocate as it doesn't check for truncation.
1822 	 *
1823 	 * But as the over-allocation is limited to less than a folio and will
1824 	 * be released instantly, that's just fine.
1825 	 */
1826 	error = xfs_zoned_space_reserve(ip->i_mount,
1827 			XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
1828 	if (error < 0)
1829 		return vmf_fs_error(error);
1830 	ret = __xfs_write_fault(vmf, order, &ac);
1831 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
1832 	return ret;
1833 }
1834 
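/*
 * Dispatch a write fault to either the zoned or the regular handler.
 */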
1835 static vm_fault_t
1836 xfs_write_fault(
1837 	struct vm_fault		*vmf,
1838 	unsigned int		order)
1839 {
1840 	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
1841 		return xfs_write_fault_zoned(vmf, order);
1842 	return __xfs_write_fault(vmf, order, NULL);
1843 }
1844 
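/*
 * Only writes to shared mappings need the write fault path; writes to
 * private mappings are copy-on-write and never dirty the file itself.
 */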
1845 static inline bool
1846 xfs_is_write_fault(
1847 	struct vm_fault		*vmf)
1848 {
1849 	return (vmf->flags & FAULT_FLAG_WRITE) &&
1850 	       (vmf->vma->vm_flags & VM_SHARED);
1851 }
1852 
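/*
 * Handle a page fault.  Non-DAX read faults go through the generic page
 * cache fault path, while DAX faults are dispatched to the DAX read or
 * write fault handlers.
 */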
1853 static vm_fault_t
1854 xfs_filemap_fault(
1855 	struct vm_fault		*vmf)
1856 {
1857 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1858 
1859 	/* DAX can shortcut the normal fault path on write faults! */
1860 	if (IS_DAX(inode)) {
1861 		if (xfs_is_write_fault(vmf))
1862 			return xfs_write_fault(vmf, 0);
1863 		return xfs_dax_read_fault(vmf, 0);
1864 	}
1865 
1866 	trace_xfs_read_fault(XFS_I(inode), 0);
1867 	return filemap_fault(vmf);
1868 }
1869 
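/*
 * Huge page faults are only supported for DAX files; everything else falls
 * back to regular sized faults.
 */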
1870 static vm_fault_t
1871 xfs_filemap_huge_fault(
1872 	struct vm_fault		*vmf,
1873 	unsigned int		order)
1874 {
1875 	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1876 		return VM_FAULT_FALLBACK;
1877 
1878 	/* DAX can shortcut the normal fault path on write faults! */
1879 	if (xfs_is_write_fault(vmf))
1880 		return xfs_write_fault(vmf, order);
1881 	return xfs_dax_read_fault(vmf, order);
1882 }
1883 
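/*
 * A read-only page is about to be made writable, handle this like a regular
 * write fault.
 */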
1884 static vm_fault_t
1885 xfs_filemap_page_mkwrite(
1886 	struct vm_fault		*vmf)
1887 {
1888 	return xfs_write_fault(vmf, 0);
1889 }
1890 
1891 /*
1892  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1893  * on write faults. In reality, it needs to serialise against truncate and
1894  * prepare memory for writing, so handle it as a standard write fault.
1895  */
1896 static vm_fault_t
1897 xfs_filemap_pfn_mkwrite(
1898 	struct vm_fault		*vmf)
1899 {
1900 	return xfs_write_fault(vmf, 0);
1901 }
1902 
1903 static const struct vm_operations_struct xfs_file_vm_ops = {
1904 	.fault		= xfs_filemap_fault,
1905 	.huge_fault	= xfs_filemap_huge_fault,
1906 	.map_pages	= filemap_map_pages,
1907 	.page_mkwrite	= xfs_filemap_page_mkwrite,
1908 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
1909 };
1910 
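/*
 * Set up a mapping of this file: verify that any requested synchronous
 * mapping is supported by the backing device and install the XFS vm_ops.
 */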
1911 STATIC int
1912 xfs_file_mmap_prepare(
1913 	struct vm_area_desc	*desc)
1914 {
1915 	struct file		*file = desc->file;
1916 	struct inode		*inode = file_inode(file);
1917 	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));
1918 
1919 	/*
1920 	 * We don't support synchronous mappings for non-DAX files, nor for
1921 	 * DAX files if the underlying dax_device is not synchronous.
1922 	 */
1923 	if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
1924 				      target->bt_daxdev))
1925 		return -EOPNOTSUPP;
1926 
1927 	file_accessed(file);
1928 	desc->vm_ops = &xfs_file_vm_ops;
1929 	if (IS_DAX(inode))
1930 		desc->vm_flags |= VM_HUGEPAGE;
1931 	return 0;
1932 }
1933 
1934 const struct file_operations xfs_file_operations = {
1935 	.llseek		= xfs_file_llseek,
1936 	.read_iter	= xfs_file_read_iter,
1937 	.write_iter	= xfs_file_write_iter,
1938 	.splice_read	= xfs_file_splice_read,
1939 	.splice_write	= iter_file_splice_write,
1940 	.iopoll		= iocb_bio_iopoll,
1941 	.unlocked_ioctl	= xfs_file_ioctl,
1942 #ifdef CONFIG_COMPAT
1943 	.compat_ioctl	= xfs_file_compat_ioctl,
1944 #endif
1945 	.mmap_prepare	= xfs_file_mmap_prepare,
1946 	.open		= xfs_file_open,
1947 	.release	= xfs_file_release,
1948 	.fsync		= xfs_file_fsync,
1949 	.get_unmapped_area = thp_get_unmapped_area,
1950 	.fallocate	= xfs_file_fallocate,
1951 	.fadvise	= xfs_file_fadvise,
1952 	.remap_file_range = xfs_file_remap_range,
1953 	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
1954 			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
1955 			  FOP_DONTCACHE,
1956 };
1957 
1958 const struct file_operations xfs_dir_file_operations = {
1959 	.open		= xfs_dir_open,
1960 	.read		= generic_read_dir,
1961 	.iterate_shared	= xfs_file_readdir,
1962 	.llseek		= generic_file_llseek,
1963 	.unlocked_ioctl	= xfs_file_ioctl,
1964 #ifdef CONFIG_COMPAT
1965 	.compat_ioctl	= xfs_file_compat_ioctl,
1966 #endif
1967 	.fsync		= xfs_dir_fsync,
1968 };
1969