xref: /linux/fs/xfs/xfs_file.c (revision ddb7a62af2e766eabb4ab7080e6ed8d6b8915302)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_bmap.h"
17 #include "xfs_bmap_util.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ioctl.h"
21 #include "xfs_trace.h"
22 #include "xfs_log.h"
23 #include "xfs_icache.h"
24 #include "xfs_pnfs.h"
25 #include "xfs_iomap.h"
26 #include "xfs_reflink.h"
27 #include "xfs_file.h"
28 #include "xfs_aops.h"
29 #include "xfs_zone_alloc.h"
30 
31 #include <linux/dax.h>
32 #include <linux/falloc.h>
33 #include <linux/backing-dev.h>
34 #include <linux/mman.h>
35 #include <linux/fadvise.h>
36 #include <linux/mount.h>
37 
38 static const struct vm_operations_struct xfs_file_vm_ops;
39 
40 /*
41  * Decide if the given file range is aligned to the size of the fundamental
42  * allocation unit for the file.
43  */
44 bool
45 xfs_is_falloc_aligned(
46 	struct xfs_inode	*ip,
47 	loff_t			pos,
48 	long long int		len)
49 {
50 	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);
51 
52 	if (!is_power_of_2(alloc_unit))
53 		return isaligned_64(pos, alloc_unit) &&
54 		       isaligned_64(len, alloc_unit);
55 
56 	return !((pos | len) & (alloc_unit - 1));
57 }
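/*
 * Note on the helper above: when the allocation unit is a power of two
 * (for example a 4096-byte block, mask 0xfff), ((pos | len) & (alloc_unit - 1))
 * is zero only if both pos and len are multiples of alloc_unit.  The
 * isaligned_64() path handles non-power-of-two units, which can occur for
 * realtime inodes whose allocation unit is an arbitrary number of blocks.
 */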
58 
59 /*
60  * Fsync operations on directories are much simpler than on regular files,
61  * as there is no file data to flush, and thus also no need for explicit
62  * cache flush operations; nor are there any non-transaction metadata updates
63  * on directories either.
64  */
65 STATIC int
66 xfs_dir_fsync(
67 	struct file		*file,
68 	loff_t			start,
69 	loff_t			end,
70 	int			datasync)
71 {
72 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
73 
74 	trace_xfs_dir_fsync(ip);
75 	return xfs_log_force_inode(ip);
76 }
77 
78 static xfs_csn_t
79 xfs_fsync_seq(
80 	struct xfs_inode	*ip,
81 	bool			datasync)
82 {
83 	if (!xfs_ipincount(ip))
84 		return 0;
85 	if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
86 		return 0;
87 	return ip->i_itemp->ili_commit_seq;
88 }
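/*
 * Note: xfs_fsync_seq() above returns 0 when no log force is required: either
 * the inode is not pinned, or this is an fdatasync and the only change still
 * marked dirty for fsync purposes is a timestamp update (XFS_ILOG_TIMESTAMP).
 * Otherwise it returns the commit sequence the caller has to push the log to.
 */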
89 
90 /*
91  * All metadata updates are logged, which means that we just have to flush the
92  * log up to the latest LSN that touched the inode.
93  *
94  * If we have concurrent fsync/fdatasync() calls, we need them to all block on
95  * the log force before we clear the ili_fsync_fields field. This ensures that
96  * we don't get a racing sync operation that does not wait for the metadata to
97  * hit the journal before returning.  If we race with clearing ili_fsync_fields,
98  * then all that will happen is the log force will do nothing as the lsn will
99  * already be on disk.  We can't race with setting ili_fsync_fields because that
100  * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
101  * shared until after the ili_fsync_fields is cleared.
102  */
103 static  int
104 xfs_fsync_flush_log(
105 	struct xfs_inode	*ip,
106 	bool			datasync,
107 	int			*log_flushed)
108 {
109 	int			error = 0;
110 	xfs_csn_t		seq;
111 
112 	xfs_ilock(ip, XFS_ILOCK_SHARED);
113 	seq = xfs_fsync_seq(ip, datasync);
114 	if (seq) {
115 		error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
116 					  log_flushed);
117 
118 		spin_lock(&ip->i_itemp->ili_lock);
119 		ip->i_itemp->ili_fsync_fields = 0;
120 		spin_unlock(&ip->i_itemp->ili_lock);
121 	}
122 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
123 	return error;
124 }
125 
126 STATIC int
127 xfs_file_fsync(
128 	struct file		*file,
129 	loff_t			start,
130 	loff_t			end,
131 	int			datasync)
132 {
133 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
134 	struct xfs_mount	*mp = ip->i_mount;
135 	int			error, err2;
136 	int			log_flushed = 0;
137 
138 	trace_xfs_file_fsync(ip);
139 
140 	error = file_write_and_wait_range(file, start, end);
141 	if (error)
142 		return error;
143 
144 	if (xfs_is_shutdown(mp))
145 		return -EIO;
146 
147 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
148 
149 	/*
150 	 * If we have an RT and/or log subvolume we need to make sure to flush
151 	 * the write cache of the device used for file data first.  This is to
152 	 * ensure newly written file data makes it to disk before logging the new
153 	 * inode size in case of an extending write.
154 	 */
155 	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
156 		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
157 	else if (mp->m_logdev_targp != mp->m_ddev_targp)
158 		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
159 
160 	/*
161 	 * Any inode that has dirty modifications in the log is pinned.  The
162 	 * racy check here for a pinned inode will not catch modifications
163 	 * that happen concurrently to the fsync call, but fsync semantics
164 	 * only require syncing previously completed I/O.
165 	 */
166 	if (xfs_ipincount(ip)) {
167 		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
168 		if (err2 && !error)
169 			error = err2;
170 	}
171 
172 	/*
173 	 * If we only have a single device, and the log force above was
174 	 * a no-op, we might have to flush the data device cache here.
175 	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
176 	 * an already allocated file and thus do not have any metadata to
177 	 * commit.
178 	 */
179 	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
180 	    mp->m_logdev_targp == mp->m_ddev_targp) {
181 		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
182 		if (err2 && !error)
183 			error = err2;
184 	}
185 
186 	return error;
187 }
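/*
 * Note on the ordering above: xfs_file_fsync() first writes back and waits on
 * dirty pagecache, then pre-flushes the write cache of the RT device (for
 * realtime inodes on a separate RT device) or of the data device (when the
 * log sits on a separate device), then forces the log if the inode is pinned,
 * and finally flushes the shared device's cache when everything lives on one
 * device and the log force turned out to be a no-op.  The first error seen is
 * the one returned to the caller.
 */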
188 
189 static int
190 xfs_ilock_iocb(
191 	struct kiocb		*iocb,
192 	unsigned int		lock_mode)
193 {
194 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
195 
196 	if (iocb->ki_flags & IOCB_NOWAIT) {
197 		if (!xfs_ilock_nowait(ip, lock_mode))
198 			return -EAGAIN;
199 	} else {
200 		xfs_ilock(ip, lock_mode);
201 	}
202 
203 	return 0;
204 }
205 
206 static int
207 xfs_ilock_iocb_for_write(
208 	struct kiocb		*iocb,
209 	unsigned int		*lock_mode)
210 {
211 	ssize_t			ret;
212 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
213 
214 	ret = xfs_ilock_iocb(iocb, *lock_mode);
215 	if (ret)
216 		return ret;
217 
218 	/*
219 	 * If a reflink remap is in progress we always need to take the iolock
220 	 * exclusively to wait for it to finish.
221 	 */
222 	if (*lock_mode == XFS_IOLOCK_SHARED &&
223 	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
224 		xfs_iunlock(ip, *lock_mode);
225 		*lock_mode = XFS_IOLOCK_EXCL;
226 		return xfs_ilock_iocb(iocb, *lock_mode);
227 	}
228 
229 	return 0;
230 }
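/*
 * Note: xfs_ilock_iocb() above implements IOCB_NOWAIT by trylocking and
 * returning -EAGAIN rather than sleeping, so non-blocking submitters (for
 * example io_uring) can retry from a context that is allowed to block.
 * xfs_ilock_iocb_for_write() additionally upgrades a shared iolock to
 * exclusive when a reflink remap is in progress, since writers must wait for
 * the remap to finish.
 */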
231 
232 STATIC ssize_t
233 xfs_file_dio_read(
234 	struct kiocb		*iocb,
235 	struct iov_iter		*to)
236 {
237 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
238 	ssize_t			ret;
239 
240 	trace_xfs_file_direct_read(iocb, to);
241 
242 	if (!iov_iter_count(to))
243 		return 0; /* skip atime */
244 
245 	file_accessed(iocb->ki_filp);
246 
247 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
248 	if (ret)
249 		return ret;
250 	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
251 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
252 
253 	return ret;
254 }
255 
256 static noinline ssize_t
257 xfs_file_dax_read(
258 	struct kiocb		*iocb,
259 	struct iov_iter		*to)
260 {
261 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
262 	ssize_t			ret = 0;
263 
264 	trace_xfs_file_dax_read(iocb, to);
265 
266 	if (!iov_iter_count(to))
267 		return 0; /* skip atime */
268 
269 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
270 	if (ret)
271 		return ret;
272 	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
273 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
274 
275 	file_accessed(iocb->ki_filp);
276 	return ret;
277 }
278 
279 STATIC ssize_t
280 xfs_file_buffered_read(
281 	struct kiocb		*iocb,
282 	struct iov_iter		*to)
283 {
284 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
285 	ssize_t			ret;
286 
287 	trace_xfs_file_buffered_read(iocb, to);
288 
289 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
290 	if (ret)
291 		return ret;
292 	ret = generic_file_read_iter(iocb, to);
293 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
294 
295 	return ret;
296 }
297 
298 STATIC ssize_t
299 xfs_file_read_iter(
300 	struct kiocb		*iocb,
301 	struct iov_iter		*to)
302 {
303 	struct inode		*inode = file_inode(iocb->ki_filp);
304 	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
305 	ssize_t			ret = 0;
306 
307 	XFS_STATS_INC(mp, xs_read_calls);
308 
309 	if (xfs_is_shutdown(mp))
310 		return -EIO;
311 
312 	if (IS_DAX(inode))
313 		ret = xfs_file_dax_read(iocb, to);
314 	else if (iocb->ki_flags & IOCB_DIRECT)
315 		ret = xfs_file_dio_read(iocb, to);
316 	else
317 		ret = xfs_file_buffered_read(iocb, to);
318 
319 	if (ret > 0)
320 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
321 	return ret;
322 }
323 
324 STATIC ssize_t
325 xfs_file_splice_read(
326 	struct file		*in,
327 	loff_t			*ppos,
328 	struct pipe_inode_info	*pipe,
329 	size_t			len,
330 	unsigned int		flags)
331 {
332 	struct inode		*inode = file_inode(in);
333 	struct xfs_inode	*ip = XFS_I(inode);
334 	struct xfs_mount	*mp = ip->i_mount;
335 	ssize_t			ret = 0;
336 
337 	XFS_STATS_INC(mp, xs_read_calls);
338 
339 	if (xfs_is_shutdown(mp))
340 		return -EIO;
341 
342 	trace_xfs_file_splice_read(ip, *ppos, len);
343 
344 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
345 	ret = filemap_splice_read(in, ppos, pipe, len, flags);
346 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
347 	if (ret > 0)
348 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
349 	return ret;
350 }
351 
352 /*
353  * Take care of zeroing post-EOF blocks when they might exist.
354  *
355  * Returns 0 on success, a negative error on failure, or 1 if this
356  * function dropped the iolock and reacquired it exclusively and the caller
357  * needs to restart the write sanity checks.
358  */
359 static ssize_t
360 xfs_file_write_zero_eof(
361 	struct kiocb		*iocb,
362 	struct iov_iter		*from,
363 	unsigned int		*iolock,
364 	size_t			count,
365 	bool			*drained_dio,
366 	struct xfs_zone_alloc_ctx *ac)
367 {
368 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
369 	loff_t			isize;
370 	int			error;
371 
372 	/*
373 	 * We need to serialise against EOF updates that occur in IO completions
374 	 * here. We want to make sure that nobody is changing the size while
375 	 * we do this check until we have placed an IO barrier (i.e. hold
376 	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
377 	 * spinlock effectively forms a memory barrier once we have
378 	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
379 	 * hence be able to correctly determine if we need to run zeroing.
380 	 */
381 	spin_lock(&ip->i_flags_lock);
382 	isize = i_size_read(VFS_I(ip));
383 	if (iocb->ki_pos <= isize) {
384 		spin_unlock(&ip->i_flags_lock);
385 		return 0;
386 	}
387 	spin_unlock(&ip->i_flags_lock);
388 
389 	if (iocb->ki_flags & IOCB_NOWAIT)
390 		return -EAGAIN;
391 
392 	if (!*drained_dio) {
393 		/*
394 		 * If zeroing is needed and we are currently holding the iolock
395 		 * shared, we need to update it to exclusive, which implies
396 		 * having to redo all of the checks made so far.
397 		 */
398 		if (*iolock == XFS_IOLOCK_SHARED) {
399 			xfs_iunlock(ip, *iolock);
400 			*iolock = XFS_IOLOCK_EXCL;
401 			xfs_ilock(ip, *iolock);
402 			iov_iter_reexpand(from, count);
403 		}
404 
405 		/*
406 		 * We now have an IO submission barrier in place, but AIO can do
407 		 * EOF updates during IO completion and hence we now need to
408 		 * wait for all of them to drain.  Non-AIO DIO will have drained
409 		 * before we are given the XFS_IOLOCK_EXCL, and so for most
410 		 * cases this wait is a no-op.
411 		 */
412 		inode_dio_wait(VFS_I(ip));
413 		*drained_dio = true;
414 		return 1;
415 	}
416 
417 	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
418 
419 	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
420 	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
421 	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
422 
423 	return error;
424 }
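/*
 * Note: the return protocol of xfs_file_write_zero_eof() above is
 * deliberately tri-state: 0 means no post-EOF zeroing was needed (or it was
 * done), a negative value is an error, and 1 means the iolock was upgraded
 * to exclusive and in-flight direct I/O drained, so the caller must restart
 * its write checks from scratch.
 */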
425 
426 /*
427  * Common pre-write limit and setup checks.
428  *
429  * Called with the iolock held either shared or exclusive according to
430  * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
431  * if called for a direct write beyond i_size.
432  */
433 STATIC ssize_t
434 xfs_file_write_checks(
435 	struct kiocb		*iocb,
436 	struct iov_iter		*from,
437 	unsigned int		*iolock,
438 	struct xfs_zone_alloc_ctx *ac)
439 {
440 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
441 	size_t			count = iov_iter_count(from);
442 	bool			drained_dio = false;
443 	ssize_t			error;
444 
445 restart:
446 	error = generic_write_checks(iocb, from);
447 	if (error <= 0)
448 		return error;
449 
450 	if (iocb->ki_flags & IOCB_NOWAIT) {
451 		error = break_layout(inode, false);
452 		if (error == -EWOULDBLOCK)
453 			error = -EAGAIN;
454 	} else {
455 		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
456 	}
457 
458 	if (error)
459 		return error;
460 
461 	/*
462 	 * For changing security info in file_remove_privs() we need i_rwsem
463 	 * exclusively.
464 	 */
465 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
466 		xfs_iunlock(XFS_I(inode), *iolock);
467 		*iolock = XFS_IOLOCK_EXCL;
468 		error = xfs_ilock_iocb(iocb, *iolock);
469 		if (error) {
470 			*iolock = 0;
471 			return error;
472 		}
473 		goto restart;
474 	}
475 
476 	/*
477 	 * If the offset is beyond the size of the file, we need to zero all
478 	 * blocks that fall between the existing EOF and the start of this
479 	 * write.
480 	 *
481 	 * We can do an unlocked check for i_size here safely as I/O completion
482 	 * can only extend EOF.  Truncate is locked out at this point, so the
483 	 * EOF can not move backwards, only forwards. Hence we only need to take
484 	 * the slow path when we are at or beyond the current EOF.
485 	 */
486 	if (iocb->ki_pos > i_size_read(inode)) {
487 		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
488 				&drained_dio, ac);
489 		if (error == 1)
490 			goto restart;
491 		if (error)
492 			return error;
493 	}
494 
495 	return kiocb_modified(iocb);
496 }
497 
498 static ssize_t
499 xfs_zoned_write_space_reserve(
500 	struct xfs_mount		*mp,
501 	struct kiocb			*iocb,
502 	struct iov_iter			*from,
503 	unsigned int			flags,
504 	struct xfs_zone_alloc_ctx	*ac)
505 {
506 	loff_t				count = iov_iter_count(from);
507 	int				error;
508 
509 	if (iocb->ki_flags & IOCB_NOWAIT)
510 		flags |= XFS_ZR_NOWAIT;
511 
512 	/*
513 	 * Check the rlimit and LFS boundary first so that we don't over-reserve
514 	 * by possibly a lot.
515 	 *
516 	 * The generic write path will redo this check later, and it might have
517 	 * changed by then.  If it got expanded we'll stick to our earlier
518 	 * smaller limit, and if it is decreased the new smaller limit will be
519 	 * used and our extra space reservation will be returned after finishing
520 	 * the write.
521 	 */
522 	error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
523 	if (error)
524 		return error;
525 
526 	/*
527 	 * Sloppily round up count to file system blocks.
528 	 *
529 	 * This will often reserve an extra block, but that avoids having to look
530 	 * at the start offset, which isn't stable for O_APPEND until taking the
531 	 * iolock.  Also we need to reserve a block each for zeroing the old
532 	 * EOF block and the new start block if they are unaligned.
533 	 *
534 	 * Any remaining block will be returned after the write.
535 	 */
536 	return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2,
537 			flags, ac);
538 }
539 
540 static int
541 xfs_dio_write_end_io(
542 	struct kiocb		*iocb,
543 	ssize_t			size,
544 	int			error,
545 	unsigned		flags)
546 {
547 	struct inode		*inode = file_inode(iocb->ki_filp);
548 	struct xfs_inode	*ip = XFS_I(inode);
549 	loff_t			offset = iocb->ki_pos;
550 	unsigned int		nofs_flag;
551 
552 	ASSERT(!xfs_is_zoned_inode(ip) ||
553 	       !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
554 
555 	trace_xfs_end_io_direct_write(ip, offset, size);
556 
557 	if (xfs_is_shutdown(ip->i_mount))
558 		return -EIO;
559 
560 	if (error)
561 		return error;
562 	if (!size)
563 		return 0;
564 
565 	/*
566 	 * Capture amount written on completion as we can't reliably account
567 	 * for it on submission.
568 	 */
569 	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
570 
571 	/*
572 	 * We can allocate memory here while doing writeback on behalf of
573 	 * memory reclaim.  To avoid memory allocation deadlocks set the
574 	 * task-wide nofs context for the following operations.
575 	 */
576 	nofs_flag = memalloc_nofs_save();
577 
578 	if (flags & IOMAP_DIO_COW) {
579 		if (iocb->ki_flags & IOCB_ATOMIC)
580 			error = xfs_reflink_end_atomic_cow(ip, offset, size);
581 		else
582 			error = xfs_reflink_end_cow(ip, offset, size);
583 		if (error)
584 			goto out;
585 	}
586 
587 	/*
588 	 * Unwritten conversion updates the in-core isize after extent
589 	 * conversion but before updating the on-disk size. Updating isize any
590 	 * earlier allows a racing dio read to find unwritten extents before
591 	 * they are converted.
592 	 */
593 	if (flags & IOMAP_DIO_UNWRITTEN) {
594 		error = xfs_iomap_write_unwritten(ip, offset, size, true);
595 		goto out;
596 	}
597 
598 	/*
599 	 * We need to update the in-core inode size here so that we don't end up
600 	 * with the on-disk inode size being outside the in-core inode size. We
601 	 * have no other method of updating EOF for AIO, so always do it here
602 	 * if necessary.
603 	 *
604 	 * We need to lock the test/set EOF update as we can be racing with
605 	 * other IO completions here to update the EOF. Failing to serialise
606 	 * here can result in EOF moving backwards and Bad Things Happen when
607 	 * that occurs.
608 	 *
609 	 * As IO completion only ever extends EOF, we can do an unlocked check
610 	 * here to avoid taking the spinlock. If we land within the current EOF,
611 	 * then we do not need to do an extending update at all, and we don't
612 	 * need to take the lock to check this. If we race with an update moving
613 	 * EOF, then we'll either still be beyond EOF and need to take the lock,
614 	 * or we'll be within EOF and we don't need to take it at all.
615 	 */
616 	if (offset + size <= i_size_read(inode))
617 		goto out;
618 
619 	spin_lock(&ip->i_flags_lock);
620 	if (offset + size > i_size_read(inode)) {
621 		i_size_write(inode, offset + size);
622 		spin_unlock(&ip->i_flags_lock);
623 		error = xfs_setfilesize(ip, offset, size);
624 	} else {
625 		spin_unlock(&ip->i_flags_lock);
626 	}
627 
628 out:
629 	memalloc_nofs_restore(nofs_flag);
630 	return error;
631 }
632 
633 static const struct iomap_dio_ops xfs_dio_write_ops = {
634 	.end_io		= xfs_dio_write_end_io,
635 };
636 
637 static void
638 xfs_dio_zoned_submit_io(
639 	const struct iomap_iter	*iter,
640 	struct bio		*bio,
641 	loff_t			file_offset)
642 {
643 	struct xfs_mount	*mp = XFS_I(iter->inode)->i_mount;
644 	struct xfs_zone_alloc_ctx *ac = iter->private;
645 	xfs_filblks_t		count_fsb;
646 	struct iomap_ioend	*ioend;
647 
648 	count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
649 	if (count_fsb > ac->reserved_blocks) {
650 		xfs_err(mp,
651 "allocation (%lld) larger than reservation (%lld).",
652 			count_fsb, ac->reserved_blocks);
653 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
654 		bio_io_error(bio);
655 		return;
656 	}
657 	ac->reserved_blocks -= count_fsb;
658 
659 	bio->bi_end_io = xfs_end_bio;
660 	ioend = iomap_init_ioend(iter->inode, bio, file_offset,
661 			IOMAP_IOEND_DIRECT);
662 	xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
663 }
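/*
 * Note: the zoned submit path above charges every bio against the block
 * reservation taken before the iolock was acquired.  A bio that exceeds the
 * remaining reservation indicates broken space accounting, so the filesystem
 * is shut down rather than risking on-disk corruption.
 */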
664 
665 static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
666 	.bio_set	= &iomap_ioend_bioset,
667 	.submit_io	= xfs_dio_zoned_submit_io,
668 	.end_io		= xfs_dio_write_end_io,
669 };
670 
671 /*
672  * Handle block aligned direct I/O writes.
673  */
674 static noinline ssize_t
675 xfs_file_dio_write_aligned(
676 	struct xfs_inode	*ip,
677 	struct kiocb		*iocb,
678 	struct iov_iter		*from,
679 	const struct iomap_ops	*ops,
680 	const struct iomap_dio_ops *dops,
681 	struct xfs_zone_alloc_ctx *ac)
682 {
683 	unsigned int		iolock = XFS_IOLOCK_SHARED;
684 	ssize_t			ret;
685 
686 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
687 	if (ret)
688 		return ret;
689 	ret = xfs_file_write_checks(iocb, from, &iolock, ac);
690 	if (ret)
691 		goto out_unlock;
692 
693 	/*
694 	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
695 	 * the iolock back to shared if we had to take the exclusive lock in
696 	 * xfs_file_write_checks() for other reasons.
697 	 */
698 	if (iolock == XFS_IOLOCK_EXCL) {
699 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
700 		iolock = XFS_IOLOCK_SHARED;
701 	}
702 	trace_xfs_file_direct_write(iocb, from);
703 	ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
704 out_unlock:
705 	xfs_iunlock(ip, iolock);
706 	return ret;
707 }
708 
709 /*
710  * Handle block aligned direct I/O writes to zoned devices.
711  */
712 static noinline ssize_t
713 xfs_file_dio_write_zoned(
714 	struct xfs_inode	*ip,
715 	struct kiocb		*iocb,
716 	struct iov_iter		*from)
717 {
718 	struct xfs_zone_alloc_ctx ac = { };
719 	ssize_t			ret;
720 
721 	ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac);
722 	if (ret < 0)
723 		return ret;
724 	ret = xfs_file_dio_write_aligned(ip, iocb, from,
725 			&xfs_zoned_direct_write_iomap_ops,
726 			&xfs_dio_zoned_write_ops, &ac);
727 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
728 	return ret;
729 }
730 
731 /*
732  * Handle block atomic writes
733  *
734  * Two methods of atomic writes are supported:
735  * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
736  *   disk
737  * - COW-based, which uses a COW fork as a staging extent for data updates
738  *   before atomically updating extent mappings for the range being written
739  *
740  */
741 static noinline ssize_t
742 xfs_file_dio_write_atomic(
743 	struct xfs_inode	*ip,
744 	struct kiocb		*iocb,
745 	struct iov_iter		*from)
746 {
747 	unsigned int		iolock = XFS_IOLOCK_SHARED;
748 	ssize_t			ret, ocount = iov_iter_count(from);
749 	const struct iomap_ops	*dops;
750 
751 	/*
752 	 * HW offload should be faster, so try that first if it is already
753 	 * known that the write length is not too large.
754 	 */
755 	if (ocount > xfs_inode_buftarg(ip)->bt_awu_max)
756 		dops = &xfs_atomic_write_cow_iomap_ops;
757 	else
758 		dops = &xfs_direct_write_iomap_ops;
759 
760 retry:
761 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
762 	if (ret)
763 		return ret;
764 
765 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
766 	if (ret)
767 		goto out_unlock;
768 
769 	/* Demote similar to xfs_file_dio_write_aligned() */
770 	if (iolock == XFS_IOLOCK_EXCL) {
771 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
772 		iolock = XFS_IOLOCK_SHARED;
773 	}
774 
775 	trace_xfs_file_direct_write(iocb, from);
776 	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
777 			0, NULL, 0);
778 
779 	/*
780 	 * The retry mechanism is based on the ->iomap_begin method returning
781 	 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
782 	 * possible. The REQ_ATOMIC-based method will typically not be possible if
783 	 * the write spans multiple extents or the disk blocks are misaligned.
784 	 */
785 	if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
786 		xfs_iunlock(ip, iolock);
787 		dops = &xfs_atomic_write_cow_iomap_ops;
788 		goto retry;
789 	}
790 
791 out_unlock:
792 	if (iolock)
793 		xfs_iunlock(ip, iolock);
794 	return ret;
795 }
796 
797 /*
798  * Handle block unaligned direct I/O writes
799  *
800  * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
801  * them to be done in parallel with reads and other direct I/O writes.  However,
802  * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
803  * to do sub-block zeroing and that requires serialisation against other direct
804  * I/O to the same block.  In this case we need to serialise the submission of
805  * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
806  * In the case where sub-block zeroing is not required, we can do concurrent
807  * sub-block dios to the same block successfully.
808  *
809  * Optimistically submit the I/O using the shared lock first, but use the
810  * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
811  * if block allocation or partial block zeroing would be required.  In that case
812  * we try again with the exclusive lock.
813  */
814 static noinline ssize_t
815 xfs_file_dio_write_unaligned(
816 	struct xfs_inode	*ip,
817 	struct kiocb		*iocb,
818 	struct iov_iter		*from)
819 {
820 	size_t			isize = i_size_read(VFS_I(ip));
821 	size_t			count = iov_iter_count(from);
822 	unsigned int		iolock = XFS_IOLOCK_SHARED;
823 	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
824 	ssize_t			ret;
825 
826 	/*
827 	 * Extending writes need exclusivity because of the sub-block zeroing
828 	 * that the DIO code always does for partial tail blocks beyond EOF, so
829 	 * don't even bother trying the fast path in this case.
830 	 */
831 	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
832 		if (iocb->ki_flags & IOCB_NOWAIT)
833 			return -EAGAIN;
834 retry_exclusive:
835 		iolock = XFS_IOLOCK_EXCL;
836 		flags = IOMAP_DIO_FORCE_WAIT;
837 	}
838 
839 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
840 	if (ret)
841 		return ret;
842 
843 	/*
844 	 * We can't properly handle unaligned direct I/O to reflink files yet,
845 	 * as we can't unshare a partial block.
846 	 */
847 	if (xfs_is_cow_inode(ip)) {
848 		trace_xfs_reflink_bounce_dio_write(iocb, from);
849 		ret = -ENOTBLK;
850 		goto out_unlock;
851 	}
852 
853 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
854 	if (ret)
855 		goto out_unlock;
856 
857 	/*
858 	 * If we are doing exclusive unaligned I/O, this must be the only I/O
859 	 * in-flight.  Otherwise we risk data corruption due to unwritten extent
860 	 * conversions from the AIO end_io handler.  Wait for all other I/O to
861 	 * drain first.
862 	 */
863 	if (flags & IOMAP_DIO_FORCE_WAIT)
864 		inode_dio_wait(VFS_I(ip));
865 
866 	trace_xfs_file_direct_write(iocb, from);
867 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
868 			   &xfs_dio_write_ops, flags, NULL, 0);
869 
870 	/*
871 	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
872 	 * layer rejected it for mapping or locking reasons. If we are doing
873 	 * nonblocking user I/O, propagate the error.
874 	 */
875 	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
876 		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
877 		xfs_iunlock(ip, iolock);
878 		goto retry_exclusive;
879 	}
880 
881 out_unlock:
882 	if (iolock)
883 		xfs_iunlock(ip, iolock);
884 	return ret;
885 }
886 
887 static ssize_t
888 xfs_file_dio_write(
889 	struct kiocb		*iocb,
890 	struct iov_iter		*from)
891 {
892 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
893 	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
894 	size_t			count = iov_iter_count(from);
895 
896 	/* direct I/O must be aligned to device logical sector size */
897 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
898 		return -EINVAL;
899 
900 	/*
901 	 * For always COW inodes we also must check the alignment of each
902 	 * individual iovec segment, as they could end up with different
903 	 * I/Os due to the way bio_iov_iter_get_pages works, and we'd
904 	 * then overwrite an already written block.
905 	 */
906 	if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
907 	    (xfs_is_always_cow_inode(ip) &&
908 	     (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
909 		return xfs_file_dio_write_unaligned(ip, iocb, from);
910 	if (xfs_is_zoned_inode(ip))
911 		return xfs_file_dio_write_zoned(ip, iocb, from);
912 	if (iocb->ki_flags & IOCB_ATOMIC)
913 		return xfs_file_dio_write_atomic(ip, iocb, from);
914 	return xfs_file_dio_write_aligned(ip, iocb, from,
915 			&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
916 }
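/*
 * Note: the dispatch in xfs_file_dio_write() above works out to: writes not
 * aligned to the filesystem block size (or with misaligned iovecs on
 * always-COW inodes) take the serialised unaligned path, zoned inodes take
 * the zoned path, IOCB_ATOMIC writes take the atomic path, and everything
 * else uses the plain block-aligned path.
 */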
917 
918 static noinline ssize_t
919 xfs_file_dax_write(
920 	struct kiocb		*iocb,
921 	struct iov_iter		*from)
922 {
923 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
924 	struct xfs_inode	*ip = XFS_I(inode);
925 	unsigned int		iolock = XFS_IOLOCK_EXCL;
926 	ssize_t			ret, error = 0;
927 	loff_t			pos;
928 
929 	ret = xfs_ilock_iocb(iocb, iolock);
930 	if (ret)
931 		return ret;
932 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
933 	if (ret)
934 		goto out;
935 
936 	pos = iocb->ki_pos;
937 
938 	trace_xfs_file_dax_write(iocb, from);
939 	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
940 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
941 		i_size_write(inode, iocb->ki_pos);
942 		error = xfs_setfilesize(ip, pos, ret);
943 	}
944 out:
945 	if (iolock)
946 		xfs_iunlock(ip, iolock);
947 	if (error)
948 		return error;
949 
950 	if (ret > 0) {
951 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
952 
953 		/* Handle various SYNC-type writes */
954 		ret = generic_write_sync(iocb, ret);
955 	}
956 	return ret;
957 }
958 
959 STATIC ssize_t
960 xfs_file_buffered_write(
961 	struct kiocb		*iocb,
962 	struct iov_iter		*from)
963 {
964 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
965 	struct xfs_inode	*ip = XFS_I(inode);
966 	ssize_t			ret;
967 	bool			cleared_space = false;
968 	unsigned int		iolock;
969 
970 write_retry:
971 	iolock = XFS_IOLOCK_EXCL;
972 	ret = xfs_ilock_iocb(iocb, iolock);
973 	if (ret)
974 		return ret;
975 
976 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
977 	if (ret)
978 		goto out;
979 
980 	trace_xfs_file_buffered_write(iocb, from);
981 	ret = iomap_file_buffered_write(iocb, from,
982 			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
983 			NULL);
984 
985 	/*
986 	 * If we hit a space limit, try to free up some lingering preallocated
987 	 * space before returning an error. In the case of ENOSPC, first try to
988 	 * write back all dirty inodes to free up some of the excess reserved
989 	 * metadata space. This reduces the chances that the eofblocks scan
990 	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
991 	 * also behaves as a filter to prevent too many eofblocks scans from
992 	 * running at the same time.  Use a synchronous scan to increase the
993 	 * effectiveness of the scan.
994 	 */
995 	if (ret == -EDQUOT && !cleared_space) {
996 		xfs_iunlock(ip, iolock);
997 		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
998 		cleared_space = true;
999 		goto write_retry;
1000 	} else if (ret == -ENOSPC && !cleared_space) {
1001 		struct xfs_icwalk	icw = {0};
1002 
1003 		cleared_space = true;
1004 		xfs_flush_inodes(ip->i_mount);
1005 
1006 		xfs_iunlock(ip, iolock);
1007 		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
1008 		xfs_blockgc_free_space(ip->i_mount, &icw);
1009 		goto write_retry;
1010 	}
1011 
1012 out:
1013 	if (iolock)
1014 		xfs_iunlock(ip, iolock);
1015 
1016 	if (ret > 0) {
1017 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
1018 		/* Handle various SYNC-type writes */
1019 		ret = generic_write_sync(iocb, ret);
1020 	}
1021 	return ret;
1022 }
1023 
1024 STATIC ssize_t
1025 xfs_file_buffered_write_zoned(
1026 	struct kiocb		*iocb,
1027 	struct iov_iter		*from)
1028 {
1029 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
1030 	struct xfs_mount	*mp = ip->i_mount;
1031 	unsigned int		iolock = XFS_IOLOCK_EXCL;
1032 	bool			cleared_space = false;
1033 	struct xfs_zone_alloc_ctx ac = { };
1034 	ssize_t			ret;
1035 
1036 	ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
1037 	if (ret < 0)
1038 		return ret;
1039 
1040 	ret = xfs_ilock_iocb(iocb, iolock);
1041 	if (ret)
1042 		goto out_unreserve;
1043 
1044 	ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
1045 	if (ret)
1046 		goto out_unlock;
1047 
1048 	/*
1049 	 * Truncate the iter to the length that we were actually able to
1050 	 * allocate blocks for.  This needs to happen after
1051 	 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
1052 	 * writes.
1053 	 */
1054 	iov_iter_truncate(from,
1055 			XFS_FSB_TO_B(mp, ac.reserved_blocks) -
1056 			(iocb->ki_pos & mp->m_blockmask));
1057 	if (!iov_iter_count(from))
1058 		goto out_unlock;
1059 
1060 retry:
1061 	trace_xfs_file_buffered_write(iocb, from);
1062 	ret = iomap_file_buffered_write(iocb, from,
1063 			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
1064 			&ac);
1065 	if (ret == -ENOSPC && !cleared_space) {
1066 		/*
1067 		 * Kick off writeback to convert delalloc space and release the
1068 		 * usually too pessimistic indirect block reservations.
1069 		 */
1070 		xfs_flush_inodes(mp);
1071 		cleared_space = true;
1072 		goto retry;
1073 	}
1074 
1075 out_unlock:
1076 	xfs_iunlock(ip, iolock);
1077 out_unreserve:
1078 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
1079 	if (ret > 0) {
1080 		XFS_STATS_ADD(mp, xs_write_bytes, ret);
1081 		ret = generic_write_sync(iocb, ret);
1082 	}
1083 	return ret;
1084 }
1085 
1086 STATIC ssize_t
1087 xfs_file_write_iter(
1088 	struct kiocb		*iocb,
1089 	struct iov_iter		*from)
1090 {
1091 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
1092 	struct xfs_inode	*ip = XFS_I(inode);
1093 	ssize_t			ret;
1094 	size_t			ocount = iov_iter_count(from);
1095 
1096 	XFS_STATS_INC(ip->i_mount, xs_write_calls);
1097 
1098 	if (ocount == 0)
1099 		return 0;
1100 
1101 	if (xfs_is_shutdown(ip->i_mount))
1102 		return -EIO;
1103 
1104 	if (IS_DAX(inode))
1105 		return xfs_file_dax_write(iocb, from);
1106 
1107 	if (iocb->ki_flags & IOCB_ATOMIC) {
1108 		if (ocount < xfs_get_atomic_write_min(ip))
1109 			return -EINVAL;
1110 
1111 		if (ocount > xfs_get_atomic_write_max(ip))
1112 			return -EINVAL;
1113 
1114 		ret = generic_atomic_write_valid(iocb, from);
1115 		if (ret)
1116 			return ret;
1117 	}
1118 
1119 	if (iocb->ki_flags & IOCB_DIRECT) {
1120 		/*
1121 		 * Allow a directio write to fall back to a buffered
1122 		 * write *only* in the case that we're doing a reflink
1123 		 * CoW.  In all other directio scenarios we do not
1124 		 * allow an operation to fall back to buffered mode.
1125 		 */
1126 		ret = xfs_file_dio_write(iocb, from);
1127 		if (ret != -ENOTBLK)
1128 			return ret;
1129 	}
1130 
1131 	if (xfs_is_zoned_inode(ip))
1132 		return xfs_file_buffered_write_zoned(iocb, from);
1133 	return xfs_file_buffered_write(iocb, from);
1134 }
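/*
 * Note on the ordering in xfs_file_write_iter() above: DAX is handled first,
 * IOCB_ATOMIC requests are then validated against the inode's minimum and
 * maximum atomic write sizes, direct I/O is attempted next and only falls
 * back to the buffered path when it returns -ENOTBLK (the reflink COW bounce
 * case), and zoned inodes finally use the reservation-aware buffered path.
 */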
1135 
1136 /* Does this file, inode, or mount want synchronous writes? */
1137 static inline bool xfs_file_sync_writes(struct file *filp)
1138 {
1139 	struct xfs_inode	*ip = XFS_I(file_inode(filp));
1140 
1141 	if (xfs_has_wsync(ip->i_mount))
1142 		return true;
1143 	if (filp->f_flags & (__O_SYNC | O_DSYNC))
1144 		return true;
1145 	if (IS_SYNC(file_inode(filp)))
1146 		return true;
1147 
1148 	return false;
1149 }
1150 
1151 static int
1152 xfs_falloc_newsize(
1153 	struct file		*file,
1154 	int			mode,
1155 	loff_t			offset,
1156 	loff_t			len,
1157 	loff_t			*new_size)
1158 {
1159 	struct inode		*inode = file_inode(file);
1160 
1161 	if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
1162 		return 0;
1163 	*new_size = offset + len;
1164 	return inode_newsize_ok(inode, *new_size);
1165 }
1166 
1167 static int
1168 xfs_falloc_setsize(
1169 	struct file		*file,
1170 	loff_t			new_size)
1171 {
1172 	struct iattr iattr = {
1173 		.ia_valid	= ATTR_SIZE,
1174 		.ia_size	= new_size,
1175 	};
1176 
1177 	if (!new_size)
1178 		return 0;
1179 	return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
1180 			&iattr);
1181 }
1182 
1183 static int
1184 xfs_falloc_collapse_range(
1185 	struct file		*file,
1186 	loff_t			offset,
1187 	loff_t			len,
1188 	struct xfs_zone_alloc_ctx *ac)
1189 {
1190 	struct inode		*inode = file_inode(file);
1191 	loff_t			new_size = i_size_read(inode) - len;
1192 	int			error;
1193 
1194 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1195 		return -EINVAL;
1196 
1197 	/*
1198 	 * There is no need for the collapse range to overlap EOF; in that case
1199 	 * it is effectively a truncate operation
1200 	 */
1201 	if (offset + len >= i_size_read(inode))
1202 		return -EINVAL;
1203 
1204 	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
1205 	if (error)
1206 		return error;
1207 	return xfs_falloc_setsize(file, new_size);
1208 }
1209 
1210 static int
1211 xfs_falloc_insert_range(
1212 	struct file		*file,
1213 	loff_t			offset,
1214 	loff_t			len)
1215 {
1216 	struct inode		*inode = file_inode(file);
1217 	loff_t			isize = i_size_read(inode);
1218 	int			error;
1219 
1220 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1221 		return -EINVAL;
1222 
1223 	/*
1224 	 * New inode size must not exceed ->s_maxbytes, accounting for
1225 	 * possible signed overflow.
1226 	 */
1227 	if (inode->i_sb->s_maxbytes - isize < len)
1228 		return -EFBIG;
1229 
1230 	/* Offset should be less than i_size */
1231 	if (offset >= isize)
1232 		return -EINVAL;
1233 
1234 	error = xfs_falloc_setsize(file, isize + len);
1235 	if (error)
1236 		return error;
1237 
1238 	/*
1239 	 * Perform hole insertion now that the file size has been updated so
1240 	 * that if we crash during the operation we don't leave shifted extents
1241 	 * past EOF and hence lose access to the data that is contained within
1242 	 * them.
1243 	 */
1244 	return xfs_insert_file_space(XFS_I(inode), offset, len);
1245 }
1246 
1247 /*
1248  * Punch a hole and prealloc the range.  We use a hole punch rather than
1249  * unwritten extent conversion for two reasons:
1250  *
1251  *   1.) Hole punch handles partial block zeroing for us.
1252  *   2.) If prealloc returns ENOSPC, the file range is still zero-valued by
1253  *	 virtue of the hole punch.
1254  */
1255 static int
1256 xfs_falloc_zero_range(
1257 	struct file		*file,
1258 	int			mode,
1259 	loff_t			offset,
1260 	loff_t			len,
1261 	struct xfs_zone_alloc_ctx *ac)
1262 {
1263 	struct inode		*inode = file_inode(file);
1264 	unsigned int		blksize = i_blocksize(inode);
1265 	loff_t			new_size = 0;
1266 	int			error;
1267 
1268 	trace_xfs_zero_file_space(XFS_I(inode));
1269 
1270 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1271 	if (error)
1272 		return error;
1273 
1274 	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
1275 	if (error)
1276 		return error;
1277 
1278 	len = round_up(offset + len, blksize) - round_down(offset, blksize);
1279 	offset = round_down(offset, blksize);
1280 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1281 	if (error)
1282 		return error;
1283 	return xfs_falloc_setsize(file, new_size);
1284 }
1285 
1286 static int
1287 xfs_falloc_unshare_range(
1288 	struct file		*file,
1289 	int			mode,
1290 	loff_t			offset,
1291 	loff_t			len)
1292 {
1293 	struct inode		*inode = file_inode(file);
1294 	loff_t			new_size = 0;
1295 	int			error;
1296 
1297 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1298 	if (error)
1299 		return error;
1300 
1301 	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
1302 	if (error)
1303 		return error;
1304 
1305 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1306 	if (error)
1307 		return error;
1308 	return xfs_falloc_setsize(file, new_size);
1309 }
1310 
1311 static int
1312 xfs_falloc_allocate_range(
1313 	struct file		*file,
1314 	int			mode,
1315 	loff_t			offset,
1316 	loff_t			len)
1317 {
1318 	struct inode		*inode = file_inode(file);
1319 	loff_t			new_size = 0;
1320 	int			error;
1321 
1322 	/*
1323 	 * If always_cow mode we can't use preallocations and thus should not
1324 	 * create them.
1325 	 */
1326 	if (xfs_is_always_cow_inode(XFS_I(inode)))
1327 		return -EOPNOTSUPP;
1328 
1329 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1330 	if (error)
1331 		return error;
1332 
1333 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1334 	if (error)
1335 		return error;
1336 	return xfs_falloc_setsize(file, new_size);
1337 }
1338 
1339 #define	XFS_FALLOC_FL_SUPPORTED						\
1340 		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
1341 		 FALLOC_FL_PUNCH_HOLE |	FALLOC_FL_COLLAPSE_RANGE |	\
1342 		 FALLOC_FL_ZERO_RANGE |	FALLOC_FL_INSERT_RANGE |	\
1343 		 FALLOC_FL_UNSHARE_RANGE)
1344 
1345 STATIC long
1346 __xfs_file_fallocate(
1347 	struct file		*file,
1348 	int			mode,
1349 	loff_t			offset,
1350 	loff_t			len,
1351 	struct xfs_zone_alloc_ctx *ac)
1352 {
1353 	struct inode		*inode = file_inode(file);
1354 	struct xfs_inode	*ip = XFS_I(inode);
1355 	long			error;
1356 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1357 
1358 	xfs_ilock(ip, iolock);
1359 	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
1360 	if (error)
1361 		goto out_unlock;
1362 
1363 	/*
1364 	 * Must wait for all AIO to complete before we continue as AIO can
1365 	 * change the file size on completion without holding any locks we
1366 	 * currently hold. We must do this first because AIO can update both
1367 	 * the on disk and in memory inode sizes, and the operations that follow
1368 	 * require the in-memory size to be fully up-to-date.
1369 	 */
1370 	inode_dio_wait(inode);
1371 
1372 	error = file_modified(file);
1373 	if (error)
1374 		goto out_unlock;
1375 
1376 	switch (mode & FALLOC_FL_MODE_MASK) {
1377 	case FALLOC_FL_PUNCH_HOLE:
1378 		error = xfs_free_file_space(ip, offset, len, ac);
1379 		break;
1380 	case FALLOC_FL_COLLAPSE_RANGE:
1381 		error = xfs_falloc_collapse_range(file, offset, len, ac);
1382 		break;
1383 	case FALLOC_FL_INSERT_RANGE:
1384 		error = xfs_falloc_insert_range(file, offset, len);
1385 		break;
1386 	case FALLOC_FL_ZERO_RANGE:
1387 		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
1388 		break;
1389 	case FALLOC_FL_UNSHARE_RANGE:
1390 		error = xfs_falloc_unshare_range(file, mode, offset, len);
1391 		break;
1392 	case FALLOC_FL_ALLOCATE_RANGE:
1393 		error = xfs_falloc_allocate_range(file, mode, offset, len);
1394 		break;
1395 	default:
1396 		error = -EOPNOTSUPP;
1397 		break;
1398 	}
1399 
1400 	if (!error && xfs_file_sync_writes(file))
1401 		error = xfs_log_force_inode(ip);
1402 
1403 out_unlock:
1404 	xfs_iunlock(ip, iolock);
1405 	return error;
1406 }
1407 
1408 static long
1409 xfs_file_zoned_fallocate(
1410 	struct file		*file,
1411 	int			mode,
1412 	loff_t			offset,
1413 	loff_t			len)
1414 {
1415 	struct xfs_zone_alloc_ctx ac = { };
1416 	struct xfs_inode	*ip = XFS_I(file_inode(file));
1417 	int			error;
1418 
1419 	error = xfs_zoned_space_reserve(ip->i_mount, 2, XFS_ZR_RESERVED, &ac);
1420 	if (error)
1421 		return error;
1422 	error = __xfs_file_fallocate(file, mode, offset, len, &ac);
1423 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
1424 	return error;
1425 }
1426 
1427 static long
1428 xfs_file_fallocate(
1429 	struct file		*file,
1430 	int			mode,
1431 	loff_t			offset,
1432 	loff_t			len)
1433 {
1434 	struct inode		*inode = file_inode(file);
1435 
1436 	if (!S_ISREG(inode->i_mode))
1437 		return -EINVAL;
1438 	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
1439 		return -EOPNOTSUPP;
1440 
1441 	/*
1442 	 * For zoned file systems, zeroing the first and last block of a hole
1443 	 * punch requires allocating a new block to rewrite the remaining data
1444 	 * and new zeroes out of place.  Get a reservation for those before
1445 	 * taking the iolock.  Dip into the reserved pool because we are
1446 	 * expected to be able to punch a hole even on a completely full
1447 	 * file system.
1448 	 */
1449 	if (xfs_is_zoned_inode(XFS_I(inode)) &&
1450 	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
1451 		     FALLOC_FL_COLLAPSE_RANGE)))
1452 		return xfs_file_zoned_fallocate(file, mode, offset, len);
1453 	return __xfs_file_fallocate(file, mode, offset, len, NULL);
1454 }
1455 
1456 STATIC int
1457 xfs_file_fadvise(
1458 	struct file	*file,
1459 	loff_t		start,
1460 	loff_t		end,
1461 	int		advice)
1462 {
1463 	struct xfs_inode *ip = XFS_I(file_inode(file));
1464 	int ret;
1465 	int lockflags = 0;
1466 
1467 	/*
1468 	 * Operations creating pages in page cache need protection from hole
1469 	 * punching and similar ops
1470 	 */
1471 	if (advice == POSIX_FADV_WILLNEED) {
1472 		lockflags = XFS_IOLOCK_SHARED;
1473 		xfs_ilock(ip, lockflags);
1474 	}
1475 	ret = generic_fadvise(file, start, end, advice);
1476 	if (lockflags)
1477 		xfs_iunlock(ip, lockflags);
1478 	return ret;
1479 }
1480 
1481 STATIC loff_t
1482 xfs_file_remap_range(
1483 	struct file		*file_in,
1484 	loff_t			pos_in,
1485 	struct file		*file_out,
1486 	loff_t			pos_out,
1487 	loff_t			len,
1488 	unsigned int		remap_flags)
1489 {
1490 	struct inode		*inode_in = file_inode(file_in);
1491 	struct xfs_inode	*src = XFS_I(inode_in);
1492 	struct inode		*inode_out = file_inode(file_out);
1493 	struct xfs_inode	*dest = XFS_I(inode_out);
1494 	struct xfs_mount	*mp = src->i_mount;
1495 	loff_t			remapped = 0;
1496 	xfs_extlen_t		cowextsize;
1497 	int			ret;
1498 
1499 	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1500 		return -EINVAL;
1501 
1502 	if (!xfs_has_reflink(mp))
1503 		return -EOPNOTSUPP;
1504 
1505 	if (xfs_is_shutdown(mp))
1506 		return -EIO;
1507 
1508 	/* Prepare and then clone file data. */
1509 	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1510 			&len, remap_flags);
1511 	if (ret || len == 0)
1512 		return ret;
1513 
1514 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1515 
1516 	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1517 			&remapped);
1518 	if (ret)
1519 		goto out_unlock;
1520 
1521 	/*
1522 	 * Carry the cowextsize hint from src to dest if we're sharing the
1523 	 * entire source file to the entire destination file, the source file
1524 	 * has a cowextsize hint, and the destination file does not.
1525 	 */
1526 	cowextsize = 0;
1527 	if (pos_in == 0 && len == i_size_read(inode_in) &&
1528 	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1529 	    pos_out == 0 && len >= i_size_read(inode_out) &&
1530 	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1531 		cowextsize = src->i_cowextsize;
1532 
1533 	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1534 			remap_flags);
1535 	if (ret)
1536 		goto out_unlock;
1537 
1538 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1539 		xfs_log_force_inode(dest);
1540 out_unlock:
1541 	xfs_iunlock2_remapping(src, dest);
1542 	if (ret)
1543 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1544 	/*
1545 	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
1546 	 * handle partial results -- either the whole remap succeeds, or we
1547 	 * must say why it did not.  In this case, any error should be returned
1548 	 * to the caller.
1549 	 */
1550 	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
1551 		return ret;
1552 	return remapped > 0 ? remapped : ret;
1553 }
1554 
1555 STATIC int
1556 xfs_file_open(
1557 	struct inode	*inode,
1558 	struct file	*file)
1559 {
1560 	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1561 		return -EIO;
1562 	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
1563 	if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
1564 		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
1565 	return generic_file_open(inode, file);
1566 }
1567 
1568 STATIC int
1569 xfs_dir_open(
1570 	struct inode	*inode,
1571 	struct file	*file)
1572 {
1573 	struct xfs_inode *ip = XFS_I(inode);
1574 	unsigned int	mode;
1575 	int		error;
1576 
1577 	if (xfs_is_shutdown(ip->i_mount))
1578 		return -EIO;
1579 	error = generic_file_open(inode, file);
1580 	if (error)
1581 		return error;
1582 
1583 	/*
1584 	 * If there are any blocks, read-ahead block 0 as we're almost
1585 	 * certain to have the next operation be a read there.
1586 	 */
1587 	mode = xfs_ilock_data_map_shared(ip);
1588 	if (ip->i_df.if_nextents > 0)
1589 		error = xfs_dir3_data_readahead(ip, 0, 0);
1590 	xfs_iunlock(ip, mode);
1591 	return error;
1592 }
1593 
1594 /*
1595  * Don't bother propagating errors.  We're just doing cleanup, and the caller
1596  * ignores the return value anyway.
1597  */
1598 STATIC int
1599 xfs_file_release(
1600 	struct inode		*inode,
1601 	struct file		*file)
1602 {
1603 	struct xfs_inode	*ip = XFS_I(inode);
1604 	struct xfs_mount	*mp = ip->i_mount;
1605 
1606 	/*
1607 	 * If this is a read-only mount or the file system has been shut down,
1608 	 * don't generate I/O.
1609 	 */
1610 	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
1611 		return 0;
1612 
1613 	/*
1614 	 * If we previously truncated this file and removed old data in the
1615 	 * process, we want to initiate "early" writeout on the last close.
1616 	 * This is an attempt to combat the notorious NULL files problem which
1617 	 * is particularly noticeable from a truncate down, buffered (re-)write
1618 	 * (delalloc), followed by a crash.  What we are effectively doing here
1619 	 * is significantly reducing the time window where we'd otherwise be
1620 	 * exposed to that problem.
1621 	 */
1622 	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
1623 		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
1624 		if (ip->i_delayed_blks > 0)
1625 			filemap_flush(inode->i_mapping);
1626 	}
1627 
1628 	/*
1629 	 * XFS aggressively preallocates post-EOF space to generate contiguous
1630 	 * allocations for writers that append to the end of the file.
1631 	 *
1632 	 * To support workloads that close and reopen the file frequently, these
1633 	 * preallocations usually persist after a close unless it is the first
1634 	 * close for the inode.  This is a tradeoff to generate tightly packed
1635 	 * data layouts for unpacking tarballs or similar archives that write
1636 	 * one file after another without going back to it while keeping the
1637 	 * preallocation for files that have recurring open/write/close cycles.
1638 	 *
1639 	 * This heuristic is skipped for inodes with the append-only flag as
1640 	 * that flag is rather pointless for inodes written only once.
1641 	 *
1642 	 * There is no point in freeing blocks here for open but unlinked files
1643 	 * as they will be taken care of by the inactivation path soon.
1644 	 *
1645 	 * When releasing a read-only context, don't flush data or trim post-EOF
1646 	 * blocks.  This avoids open/read/close workloads from removing EOF
1647 	 * blocks that other writers depend upon to reduce fragmentation.
1648 	 *
1649 	 * Inodes on the zoned RT device never have preallocations, so skip
1650 	 * taking the locks below.
1651 	 */
1652 	if (!inode->i_nlink ||
1653 	    !(file->f_mode & FMODE_WRITE) ||
1654 	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
1655 	    xfs_is_zoned_inode(ip))
1656 		return 0;
1657 
1658 	/*
1659 	 * If we can't get the iolock just skip truncating the blocks past EOF
1660 	 * because we could deadlock with the mmap_lock otherwise. We'll get
1661 	 * another chance to drop them once the last reference to the inode is
1662 	 * dropped, so we'll never leak blocks permanently.
1663 	 */
1664 	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
1665 	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1666 		if (xfs_can_free_eofblocks(ip) &&
1667 		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
1668 			xfs_free_eofblocks(ip);
1669 		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1670 	}
1671 
1672 	return 0;
1673 }
1674 
1675 STATIC int
1676 xfs_file_readdir(
1677 	struct file	*file,
1678 	struct dir_context *ctx)
1679 {
1680 	struct inode	*inode = file_inode(file);
1681 	xfs_inode_t	*ip = XFS_I(inode);
1682 	size_t		bufsize;
1683 
1684 	/*
1685 	 * The Linux API doesn't pass the total size of the buffer
1686 	 * we read into down to the filesystem.  With the filldir concept
1687 	 * it's not needed for correct information, but the XFS dir2 leaf
1688 	 * code wants an estimate of the buffer size to calculate its
1689 	 * readahead window and size the buffers used for mapping to
1690 	 * physical blocks.
1691 	 *
1692 	 * Try to give it an estimate that's good enough, maybe at some
1693 	 * point we can change the ->readdir prototype to include the
1694 	 * buffer size.  For now we use the current glibc buffer size.
1695 	 */
1696 	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1697 
1698 	return xfs_readdir(NULL, ip, ctx, bufsize);
1699 }
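/*
 * Note: the bufsize computed above is capped at the directory's on-disk
 * size, so small directories never request more readahead than they can use,
 * while large directories fall back to XFS_READDIR_BUFSIZE as the stand-in
 * for the glibc getdents buffer size mentioned in the comment.
 */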
1700 
1701 STATIC loff_t
1702 xfs_file_llseek(
1703 	struct file	*file,
1704 	loff_t		offset,
1705 	int		whence)
1706 {
1707 	struct inode		*inode = file->f_mapping->host;
1708 
1709 	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1710 		return -EIO;
1711 
1712 	switch (whence) {
1713 	default:
1714 		return generic_file_llseek(file, offset, whence);
1715 	case SEEK_HOLE:
1716 		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1717 		break;
1718 	case SEEK_DATA:
1719 		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1720 		break;
1721 	}
1722 
1723 	if (offset < 0)
1724 		return offset;
1725 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1726 }
1727 
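/*
 * Illustrative sketch (editor's addition): how userspace exercises the
 * SEEK_HOLE/SEEK_DATA cases above to walk the data segments of a sparse file,
 * as documented in lseek(2).  Hypothetical file path.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/mnt/xfs/sparsefile", O_RDONLY);
 *		off_t data = 0, hole;
 *
 *		if (fd < 0)
 *			return 1;
 *		// SEEK_DATA fails with ENXIO once past the last data extent.
 *		while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
 *			hole = lseek(fd, data, SEEK_HOLE);
 *			printf("data: [%lld, %lld)\n",
 *			       (long long)data, (long long)hole);
 *			data = hole;
 *		}
 *		close(fd);
 *		return 0;
 *	}
 */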
1728 static inline vm_fault_t
1729 xfs_dax_fault_locked(
1730 	struct vm_fault		*vmf,
1731 	unsigned int		order,
1732 	bool			write_fault)
1733 {
1734 	vm_fault_t		ret;
1735 	unsigned long		pfn;
1736 
1737 	if (!IS_ENABLED(CONFIG_FS_DAX)) {
1738 		ASSERT(0);
1739 		return VM_FAULT_SIGBUS;
1740 	}
1741 	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
1742 			(write_fault && !vmf->cow_page) ?
1743 				&xfs_dax_write_iomap_ops :
1744 				&xfs_read_iomap_ops);
1745 	if (ret & VM_FAULT_NEEDDSYNC)
1746 		ret = dax_finish_sync_fault(vmf, order, pfn);
1747 	return ret;
1748 }
1749 
1750 static vm_fault_t
1751 xfs_dax_read_fault(
1752 	struct vm_fault		*vmf,
1753 	unsigned int		order)
1754 {
1755 	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
1756 	vm_fault_t		ret;
1757 
1758 	trace_xfs_read_fault(ip, order);
1759 
1760 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1761 	ret = xfs_dax_fault_locked(vmf, order, false);
1762 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1763 
1764 	return ret;
1765 }
1766 
1767 /*
1768  * Locking for serialisation of IO during page faults. This results in a lock
1769  * ordering of:
1770  *
1771  * mmap_lock (MM)
1772  *   sb_start_pagefault(vfs, freeze)
1773  *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1774  *       page_lock (MM)
1775  *         i_lock (XFS - extent map serialisation)
1776  */
1777 static vm_fault_t
1778 __xfs_write_fault(
1779 	struct vm_fault		*vmf,
1780 	unsigned int		order,
1781 	struct xfs_zone_alloc_ctx *ac)
1782 {
1783 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1784 	struct xfs_inode	*ip = XFS_I(inode);
1785 	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
1786 	vm_fault_t		ret;
1787 
1788 	trace_xfs_write_fault(ip, order);
1789 
1790 	sb_start_pagefault(inode->i_sb);
1791 	file_update_time(vmf->vma->vm_file);
1792 
1793 	/*
1794 	 * Normally we only need the shared mmaplock, but if a reflink remap is
1795 	 * in progress we take the exclusive lock to wait for the remap to
1796 	 * finish before taking a write fault.
1797 	 */
1798 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1799 	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
1800 		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1801 		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1802 		lock_mode = XFS_MMAPLOCK_EXCL;
1803 	}
1804 
1805 	if (IS_DAX(inode))
1806 		ret = xfs_dax_fault_locked(vmf, order, true);
1807 	else
1808 		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
1809 				ac);
1810 	xfs_iunlock(ip, lock_mode);
1811 
1812 	sb_end_pagefault(inode->i_sb);
1813 	return ret;
1814 }
1815 
1816 static vm_fault_t
1817 xfs_write_fault_zoned(
1818 	struct vm_fault		*vmf,
1819 	unsigned int		order)
1820 {
1821 	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
1822 	unsigned int		len = folio_size(page_folio(vmf->page));
1823 	struct xfs_zone_alloc_ctx ac = { };
1824 	int			error;
1825 	vm_fault_t		ret;
1826 
1827 	/*
1828 	 * This could over-allocate as it doesn't check for truncation.
1829 	 *
1830 	 * But as the overallocation is limited to less than a folio and will be
1831 	 * released instantly, that's just fine.
1832 	 */
1833 	error = xfs_zoned_space_reserve(ip->i_mount,
1834 			XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
1835 	if (error < 0)
1836 		return vmf_fs_error(error);
1837 	ret = __xfs_write_fault(vmf, order, &ac);
1838 	xfs_zoned_space_unreserve(ip->i_mount, &ac);
1839 	return ret;
1840 }
1841 
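/*
 * Illustrative sketch (editor's addition): the reservation above converts the
 * folio size to filesystem blocks with a round-up, which is where the bounded
 * over-allocation mentioned in the comment comes from.  A minimal model of
 * that conversion, assuming a hypothetical block size log2 of 12 (4 KiB):
 *
 *	#include <stdio.h>
 *
 *	// Round a byte count up to whole filesystem blocks, in the spirit of
 *	// XFS_B_TO_FSB() with the mount's block size.
 *	static unsigned long long b_to_fsb(unsigned long long bytes,
 *					   unsigned int blocklog)
 *	{
 *		unsigned long long blksize = 1ULL << blocklog;
 *
 *		return (bytes + blksize - 1) >> blocklog;
 *	}
 *
 *	int main(void)
 *	{
 *		// A 64 KiB folio on a 4 KiB-block filesystem reserves 16
 *		// blocks; a racing truncate may make some of them unnecessary,
 *		// but never more than this single folio's worth.
 *		printf("%llu\n", b_to_fsb(65536, 12));	// prints 16
 *		return 0;
 *	}
 */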
1842 static vm_fault_t
1843 xfs_write_fault(
1844 	struct vm_fault		*vmf,
1845 	unsigned int		order)
1846 {
1847 	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
1848 		return xfs_write_fault_zoned(vmf, order);
1849 	return __xfs_write_fault(vmf, order, NULL);
1850 }
1851 
1852 static inline bool
1853 xfs_is_write_fault(
1854 	struct vm_fault		*vmf)
1855 {
1856 	return (vmf->flags & FAULT_FLAG_WRITE) &&
1857 	       (vmf->vma->vm_flags & VM_SHARED);
1858 }
1859 
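/*
 * Illustrative sketch (editor's addition): only a store into a MAP_SHARED
 * mapping satisfies the check above and is dispatched as a write fault by the
 * DAX fault handlers below; a store into a MAP_PRIVATE mapping is copied into
 * anonymous memory and is handled as a read of the file.  Hypothetical path;
 * assumes the file is at least one page long.
 *
 *	#include <fcntl.h>
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/mnt/xfs/testfile", O_RDWR);
 *		char *shr, *prv;
 *
 *		if (fd < 0)
 *			return 1;
 *		shr = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *			   MAP_SHARED, fd, 0);
 *		prv = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *			   MAP_PRIVATE, fd, 0);
 *		if (shr == MAP_FAILED || prv == MAP_FAILED)
 *			return 1;
 *		shr[0] = 'x';	// write fault into the file's blocks
 *		prv[0] = 'x';	// CoW fault, file treated as read-only source
 *		munmap(shr, 4096);
 *		munmap(prv, 4096);
 *		close(fd);
 *		return 0;
 *	}
 */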
1860 static vm_fault_t
1861 xfs_filemap_fault(
1862 	struct vm_fault		*vmf)
1863 {
1864 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1865 
1866 	/* DAX can shortcut the normal fault path on write faults! */
1867 	if (IS_DAX(inode)) {
1868 		if (xfs_is_write_fault(vmf))
1869 			return xfs_write_fault(vmf, 0);
1870 		return xfs_dax_read_fault(vmf, 0);
1871 	}
1872 
1873 	trace_xfs_read_fault(XFS_I(inode), 0);
1874 	return filemap_fault(vmf);
1875 }
1876 
1877 static vm_fault_t
1878 xfs_filemap_huge_fault(
1879 	struct vm_fault		*vmf,
1880 	unsigned int		order)
1881 {
1882 	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1883 		return VM_FAULT_FALLBACK;
1884 
1885 	/* DAX can shortcut the normal fault path on write faults! */
1886 	if (xfs_is_write_fault(vmf))
1887 		return xfs_write_fault(vmf, order);
1888 	return xfs_dax_read_fault(vmf, order);
1889 }
1890 
1891 static vm_fault_t
1892 xfs_filemap_page_mkwrite(
1893 	struct vm_fault		*vmf)
1894 {
1895 	return xfs_write_fault(vmf, 0);
1896 }
1897 
1898 /*
1899  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1900  * on write faults. In reality, it needs to serialise against truncate and
1901  * prepare memory for writing, so handle it as a standard write fault.
1902  */
1903 static vm_fault_t
1904 xfs_filemap_pfn_mkwrite(
1905 	struct vm_fault		*vmf)
1906 {
1907 	return xfs_write_fault(vmf, 0);
1908 }
1909 
1910 static const struct vm_operations_struct xfs_file_vm_ops = {
1911 	.fault		= xfs_filemap_fault,
1912 	.huge_fault	= xfs_filemap_huge_fault,
1913 	.map_pages	= filemap_map_pages,
1914 	.page_mkwrite	= xfs_filemap_page_mkwrite,
1915 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
1916 };
1917 
1918 STATIC int
1919 xfs_file_mmap_prepare(
1920 	struct vm_area_desc	*desc)
1921 {
1922 	struct file		*file = desc->file;
1923 	struct inode		*inode = file_inode(file);
1924 	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));
1925 
1926 	/*
1927 	 * We don't support synchronous mappings for non-DAX files, nor
1928 	 * for DAX files if the underlying dax_device is not synchronous.
1929 	 */
1930 	if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
1931 				      target->bt_daxdev))
1932 		return -EOPNOTSUPP;
1933 
1934 	file_accessed(file);
1935 	desc->vm_ops = &xfs_file_vm_ops;
1936 	if (IS_DAX(inode))
1937 		desc->vm_flags |= VM_HUGEPAGE;
1938 	return 0;
1939 }
1940 
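/*
 * Illustrative sketch (editor's addition): the daxdev_mapping_supported()
 * check above is what makes a MAP_SYNC mapping request fail with EOPNOTSUPP
 * unless the file is DAX-enabled on a synchronous dax device.  Userspace asks
 * for it like this (hypothetical path; MAP_SYNC requires MAP_SHARED_VALIDATE,
 * see mmap(2); older glibc may need <linux/mman.h> for the definitions):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/mnt/pmem/xfsfile", O_RDWR);
 *		void *p;
 *
 *		if (fd < 0)
 *			return 1;
 *		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
 *		if (p == MAP_FAILED)
 *			perror("mmap");	// EOPNOTSUPP without DAX support
 *		else
 *			munmap(p, 4096);
 *		close(fd);
 *		return 0;
 *	}
 */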
1941 const struct file_operations xfs_file_operations = {
1942 	.llseek		= xfs_file_llseek,
1943 	.read_iter	= xfs_file_read_iter,
1944 	.write_iter	= xfs_file_write_iter,
1945 	.splice_read	= xfs_file_splice_read,
1946 	.splice_write	= iter_file_splice_write,
1947 	.iopoll		= iocb_bio_iopoll,
1948 	.unlocked_ioctl	= xfs_file_ioctl,
1949 #ifdef CONFIG_COMPAT
1950 	.compat_ioctl	= xfs_file_compat_ioctl,
1951 #endif
1952 	.mmap_prepare	= xfs_file_mmap_prepare,
1953 	.open		= xfs_file_open,
1954 	.release	= xfs_file_release,
1955 	.fsync		= xfs_file_fsync,
1956 	.get_unmapped_area = thp_get_unmapped_area,
1957 	.fallocate	= xfs_file_fallocate,
1958 	.fadvise	= xfs_file_fadvise,
1959 	.remap_file_range = xfs_file_remap_range,
1960 	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
1961 			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
1962 			  FOP_DONTCACHE,
1963 };
1964 
1965 const struct file_operations xfs_dir_file_operations = {
1966 	.open		= xfs_dir_open,
1967 	.read		= generic_read_dir,
1968 	.iterate_shared	= xfs_file_readdir,
1969 	.llseek		= generic_file_llseek,
1970 	.unlocked_ioctl	= xfs_file_ioctl,
1971 #ifdef CONFIG_COMPAT
1972 	.compat_ioctl	= xfs_file_compat_ioctl,
1973 #endif
1974 	.fsync		= xfs_dir_fsync,
1975 };
1976