xref: /linux/fs/xfs/xfs_file.c (revision c148bc7535650fbfa95a1f571b9ffa2ab478ea33)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_bmap.h"
17 #include "xfs_bmap_util.h"
18 #include "xfs_dir2.h"
19 #include "xfs_dir2_priv.h"
20 #include "xfs_ioctl.h"
21 #include "xfs_trace.h"
22 #include "xfs_log.h"
23 #include "xfs_icache.h"
24 #include "xfs_pnfs.h"
25 #include "xfs_iomap.h"
26 #include "xfs_reflink.h"
27 #include "xfs_file.h"
28 #include "xfs_aops.h"
29 #include "xfs_zone_alloc.h"
30 
31 #include <linux/dax.h>
32 #include <linux/falloc.h>
33 #include <linux/backing-dev.h>
34 #include <linux/mman.h>
35 #include <linux/fadvise.h>
36 #include <linux/mount.h>
37 
38 static const struct vm_operations_struct xfs_file_vm_ops;
39 
40 /*
41  * Decide if the given file range is aligned to the size of the fundamental
42  * allocation unit for the file.
43  */
44 bool
45 xfs_is_falloc_aligned(
46 	struct xfs_inode	*ip,
47 	loff_t			pos,
48 	long long int		len)
49 {
50 	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);
51 
52 	if (!is_power_of_2(alloc_unit))
53 		return isaligned_64(pos, alloc_unit) &&
54 		       isaligned_64(len, alloc_unit);
55 
56 	return !((pos | len) & (alloc_unit - 1));
57 }
58 
59 /*
60  * Fsync operations on directories are much simpler than on regular files,
61  * as there is no file data to flush, and thus also no need for explicit
62  * cache flush operations, and there are no non-transaction metadata updates
63  * on directories either.
64  */
65 STATIC int
66 xfs_dir_fsync(
67 	struct file		*file,
68 	loff_t			start,
69 	loff_t			end,
70 	int			datasync)
71 {
72 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
73 
74 	trace_xfs_dir_fsync(ip);
75 	return xfs_log_force_inode(ip);
76 }
77 
78 static xfs_csn_t
79 xfs_fsync_seq(
80 	struct xfs_inode	*ip,
81 	bool			datasync)
82 {
83 	if (!xfs_ipincount(ip))
84 		return 0;
85 	if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
86 		return 0;
87 	return ip->i_itemp->ili_commit_seq;
88 }
89 
90 /*
91  * All metadata updates are logged, which means that we just have to flush the
92  * log up to the latest LSN that touched the inode.
93  *
94  * If we have concurrent fsync/fdatasync() calls, we need them to all block on
95  * the log force before we clear the ili_fsync_fields field. This ensures that
96  * we don't get a racing sync operation that does not wait for the metadata to
97  * hit the journal before returning.  If we race with clearing ili_fsync_fields,
98  * then all that will happen is the log force will do nothing as the lsn will
99  * already be on disk.  We can't race with setting ili_fsync_fields because that
100  * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
101  * shared until after the ili_fsync_fields is cleared.
102  */
103 static  int
104 xfs_fsync_flush_log(
105 	struct xfs_inode	*ip,
106 	bool			datasync,
107 	int			*log_flushed)
108 {
109 	int			error = 0;
110 	xfs_csn_t		seq;
111 
112 	xfs_ilock(ip, XFS_ILOCK_SHARED);
113 	seq = xfs_fsync_seq(ip, datasync);
114 	if (seq) {
115 		error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
116 					  log_flushed);
117 
118 		spin_lock(&ip->i_itemp->ili_lock);
119 		ip->i_itemp->ili_fsync_fields = 0;
120 		spin_unlock(&ip->i_itemp->ili_lock);
121 	}
122 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
123 	return error;
124 }
125 
126 STATIC int
127 xfs_file_fsync(
128 	struct file		*file,
129 	loff_t			start,
130 	loff_t			end,
131 	int			datasync)
132 {
133 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
134 	struct xfs_mount	*mp = ip->i_mount;
135 	int			error, err2;
136 	int			log_flushed = 0;
137 
138 	trace_xfs_file_fsync(ip);
139 
140 	error = file_write_and_wait_range(file, start, end);
141 	if (error)
142 		return error;
143 
144 	if (xfs_is_shutdown(mp))
145 		return -EIO;
146 
147 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
148 
149 	/*
150 	 * If we have an RT and/or log subvolume we need to make sure to flush
 151 	 * the write cache of the device used for file data first.  This is to
152 	 * ensure newly written file data make it to disk before logging the new
153 	 * inode size in case of an extending write.
154 	 */
155 	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
156 		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
157 	else if (mp->m_logdev_targp != mp->m_ddev_targp)
158 		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
159 
160 	/*
161 	 * Any inode that has dirty modifications in the log is pinned.  The
162 	 * racy check here for a pinned inode will not catch modifications
 163 	 * that happen concurrently with the fsync call, but fsync semantics
 164 	 * only require us to sync previously completed I/O.
165 	 */
166 	if (xfs_ipincount(ip)) {
167 		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
168 		if (err2 && !error)
169 			error = err2;
170 	}
171 
172 	/*
 173 	 * If we only have a single device, and the log force above was
 174 	 * a no-op, we might have to flush the data device cache here.
175 	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
176 	 * an already allocated file and thus do not have any metadata to
177 	 * commit.
178 	 */
179 	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
180 	    mp->m_logdev_targp == mp->m_ddev_targp) {
181 		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
182 		if (err2 && !error)
183 			error = err2;
184 	}
185 
186 	return error;
187 }
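/*
 * Editor's note: a minimal userspace sketch (not part of xfs_file.c) of the
 * caller-visible behaviour implemented above.  fdatasync() only has to
 * persist metadata needed to retrieve the data, which is why a timestamp-only
 * change lets xfs_fsync_seq() skip the log force, while fsync() always
 * flushes inode metadata as well.  Names are hypothetical.
 */
#include <unistd.h>

static int demo_persist(int fd, int data_only)
{
	/* data_only maps to the "datasync" argument seen by ->fsync */
	return data_only ? fdatasync(fd) : fsync(fd);
}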
188 
189 static int
190 xfs_ilock_iocb(
191 	struct kiocb		*iocb,
192 	unsigned int		lock_mode)
193 {
194 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
195 
196 	if (iocb->ki_flags & IOCB_NOWAIT) {
197 		if (!xfs_ilock_nowait(ip, lock_mode))
198 			return -EAGAIN;
199 	} else {
200 		xfs_ilock(ip, lock_mode);
201 	}
202 
203 	return 0;
204 }
205 
206 static int
207 xfs_ilock_iocb_for_write(
208 	struct kiocb		*iocb,
209 	unsigned int		*lock_mode)
210 {
211 	ssize_t			ret;
212 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
213 
214 	ret = xfs_ilock_iocb(iocb, *lock_mode);
215 	if (ret)
216 		return ret;
217 
218 	/*
219 	 * If a reflink remap is in progress we always need to take the iolock
220 	 * exclusively to wait for it to finish.
221 	 */
222 	if (*lock_mode == XFS_IOLOCK_SHARED &&
223 	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
224 		xfs_iunlock(ip, *lock_mode);
225 		*lock_mode = XFS_IOLOCK_EXCL;
226 		return xfs_ilock_iocb(iocb, *lock_mode);
227 	}
228 
229 	return 0;
230 }
231 
232 STATIC ssize_t
233 xfs_file_dio_read(
234 	struct kiocb		*iocb,
235 	struct iov_iter		*to)
236 {
237 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
238 	ssize_t			ret;
239 
240 	trace_xfs_file_direct_read(iocb, to);
241 
242 	if (!iov_iter_count(to))
243 		return 0; /* skip atime */
244 
245 	file_accessed(iocb->ki_filp);
246 
247 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
248 	if (ret)
249 		return ret;
250 	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
251 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
252 
253 	return ret;
254 }
255 
256 static noinline ssize_t
257 xfs_file_dax_read(
258 	struct kiocb		*iocb,
259 	struct iov_iter		*to)
260 {
261 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
262 	ssize_t			ret = 0;
263 
264 	trace_xfs_file_dax_read(iocb, to);
265 
266 	if (!iov_iter_count(to))
267 		return 0; /* skip atime */
268 
269 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
270 	if (ret)
271 		return ret;
272 	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
273 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
274 
275 	file_accessed(iocb->ki_filp);
276 	return ret;
277 }
278 
279 STATIC ssize_t
280 xfs_file_buffered_read(
281 	struct kiocb		*iocb,
282 	struct iov_iter		*to)
283 {
284 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
285 	ssize_t			ret;
286 
287 	trace_xfs_file_buffered_read(iocb, to);
288 
289 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
290 	if (ret)
291 		return ret;
292 	ret = generic_file_read_iter(iocb, to);
293 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
294 
295 	return ret;
296 }
297 
298 STATIC ssize_t
299 xfs_file_read_iter(
300 	struct kiocb		*iocb,
301 	struct iov_iter		*to)
302 {
303 	struct inode		*inode = file_inode(iocb->ki_filp);
304 	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
305 	ssize_t			ret = 0;
306 
307 	XFS_STATS_INC(mp, xs_read_calls);
308 
309 	if (xfs_is_shutdown(mp))
310 		return -EIO;
311 
312 	if (IS_DAX(inode))
313 		ret = xfs_file_dax_read(iocb, to);
314 	else if (iocb->ki_flags & IOCB_DIRECT)
315 		ret = xfs_file_dio_read(iocb, to);
316 	else
317 		ret = xfs_file_buffered_read(iocb, to);
318 
319 	if (ret > 0)
320 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
321 	return ret;
322 }
323 
324 STATIC ssize_t
325 xfs_file_splice_read(
326 	struct file		*in,
327 	loff_t			*ppos,
328 	struct pipe_inode_info	*pipe,
329 	size_t			len,
330 	unsigned int		flags)
331 {
332 	struct inode		*inode = file_inode(in);
333 	struct xfs_inode	*ip = XFS_I(inode);
334 	struct xfs_mount	*mp = ip->i_mount;
335 	ssize_t			ret = 0;
336 
337 	XFS_STATS_INC(mp, xs_read_calls);
338 
339 	if (xfs_is_shutdown(mp))
340 		return -EIO;
341 
342 	trace_xfs_file_splice_read(ip, *ppos, len);
343 
344 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
345 	ret = filemap_splice_read(in, ppos, pipe, len, flags);
346 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
347 	if (ret > 0)
348 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
349 	return ret;
350 }
351 
352 /*
353  * Take care of zeroing post-EOF blocks when they might exist.
354  *
 355  * Returns 0 if successful, a negative error on failure, or 1 if this
356  * function dropped the iolock and reacquired it exclusively and the caller
357  * needs to restart the write sanity checks.
358  */
359 static ssize_t
360 xfs_file_write_zero_eof(
361 	struct kiocb		*iocb,
362 	struct iov_iter		*from,
363 	unsigned int		*iolock,
364 	size_t			count,
365 	bool			*drained_dio,
366 	struct xfs_zone_alloc_ctx *ac)
367 {
368 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
369 	loff_t			isize;
370 	int			error;
371 
372 	/*
373 	 * We need to serialise against EOF updates that occur in IO completions
374 	 * here. We want to make sure that nobody is changing the size while
375 	 * we do this check until we have placed an IO barrier (i.e. hold
376 	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
377 	 * spinlock effectively forms a memory barrier once we have
378 	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
379 	 * hence be able to correctly determine if we need to run zeroing.
380 	 */
381 	spin_lock(&ip->i_flags_lock);
382 	isize = i_size_read(VFS_I(ip));
383 	if (iocb->ki_pos <= isize) {
384 		spin_unlock(&ip->i_flags_lock);
385 		return 0;
386 	}
387 	spin_unlock(&ip->i_flags_lock);
388 
389 	if (iocb->ki_flags & IOCB_NOWAIT)
390 		return -EAGAIN;
391 
392 	if (!*drained_dio) {
393 		/*
394 		 * If zeroing is needed and we are currently holding the iolock
395 		 * shared, we need to update it to exclusive which implies
396 		 * having to redo all checks before.
397 		 */
398 		if (*iolock == XFS_IOLOCK_SHARED) {
399 			xfs_iunlock(ip, *iolock);
400 			*iolock = XFS_IOLOCK_EXCL;
401 			xfs_ilock(ip, *iolock);
402 			iov_iter_reexpand(from, count);
403 		}
404 
405 		/*
406 		 * We now have an IO submission barrier in place, but AIO can do
407 		 * EOF updates during IO completion and hence we now need to
408 		 * wait for all of them to drain.  Non-AIO DIO will have drained
409 		 * before we are given the XFS_IOLOCK_EXCL, and so for most
410 		 * cases this wait is a no-op.
411 		 */
412 		inode_dio_wait(VFS_I(ip));
413 		*drained_dio = true;
414 		return 1;
415 	}
416 
417 	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
418 
419 	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
420 	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
421 	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
422 
423 	return error;
424 }
425 
426 /*
427  * Common pre-write limit and setup checks.
428  *
429  * Called with the iolock held either shared and exclusive according to
430  * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
431  * if called for a direct write beyond i_size.
432  */
433 STATIC ssize_t
434 xfs_file_write_checks(
435 	struct kiocb		*iocb,
436 	struct iov_iter		*from,
437 	unsigned int		*iolock,
438 	struct xfs_zone_alloc_ctx *ac)
439 {
440 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
441 	size_t			count = iov_iter_count(from);
442 	bool			drained_dio = false;
443 	ssize_t			error;
444 
445 restart:
446 	error = generic_write_checks(iocb, from);
447 	if (error <= 0)
448 		return error;
449 
450 	if (iocb->ki_flags & IOCB_NOWAIT) {
451 		error = break_layout(inode, false);
452 		if (error == -EWOULDBLOCK)
453 			error = -EAGAIN;
454 	} else {
455 		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
456 	}
457 
458 	if (error)
459 		return error;
460 
461 	/*
462 	 * For changing security info in file_remove_privs() we need i_rwsem
463 	 * exclusively.
464 	 */
465 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
466 		xfs_iunlock(XFS_I(inode), *iolock);
467 		*iolock = XFS_IOLOCK_EXCL;
468 		error = xfs_ilock_iocb(iocb, *iolock);
469 		if (error) {
470 			*iolock = 0;
471 			return error;
472 		}
473 		goto restart;
474 	}
475 
476 	/*
477 	 * If the offset is beyond the size of the file, we need to zero all
478 	 * blocks that fall between the existing EOF and the start of this
479 	 * write.
480 	 *
481 	 * We can do an unlocked check for i_size here safely as I/O completion
482 	 * can only extend EOF.  Truncate is locked out at this point, so the
483 	 * EOF can not move backwards, only forwards. Hence we only need to take
484 	 * the slow path when we are at or beyond the current EOF.
485 	 */
486 	if (iocb->ki_pos > i_size_read(inode)) {
487 		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
488 				&drained_dio, ac);
489 		if (error == 1)
490 			goto restart;
491 		if (error)
492 			return error;
493 	}
494 
495 	return kiocb_modified(iocb);
496 }
497 
498 static ssize_t
499 xfs_zoned_write_space_reserve(
500 	struct xfs_inode		*ip,
501 	struct kiocb			*iocb,
502 	struct iov_iter			*from,
503 	unsigned int			flags,
504 	struct xfs_zone_alloc_ctx	*ac)
505 {
506 	loff_t				count = iov_iter_count(from);
507 	int				error;
508 
509 	if (iocb->ki_flags & IOCB_NOWAIT)
510 		flags |= XFS_ZR_NOWAIT;
511 
512 	/*
513 	 * Check the rlimit and LFS boundary first so that we don't over-reserve
514 	 * by possibly a lot.
515 	 *
516 	 * The generic write path will redo this check later, and it might have
517 	 * changed by then.  If it got expanded we'll stick to our earlier
518 	 * smaller limit, and if it is decreased the new smaller limit will be
519 	 * used and our extra space reservation will be returned after finishing
520 	 * the write.
521 	 */
522 	error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
523 	if (error)
524 		return error;
525 
526 	/*
527 	 * Sloppily round up count to file system blocks.
528 	 *
529 	 * This will often reserve an extra block, but that avoids having to look
530 	 * at the start offset, which isn't stable for O_APPEND until taking the
531 	 * iolock.  Also we need to reserve a block each for zeroing the old
532 	 * EOF block and the new start block if they are unaligned.
533 	 *
534 	 * Any remaining block will be returned after the write.
535 	 */
536 	return xfs_zoned_space_reserve(ip,
537 			XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
538 }
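/*
 * Editor's note: illustrative arithmetic only (hypothetical numbers, not part
 * of xfs_file.c).  The "+ 1 + 2" above covers the sloppy rounding of an
 * unknown start offset plus zeroing of the old EOF block and an unaligned new
 * start block.  With a 4096-byte block size, a 10000-byte write reserves
 * 3 blocks for the data plus 1 + 2, i.e. 6 blocks in total; unused blocks are
 * handed back once the write finishes.
 */
static unsigned int demo_zoned_write_reservation(unsigned long long count_bytes,
						 unsigned int blocksize)
{
	/* round up to blocks, then add 1 (sloppy start) + 2 (EOF/start zeroing) */
	return (count_bytes + blocksize - 1) / blocksize + 1 + 2;
}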
539 
540 static int
541 xfs_dio_write_end_io(
542 	struct kiocb		*iocb,
543 	ssize_t			size,
544 	int			error,
545 	unsigned		flags)
546 {
547 	struct inode		*inode = file_inode(iocb->ki_filp);
548 	struct xfs_inode	*ip = XFS_I(inode);
549 	loff_t			offset = iocb->ki_pos;
550 	unsigned int		nofs_flag;
551 
552 	ASSERT(!xfs_is_zoned_inode(ip) ||
553 	       !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
554 
555 	trace_xfs_end_io_direct_write(ip, offset, size);
556 
557 	if (xfs_is_shutdown(ip->i_mount))
558 		return -EIO;
559 
560 	if (error)
561 		return error;
562 	if (!size)
563 		return 0;
564 
565 	/*
566 	 * Capture amount written on completion as we can't reliably account
567 	 * for it on submission.
568 	 */
569 	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
570 
571 	/*
572 	 * We can allocate memory here while doing writeback on behalf of
573 	 * memory reclaim.  To avoid memory allocation deadlocks set the
574 	 * task-wide nofs context for the following operations.
575 	 */
576 	nofs_flag = memalloc_nofs_save();
577 
578 	if (flags & IOMAP_DIO_COW) {
579 		error = xfs_reflink_end_cow(ip, offset, size);
580 		if (error)
581 			goto out;
582 	}
583 
584 	/*
585 	 * Unwritten conversion updates the in-core isize after extent
586 	 * conversion but before updating the on-disk size. Updating isize any
587 	 * earlier allows a racing dio read to find unwritten extents before
588 	 * they are converted.
589 	 */
590 	if (flags & IOMAP_DIO_UNWRITTEN) {
591 		error = xfs_iomap_write_unwritten(ip, offset, size, true);
592 		goto out;
593 	}
594 
595 	/*
596 	 * We need to update the in-core inode size here so that we don't end up
597 	 * with the on-disk inode size being outside the in-core inode size. We
598 	 * have no other method of updating EOF for AIO, so always do it here
599 	 * if necessary.
600 	 *
601 	 * We need to lock the test/set EOF update as we can be racing with
602 	 * other IO completions here to update the EOF. Failing to serialise
603 	 * here can result in EOF moving backwards and Bad Things Happen when
604 	 * that occurs.
605 	 *
606 	 * As IO completion only ever extends EOF, we can do an unlocked check
607 	 * here to avoid taking the spinlock. If we land within the current EOF,
608 	 * then we do not need to do an extending update at all, and we don't
609 	 * need to take the lock to check this. If we race with an update moving
610 	 * EOF, then we'll either still be beyond EOF and need to take the lock,
611 	 * or we'll be within EOF and we don't need to take it at all.
612 	 */
613 	if (offset + size <= i_size_read(inode))
614 		goto out;
615 
616 	spin_lock(&ip->i_flags_lock);
617 	if (offset + size > i_size_read(inode)) {
618 		i_size_write(inode, offset + size);
619 		spin_unlock(&ip->i_flags_lock);
620 		error = xfs_setfilesize(ip, offset, size);
621 	} else {
622 		spin_unlock(&ip->i_flags_lock);
623 	}
624 
625 out:
626 	memalloc_nofs_restore(nofs_flag);
627 	return error;
628 }
629 
630 static const struct iomap_dio_ops xfs_dio_write_ops = {
631 	.end_io		= xfs_dio_write_end_io,
632 };
633 
634 static void
635 xfs_dio_zoned_submit_io(
636 	const struct iomap_iter	*iter,
637 	struct bio		*bio,
638 	loff_t			file_offset)
639 {
640 	struct xfs_mount	*mp = XFS_I(iter->inode)->i_mount;
641 	struct xfs_zone_alloc_ctx *ac = iter->private;
642 	xfs_filblks_t		count_fsb;
643 	struct iomap_ioend	*ioend;
644 
645 	count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
646 	if (count_fsb > ac->reserved_blocks) {
647 		xfs_err(mp,
648 "allocation (%lld) larger than reservation (%lld).",
649 			count_fsb, ac->reserved_blocks);
650 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
651 		bio_io_error(bio);
652 		return;
653 	}
654 	ac->reserved_blocks -= count_fsb;
655 
656 	bio->bi_end_io = xfs_end_bio;
657 	ioend = iomap_init_ioend(iter->inode, bio, file_offset,
658 			IOMAP_IOEND_DIRECT);
659 	xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
660 }
661 
662 static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
663 	.bio_set	= &iomap_ioend_bioset,
664 	.submit_io	= xfs_dio_zoned_submit_io,
665 	.end_io		= xfs_dio_write_end_io,
666 };
667 
668 /*
669  * Handle block aligned direct I/O writes.
670  */
671 static noinline ssize_t
672 xfs_file_dio_write_aligned(
673 	struct xfs_inode	*ip,
674 	struct kiocb		*iocb,
675 	struct iov_iter		*from,
676 	const struct iomap_ops	*ops,
677 	const struct iomap_dio_ops *dops,
678 	struct xfs_zone_alloc_ctx *ac)
679 {
680 	unsigned int		iolock = XFS_IOLOCK_SHARED;
681 	ssize_t			ret;
682 
683 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
684 	if (ret)
685 		return ret;
686 	ret = xfs_file_write_checks(iocb, from, &iolock, ac);
687 	if (ret)
688 		goto out_unlock;
689 
690 	/*
691 	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
692 	 * the iolock back to shared if we had to take the exclusive lock in
693 	 * xfs_file_write_checks() for other reasons.
694 	 */
695 	if (iolock == XFS_IOLOCK_EXCL) {
696 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
697 		iolock = XFS_IOLOCK_SHARED;
698 	}
699 	trace_xfs_file_direct_write(iocb, from);
700 	ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
701 out_unlock:
702 	xfs_iunlock(ip, iolock);
703 	return ret;
704 }
705 
706 /*
707  * Handle block aligned direct I/O writes to zoned devices.
708  */
709 static noinline ssize_t
710 xfs_file_dio_write_zoned(
711 	struct xfs_inode	*ip,
712 	struct kiocb		*iocb,
713 	struct iov_iter		*from)
714 {
715 	struct xfs_zone_alloc_ctx ac = { };
716 	ssize_t			ret;
717 
718 	ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
719 	if (ret < 0)
720 		return ret;
721 	ret = xfs_file_dio_write_aligned(ip, iocb, from,
722 			&xfs_zoned_direct_write_iomap_ops,
723 			&xfs_dio_zoned_write_ops, &ac);
724 	xfs_zoned_space_unreserve(ip, &ac);
725 	return ret;
726 }
727 
728 /*
729  * Handle block unaligned direct I/O writes
730  *
731  * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
732  * them to be done in parallel with reads and other direct I/O writes.  However,
733  * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
734  * to do sub-block zeroing and that requires serialisation against other direct
735  * I/O to the same block.  In this case we need to serialise the submission of
736  * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
737  * In the case where sub-block zeroing is not required, we can do concurrent
738  * sub-block dios to the same block successfully.
739  *
740  * Optimistically submit the I/O using the shared lock first, but use the
741  * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
742  * if block allocation or partial block zeroing would be required.  In that case
743  * we try again with the exclusive lock.
744  */
745 static noinline ssize_t
746 xfs_file_dio_write_unaligned(
747 	struct xfs_inode	*ip,
748 	struct kiocb		*iocb,
749 	struct iov_iter		*from)
750 {
751 	size_t			isize = i_size_read(VFS_I(ip));
752 	size_t			count = iov_iter_count(from);
753 	unsigned int		iolock = XFS_IOLOCK_SHARED;
754 	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
755 	ssize_t			ret;
756 
757 	/*
758 	 * Extending writes need exclusivity because of the sub-block zeroing
759 	 * that the DIO code always does for partial tail blocks beyond EOF, so
760 	 * don't even bother trying the fast path in this case.
761 	 */
762 	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
763 		if (iocb->ki_flags & IOCB_NOWAIT)
764 			return -EAGAIN;
765 retry_exclusive:
766 		iolock = XFS_IOLOCK_EXCL;
767 		flags = IOMAP_DIO_FORCE_WAIT;
768 	}
769 
770 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
771 	if (ret)
772 		return ret;
773 
774 	/*
775 	 * We can't properly handle unaligned direct I/O to reflink files yet,
776 	 * as we can't unshare a partial block.
777 	 */
778 	if (xfs_is_cow_inode(ip)) {
779 		trace_xfs_reflink_bounce_dio_write(iocb, from);
780 		ret = -ENOTBLK;
781 		goto out_unlock;
782 	}
783 
784 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
785 	if (ret)
786 		goto out_unlock;
787 
788 	/*
789 	 * If we are doing exclusive unaligned I/O, this must be the only I/O
790 	 * in-flight.  Otherwise we risk data corruption due to unwritten extent
791 	 * conversions from the AIO end_io handler.  Wait for all other I/O to
792 	 * drain first.
793 	 */
794 	if (flags & IOMAP_DIO_FORCE_WAIT)
795 		inode_dio_wait(VFS_I(ip));
796 
797 	trace_xfs_file_direct_write(iocb, from);
798 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
799 			   &xfs_dio_write_ops, flags, NULL, 0);
800 
801 	/*
802 	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
803 	 * layer rejected it for mapping or locking reasons. If we are doing
804 	 * nonblocking user I/O, propagate the error.
805 	 */
806 	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
807 		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
808 		xfs_iunlock(ip, iolock);
809 		goto retry_exclusive;
810 	}
811 
812 out_unlock:
813 	if (iolock)
814 		xfs_iunlock(ip, iolock);
815 	return ret;
816 }
817 
818 static ssize_t
819 xfs_file_dio_write(
820 	struct kiocb		*iocb,
821 	struct iov_iter		*from)
822 {
823 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
824 	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
825 	size_t			count = iov_iter_count(from);
826 
827 	/* direct I/O must be aligned to device logical sector size */
828 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
829 		return -EINVAL;
830 
831 	/*
832 	 * For always COW inodes we also must check the alignment of each
833 	 * individual iovec segment, as they could end up with different
834 	 * I/Os due to the way bio_iov_iter_get_pages works, and we'd
835 	 * then overwrite an already written block.
836 	 */
837 	if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
838 	    (xfs_is_always_cow_inode(ip) &&
839 	     (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
840 		return xfs_file_dio_write_unaligned(ip, iocb, from);
841 	if (xfs_is_zoned_inode(ip))
842 		return xfs_file_dio_write_zoned(ip, iocb, from);
843 	return xfs_file_dio_write_aligned(ip, iocb, from,
844 			&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
845 }
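/*
 * Editor's note: a hedged userspace sketch (not part of xfs_file.c) of the
 * alignment rules enforced above.  Direct I/O must be aligned to the device
 * logical sector size or it fails with EINVAL; writes that are not aligned to
 * filesystem blocks fall back to the slower unaligned path.  The 4096-byte
 * alignment below is an assumption chosen to satisfy typical sector and block
 * sizes, and len is assumed to be a multiple of it; names are hypothetical.
 */
#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static ssize_t demo_dio_write(const char *path, const void *data, size_t len)
{
	void *buf;
	ssize_t ret = -1;
	int fd = open(path, O_WRONLY | O_CREAT | O_DIRECT, 0644);

	if (fd < 0)
		return -1;
	if (posix_memalign(&buf, 4096, len) == 0) {
		memcpy(buf, data, len);
		ret = pwrite(fd, buf, len, 0);	/* offset 0 is always aligned */
		free(buf);
	}
	close(fd);
	return ret;
}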
846 
847 static noinline ssize_t
848 xfs_file_dax_write(
849 	struct kiocb		*iocb,
850 	struct iov_iter		*from)
851 {
852 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
853 	struct xfs_inode	*ip = XFS_I(inode);
854 	unsigned int		iolock = XFS_IOLOCK_EXCL;
855 	ssize_t			ret, error = 0;
856 	loff_t			pos;
857 
858 	ret = xfs_ilock_iocb(iocb, iolock);
859 	if (ret)
860 		return ret;
861 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
862 	if (ret)
863 		goto out;
864 
865 	pos = iocb->ki_pos;
866 
867 	trace_xfs_file_dax_write(iocb, from);
868 	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
869 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
870 		i_size_write(inode, iocb->ki_pos);
871 		error = xfs_setfilesize(ip, pos, ret);
872 	}
873 out:
874 	if (iolock)
875 		xfs_iunlock(ip, iolock);
876 	if (error)
877 		return error;
878 
879 	if (ret > 0) {
880 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
881 
882 		/* Handle various SYNC-type writes */
883 		ret = generic_write_sync(iocb, ret);
884 	}
885 	return ret;
886 }
887 
888 STATIC ssize_t
889 xfs_file_buffered_write(
890 	struct kiocb		*iocb,
891 	struct iov_iter		*from)
892 {
893 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
894 	struct xfs_inode	*ip = XFS_I(inode);
895 	ssize_t			ret;
896 	bool			cleared_space = false;
897 	unsigned int		iolock;
898 
899 write_retry:
900 	iolock = XFS_IOLOCK_EXCL;
901 	ret = xfs_ilock_iocb(iocb, iolock);
902 	if (ret)
903 		return ret;
904 
905 	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
906 	if (ret)
907 		goto out;
908 
909 	trace_xfs_file_buffered_write(iocb, from);
910 	ret = iomap_file_buffered_write(iocb, from,
911 			&xfs_buffered_write_iomap_ops, NULL);
912 
913 	/*
914 	 * If we hit a space limit, try to free up some lingering preallocated
915 	 * space before returning an error. In the case of ENOSPC, first try to
916 	 * write back all dirty inodes to free up some of the excess reserved
917 	 * metadata space. This reduces the chances that the eofblocks scan
918 	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
919 	 * also behaves as a filter to prevent too many eofblocks scans from
920 	 * running at the same time.  Use a synchronous scan to increase the
921 	 * effectiveness of the scan.
922 	 */
923 	if (ret == -EDQUOT && !cleared_space) {
924 		xfs_iunlock(ip, iolock);
925 		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
926 		cleared_space = true;
927 		goto write_retry;
928 	} else if (ret == -ENOSPC && !cleared_space) {
929 		struct xfs_icwalk	icw = {0};
930 
931 		cleared_space = true;
932 		xfs_flush_inodes(ip->i_mount);
933 
934 		xfs_iunlock(ip, iolock);
935 		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
936 		xfs_blockgc_free_space(ip->i_mount, &icw);
937 		goto write_retry;
938 	}
939 
940 out:
941 	if (iolock)
942 		xfs_iunlock(ip, iolock);
943 
944 	if (ret > 0) {
945 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
946 		/* Handle various SYNC-type writes */
947 		ret = generic_write_sync(iocb, ret);
948 	}
949 	return ret;
950 }
951 
952 STATIC ssize_t
953 xfs_file_buffered_write_zoned(
954 	struct kiocb		*iocb,
955 	struct iov_iter		*from)
956 {
957 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
958 	struct xfs_mount	*mp = ip->i_mount;
959 	unsigned int		iolock = XFS_IOLOCK_EXCL;
960 	bool			cleared_space = false;
961 	struct xfs_zone_alloc_ctx ac = { };
962 	ssize_t			ret;
963 
964 	ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
965 	if (ret < 0)
966 		return ret;
967 
968 	ret = xfs_ilock_iocb(iocb, iolock);
969 	if (ret)
970 		goto out_unreserve;
971 
972 	ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
973 	if (ret)
974 		goto out_unlock;
975 
976 	/*
977 	 * Truncate the iter to the length that we were actually able to
978 	 * allocate blocks for.  This needs to happen after
979 	 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
980 	 * writes.
981 	 */
982 	iov_iter_truncate(from,
983 			XFS_FSB_TO_B(mp, ac.reserved_blocks) -
984 			(iocb->ki_pos & mp->m_blockmask));
985 	if (!iov_iter_count(from))
986 		goto out_unlock;
987 
988 retry:
989 	trace_xfs_file_buffered_write(iocb, from);
990 	ret = iomap_file_buffered_write(iocb, from,
991 			&xfs_buffered_write_iomap_ops, &ac);
992 	if (ret == -ENOSPC && !cleared_space) {
993 		/*
994 		 * Kick off writeback to convert delalloc space and release the
995 		 * usually too pessimistic indirect block reservations.
996 		 */
997 		xfs_flush_inodes(mp);
998 		cleared_space = true;
999 		goto retry;
1000 	}
1001 
1002 out_unlock:
1003 	xfs_iunlock(ip, iolock);
1004 out_unreserve:
1005 	xfs_zoned_space_unreserve(ip, &ac);
1006 	if (ret > 0) {
1007 		XFS_STATS_ADD(mp, xs_write_bytes, ret);
1008 		ret = generic_write_sync(iocb, ret);
1009 	}
1010 	return ret;
1011 }
1012 
1013 STATIC ssize_t
1014 xfs_file_write_iter(
1015 	struct kiocb		*iocb,
1016 	struct iov_iter		*from)
1017 {
1018 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
1019 	struct xfs_inode	*ip = XFS_I(inode);
1020 	ssize_t			ret;
1021 	size_t			ocount = iov_iter_count(from);
1022 
1023 	XFS_STATS_INC(ip->i_mount, xs_write_calls);
1024 
1025 	if (ocount == 0)
1026 		return 0;
1027 
1028 	if (xfs_is_shutdown(ip->i_mount))
1029 		return -EIO;
1030 
1031 	if (IS_DAX(inode))
1032 		return xfs_file_dax_write(iocb, from);
1033 
1034 	if (iocb->ki_flags & IOCB_ATOMIC) {
1035 		/*
1036 		 * Currently only atomic writing of a single FS block is
 1037 		 * supported.  It would be possible to support atomic writes smaller
 1038 		 * than a FS block, but there is no requirement to do so.
1039 		 * Note that iomap also does not support this yet.
1040 		 */
1041 		if (ocount != ip->i_mount->m_sb.sb_blocksize)
1042 			return -EINVAL;
1043 		ret = generic_atomic_write_valid(iocb, from);
1044 		if (ret)
1045 			return ret;
1046 	}
1047 
1048 	if (iocb->ki_flags & IOCB_DIRECT) {
1049 		/*
1050 		 * Allow a directio write to fall back to a buffered
1051 		 * write *only* in the case that we're doing a reflink
1052 		 * CoW.  In all other directio scenarios we do not
1053 		 * allow an operation to fall back to buffered mode.
1054 		 */
1055 		ret = xfs_file_dio_write(iocb, from);
1056 		if (ret != -ENOTBLK)
1057 			return ret;
1058 	}
1059 
1060 	if (xfs_is_zoned_inode(ip))
1061 		return xfs_file_buffered_write_zoned(iocb, from);
1062 	return xfs_file_buffered_write(iocb, from);
1063 }
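/*
 * Editor's note: a hedged userspace sketch (not part of xfs_file.c) of how
 * IOCB_ATOMIC is requested.  This assumes a kernel and libc headers new
 * enough to expose RWF_ATOMIC for pwritev2(); as the check above requires,
 * the write must currently cover exactly one filesystem block.  Names are
 * hypothetical.
 */
#define _GNU_SOURCE
#include <sys/uio.h>

static ssize_t demo_atomic_block_write(int fd, void *block, size_t blocksize,
				       off_t offset)
{
	struct iovec iov = { .iov_base = block, .iov_len = blocksize };

	/* offset must be block aligned and the length equal to one fs block */
	return pwritev2(fd, &iov, 1, offset, RWF_ATOMIC);
}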
1064 
1065 /* Does this file, inode, or mount want synchronous writes? */
1066 static inline bool xfs_file_sync_writes(struct file *filp)
1067 {
1068 	struct xfs_inode	*ip = XFS_I(file_inode(filp));
1069 
1070 	if (xfs_has_wsync(ip->i_mount))
1071 		return true;
1072 	if (filp->f_flags & (__O_SYNC | O_DSYNC))
1073 		return true;
1074 	if (IS_SYNC(file_inode(filp)))
1075 		return true;
1076 
1077 	return false;
1078 }
1079 
1080 static int
1081 xfs_falloc_newsize(
1082 	struct file		*file,
1083 	int			mode,
1084 	loff_t			offset,
1085 	loff_t			len,
1086 	loff_t			*new_size)
1087 {
1088 	struct inode		*inode = file_inode(file);
1089 
1090 	if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
1091 		return 0;
1092 	*new_size = offset + len;
1093 	return inode_newsize_ok(inode, *new_size);
1094 }
1095 
1096 static int
1097 xfs_falloc_setsize(
1098 	struct file		*file,
1099 	loff_t			new_size)
1100 {
1101 	struct iattr iattr = {
1102 		.ia_valid	= ATTR_SIZE,
1103 		.ia_size	= new_size,
1104 	};
1105 
1106 	if (!new_size)
1107 		return 0;
1108 	return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
1109 			&iattr);
1110 }
1111 
1112 static int
1113 xfs_falloc_collapse_range(
1114 	struct file		*file,
1115 	loff_t			offset,
1116 	loff_t			len,
1117 	struct xfs_zone_alloc_ctx *ac)
1118 {
1119 	struct inode		*inode = file_inode(file);
1120 	loff_t			new_size = i_size_read(inode) - len;
1121 	int			error;
1122 
1123 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1124 		return -EINVAL;
1125 
1126 	/*
 1127 	 * A collapse range must not overlap EOF; in that case it would
 1128 	 * effectively be a truncate operation.
1129 	 */
1130 	if (offset + len >= i_size_read(inode))
1131 		return -EINVAL;
1132 
1133 	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
1134 	if (error)
1135 		return error;
1136 	return xfs_falloc_setsize(file, new_size);
1137 }
1138 
1139 static int
1140 xfs_falloc_insert_range(
1141 	struct file		*file,
1142 	loff_t			offset,
1143 	loff_t			len)
1144 {
1145 	struct inode		*inode = file_inode(file);
1146 	loff_t			isize = i_size_read(inode);
1147 	int			error;
1148 
1149 	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
1150 		return -EINVAL;
1151 
1152 	/*
1153 	 * New inode size must not exceed ->s_maxbytes, accounting for
1154 	 * possible signed overflow.
1155 	 */
1156 	if (inode->i_sb->s_maxbytes - isize < len)
1157 		return -EFBIG;
1158 
1159 	/* Offset should be less than i_size */
1160 	if (offset >= isize)
1161 		return -EINVAL;
1162 
1163 	error = xfs_falloc_setsize(file, isize + len);
1164 	if (error)
1165 		return error;
1166 
1167 	/*
1168 	 * Perform hole insertion now that the file size has been updated so
1169 	 * that if we crash during the operation we don't leave shifted extents
 1170 	 * past EOF and hence lose access to the data that is contained within
1171 	 * them.
1172 	 */
1173 	return xfs_insert_file_space(XFS_I(inode), offset, len);
1174 }
1175 
1176 /*
1177  * Punch a hole and prealloc the range.  We use a hole punch rather than
1178  * unwritten extent conversion for two reasons:
1179  *
1180  *   1.) Hole punch handles partial block zeroing for us.
1181  *   2.) If prealloc returns ENOSPC, the file range is still zero-valued by
1182  *	 virtue of the hole punch.
1183  */
1184 static int
1185 xfs_falloc_zero_range(
1186 	struct file		*file,
1187 	int			mode,
1188 	loff_t			offset,
1189 	loff_t			len,
1190 	struct xfs_zone_alloc_ctx *ac)
1191 {
1192 	struct inode		*inode = file_inode(file);
1193 	unsigned int		blksize = i_blocksize(inode);
1194 	loff_t			new_size = 0;
1195 	int			error;
1196 
1197 	trace_xfs_zero_file_space(XFS_I(inode));
1198 
1199 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1200 	if (error)
1201 		return error;
1202 
1203 	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
1204 	if (error)
1205 		return error;
1206 
1207 	len = round_up(offset + len, blksize) - round_down(offset, blksize);
1208 	offset = round_down(offset, blksize);
1209 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1210 	if (error)
1211 		return error;
1212 	return xfs_falloc_setsize(file, new_size);
1213 }
1214 
1215 static int
1216 xfs_falloc_unshare_range(
1217 	struct file		*file,
1218 	int			mode,
1219 	loff_t			offset,
1220 	loff_t			len)
1221 {
1222 	struct inode		*inode = file_inode(file);
1223 	loff_t			new_size = 0;
1224 	int			error;
1225 
1226 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1227 	if (error)
1228 		return error;
1229 
1230 	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
1231 	if (error)
1232 		return error;
1233 
1234 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1235 	if (error)
1236 		return error;
1237 	return xfs_falloc_setsize(file, new_size);
1238 }
1239 
1240 static int
1241 xfs_falloc_allocate_range(
1242 	struct file		*file,
1243 	int			mode,
1244 	loff_t			offset,
1245 	loff_t			len)
1246 {
1247 	struct inode		*inode = file_inode(file);
1248 	loff_t			new_size = 0;
1249 	int			error;
1250 
1251 	/*
1252 	 * If always_cow mode we can't use preallocations and thus should not
1253 	 * create them.
1254 	 */
1255 	if (xfs_is_always_cow_inode(XFS_I(inode)))
1256 		return -EOPNOTSUPP;
1257 
1258 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
1259 	if (error)
1260 		return error;
1261 
1262 	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1263 	if (error)
1264 		return error;
1265 	return xfs_falloc_setsize(file, new_size);
1266 }
1267 
1268 #define	XFS_FALLOC_FL_SUPPORTED						\
1269 		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
1270 		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
1271 		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
1272 
1273 STATIC long
1274 __xfs_file_fallocate(
1275 	struct file		*file,
1276 	int			mode,
1277 	loff_t			offset,
1278 	loff_t			len,
1279 	struct xfs_zone_alloc_ctx *ac)
1280 {
1281 	struct inode		*inode = file_inode(file);
1282 	struct xfs_inode	*ip = XFS_I(inode);
1283 	long			error;
1284 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1285 
1286 	xfs_ilock(ip, iolock);
1287 	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
1288 	if (error)
1289 		goto out_unlock;
1290 
1291 	/*
1292 	 * Must wait for all AIO to complete before we continue as AIO can
1293 	 * change the file size on completion without holding any locks we
1294 	 * currently hold. We must do this first because AIO can update both
1295 	 * the on disk and in memory inode sizes, and the operations that follow
1296 	 * require the in-memory size to be fully up-to-date.
1297 	 */
1298 	inode_dio_wait(inode);
1299 
1300 	error = file_modified(file);
1301 	if (error)
1302 		goto out_unlock;
1303 
1304 	switch (mode & FALLOC_FL_MODE_MASK) {
1305 	case FALLOC_FL_PUNCH_HOLE:
1306 		error = xfs_free_file_space(ip, offset, len, ac);
1307 		break;
1308 	case FALLOC_FL_COLLAPSE_RANGE:
1309 		error = xfs_falloc_collapse_range(file, offset, len, ac);
1310 		break;
1311 	case FALLOC_FL_INSERT_RANGE:
1312 		error = xfs_falloc_insert_range(file, offset, len);
1313 		break;
1314 	case FALLOC_FL_ZERO_RANGE:
1315 		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
1316 		break;
1317 	case FALLOC_FL_UNSHARE_RANGE:
1318 		error = xfs_falloc_unshare_range(file, mode, offset, len);
1319 		break;
1320 	case FALLOC_FL_ALLOCATE_RANGE:
1321 		error = xfs_falloc_allocate_range(file, mode, offset, len);
1322 		break;
1323 	default:
1324 		error = -EOPNOTSUPP;
1325 		break;
1326 	}
1327 
1328 	if (!error && xfs_file_sync_writes(file))
1329 		error = xfs_log_force_inode(ip);
1330 
1331 out_unlock:
1332 	xfs_iunlock(ip, iolock);
1333 	return error;
1334 }
1335 
1336 static long
1337 xfs_file_zoned_fallocate(
1338 	struct file		*file,
1339 	int			mode,
1340 	loff_t			offset,
1341 	loff_t			len)
1342 {
1343 	struct xfs_zone_alloc_ctx ac = { };
1344 	struct xfs_inode	*ip = XFS_I(file_inode(file));
1345 	int			error;
1346 
1347 	error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
1348 	if (error)
1349 		return error;
1350 	error = __xfs_file_fallocate(file, mode, offset, len, &ac);
1351 	xfs_zoned_space_unreserve(ip, &ac);
1352 	return error;
1353 }
1354 
1355 static long
1356 xfs_file_fallocate(
1357 	struct file		*file,
1358 	int			mode,
1359 	loff_t			offset,
1360 	loff_t			len)
1361 {
1362 	struct inode		*inode = file_inode(file);
1363 
1364 	if (!S_ISREG(inode->i_mode))
1365 		return -EINVAL;
1366 	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
1367 		return -EOPNOTSUPP;
1368 
1369 	/*
1370 	 * For zoned file systems, zeroing the first and last block of a hole
1371 	 * punch requires allocating a new block to rewrite the remaining data
 1372 	 * and new zeroes out of place.  Get a reservation for those before
1373 	 * taking the iolock.  Dip into the reserved pool because we are
1374 	 * expected to be able to punch a hole even on a completely full
1375 	 * file system.
1376 	 */
1377 	if (xfs_is_zoned_inode(XFS_I(inode)) &&
1378 	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
1379 		     FALLOC_FL_COLLAPSE_RANGE)))
1380 		return xfs_file_zoned_fallocate(file, mode, offset, len);
1381 	return __xfs_file_fallocate(file, mode, offset, len, NULL);
1382 }
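/*
 * Editor's note: a userspace sketch (not part of xfs_file.c) of the request
 * path into the handlers above.  FALLOC_FL_PUNCH_HOLE must be combined with
 * FALLOC_FL_KEEP_SIZE, and on a zoned file system this call takes the
 * xfs_file_zoned_fallocate() branch.  Names are hypothetical.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

static int demo_punch_hole(int fd, off_t offset, off_t len)
{
	/* dispatched to the FALLOC_FL_PUNCH_HOLE case in __xfs_file_fallocate() */
	return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			 offset, len);
}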
1383 
1384 STATIC int
1385 xfs_file_fadvise(
1386 	struct file	*file,
1387 	loff_t		start,
1388 	loff_t		end,
1389 	int		advice)
1390 {
1391 	struct xfs_inode *ip = XFS_I(file_inode(file));
1392 	int ret;
1393 	int lockflags = 0;
1394 
1395 	/*
1396 	 * Operations creating pages in page cache need protection from hole
1397 	 * punching and similar ops
1398 	 */
1399 	if (advice == POSIX_FADV_WILLNEED) {
1400 		lockflags = XFS_IOLOCK_SHARED;
1401 		xfs_ilock(ip, lockflags);
1402 	}
1403 	ret = generic_fadvise(file, start, end, advice);
1404 	if (lockflags)
1405 		xfs_iunlock(ip, lockflags);
1406 	return ret;
1407 }
1408 
1409 STATIC loff_t
1410 xfs_file_remap_range(
1411 	struct file		*file_in,
1412 	loff_t			pos_in,
1413 	struct file		*file_out,
1414 	loff_t			pos_out,
1415 	loff_t			len,
1416 	unsigned int		remap_flags)
1417 {
1418 	struct inode		*inode_in = file_inode(file_in);
1419 	struct xfs_inode	*src = XFS_I(inode_in);
1420 	struct inode		*inode_out = file_inode(file_out);
1421 	struct xfs_inode	*dest = XFS_I(inode_out);
1422 	struct xfs_mount	*mp = src->i_mount;
1423 	loff_t			remapped = 0;
1424 	xfs_extlen_t		cowextsize;
1425 	int			ret;
1426 
1427 	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1428 		return -EINVAL;
1429 
1430 	if (!xfs_has_reflink(mp))
1431 		return -EOPNOTSUPP;
1432 
1433 	if (xfs_is_shutdown(mp))
1434 		return -EIO;
1435 
1436 	/* Prepare and then clone file data. */
1437 	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1438 			&len, remap_flags);
1439 	if (ret || len == 0)
1440 		return ret;
1441 
1442 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1443 
1444 	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1445 			&remapped);
1446 	if (ret)
1447 		goto out_unlock;
1448 
1449 	/*
1450 	 * Carry the cowextsize hint from src to dest if we're sharing the
1451 	 * entire source file to the entire destination file, the source file
1452 	 * has a cowextsize hint, and the destination file does not.
1453 	 */
1454 	cowextsize = 0;
1455 	if (pos_in == 0 && len == i_size_read(inode_in) &&
1456 	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1457 	    pos_out == 0 && len >= i_size_read(inode_out) &&
1458 	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1459 		cowextsize = src->i_cowextsize;
1460 
1461 	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1462 			remap_flags);
1463 	if (ret)
1464 		goto out_unlock;
1465 
1466 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1467 		xfs_log_force_inode(dest);
1468 out_unlock:
1469 	xfs_iunlock2_remapping(src, dest);
1470 	if (ret)
1471 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1472 	/*
1473 	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
1474 	 * handle partial results -- either the whole remap succeeds, or we
1475 	 * must say why it did not.  In this case, any error should be returned
1476 	 * to the caller.
1477 	 */
1478 	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
1479 		return ret;
1480 	return remapped > 0 ? remapped : ret;
1481 }
1482 
1483 STATIC int
1484 xfs_file_open(
1485 	struct inode	*inode,
1486 	struct file	*file)
1487 {
1488 	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1489 		return -EIO;
1490 	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
1491 	if (xfs_inode_can_atomicwrite(XFS_I(inode)))
1492 		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
1493 	return generic_file_open(inode, file);
1494 }
1495 
1496 STATIC int
1497 xfs_dir_open(
1498 	struct inode	*inode,
1499 	struct file	*file)
1500 {
1501 	struct xfs_inode *ip = XFS_I(inode);
1502 	unsigned int	mode;
1503 	int		error;
1504 
1505 	if (xfs_is_shutdown(ip->i_mount))
1506 		return -EIO;
1507 	error = generic_file_open(inode, file);
1508 	if (error)
1509 		return error;
1510 
1511 	/*
1512 	 * If there are any blocks, read-ahead block 0 as we're almost
1513 	 * certain to have the next operation be a read there.
1514 	 */
1515 	mode = xfs_ilock_data_map_shared(ip);
1516 	if (ip->i_df.if_nextents > 0)
1517 		error = xfs_dir3_data_readahead(ip, 0, 0);
1518 	xfs_iunlock(ip, mode);
1519 	return error;
1520 }
1521 
1522 /*
1523  * Don't bother propagating errors.  We're just doing cleanup, and the caller
1524  * ignores the return value anyway.
1525  */
1526 STATIC int
1527 xfs_file_release(
1528 	struct inode		*inode,
1529 	struct file		*file)
1530 {
1531 	struct xfs_inode	*ip = XFS_I(inode);
1532 	struct xfs_mount	*mp = ip->i_mount;
1533 
1534 	/*
1535 	 * If this is a read-only mount or the file system has been shut down,
1536 	 * don't generate I/O.
1537 	 */
1538 	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
1539 		return 0;
1540 
1541 	/*
1542 	 * If we previously truncated this file and removed old data in the
1543 	 * process, we want to initiate "early" writeout on the last close.
1544 	 * This is an attempt to combat the notorious NULL files problem which
1545 	 * is particularly noticeable from a truncate down, buffered (re-)write
1546 	 * (delalloc), followed by a crash.  What we are effectively doing here
1547 	 * is significantly reducing the time window where we'd otherwise be
1548 	 * exposed to that problem.
1549 	 */
1550 	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
1551 		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
1552 		if (ip->i_delayed_blks > 0)
1553 			filemap_flush(inode->i_mapping);
1554 	}
1555 
1556 	/*
1557 	 * XFS aggressively preallocates post-EOF space to generate contiguous
1558 	 * allocations for writers that append to the end of the file.
1559 	 *
1560 	 * To support workloads that close and reopen the file frequently, these
1561 	 * preallocations usually persist after a close unless it is the first
1562 	 * close for the inode.  This is a tradeoff to generate tightly packed
1563 	 * data layouts for unpacking tarballs or similar archives that write
1564 	 * one file after another without going back to it while keeping the
1565 	 * preallocation for files that have recurring open/write/close cycles.
1566 	 *
1567 	 * This heuristic is skipped for inodes with the append-only flag as
1568 	 * that flag is rather pointless for inodes written only once.
1569 	 *
1570 	 * There is no point in freeing blocks here for open but unlinked files
1571 	 * as they will be taken care of by the inactivation path soon.
1572 	 *
1573 	 * When releasing a read-only context, don't flush data or trim post-EOF
1574 	 * blocks.  This avoids open/read/close workloads from removing EOF
1575 	 * blocks that other writers depend upon to reduce fragmentation.
1576 	 *
1577 	 * Inodes on the zoned RT device never have preallocations, so skip
1578 	 * taking the locks below.
1579 	 */
1580 	if (!inode->i_nlink ||
1581 	    !(file->f_mode & FMODE_WRITE) ||
1582 	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
1583 	    xfs_is_zoned_inode(ip))
1584 		return 0;
1585 
1586 	/*
1587 	 * If we can't get the iolock just skip truncating the blocks past EOF
1588 	 * because we could deadlock with the mmap_lock otherwise. We'll get
1589 	 * another chance to drop them once the last reference to the inode is
1590 	 * dropped, so we'll never leak blocks permanently.
1591 	 */
1592 	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
1593 	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1594 		if (xfs_can_free_eofblocks(ip) &&
1595 		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
1596 			xfs_free_eofblocks(ip);
1597 		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1598 	}
1599 
1600 	return 0;
1601 }
1602 
1603 STATIC int
1604 xfs_file_readdir(
1605 	struct file	*file,
1606 	struct dir_context *ctx)
1607 {
1608 	struct inode	*inode = file_inode(file);
1609 	xfs_inode_t	*ip = XFS_I(inode);
1610 	size_t		bufsize;
1611 
1612 	/*
 1613 	 * The Linux API doesn't pass the total size of the buffer we read
 1614 	 * into down to the filesystem.  With the filldir concept it's not
 1615 	 * needed for correct information, but the XFS dir2 leaf code wants
 1616 	 * an estimate of the buffer size to calculate its
1617 	 * readahead window and size the buffers used for mapping to
1618 	 * physical blocks.
1619 	 *
 1620 	 * Try to give it an estimate that's good enough; maybe at some
1621 	 * point we can change the ->readdir prototype to include the
1622 	 * buffer size.  For now we use the current glibc buffer size.
1623 	 */
1624 	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1625 
1626 	return xfs_readdir(NULL, ip, ctx, bufsize);
1627 }
1628 
1629 STATIC loff_t
1630 xfs_file_llseek(
1631 	struct file	*file,
1632 	loff_t		offset,
1633 	int		whence)
1634 {
1635 	struct inode		*inode = file->f_mapping->host;
1636 
1637 	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1638 		return -EIO;
1639 
1640 	switch (whence) {
1641 	default:
1642 		return generic_file_llseek(file, offset, whence);
1643 	case SEEK_HOLE:
1644 		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1645 		break;
1646 	case SEEK_DATA:
1647 		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1648 		break;
1649 	}
1650 
1651 	if (offset < 0)
1652 		return offset;
1653 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1654 }
1655 
1656 static inline vm_fault_t
1657 xfs_dax_fault_locked(
1658 	struct vm_fault		*vmf,
1659 	unsigned int		order,
1660 	bool			write_fault)
1661 {
1662 	vm_fault_t		ret;
1663 	pfn_t			pfn;
1664 
1665 	if (!IS_ENABLED(CONFIG_FS_DAX)) {
1666 		ASSERT(0);
1667 		return VM_FAULT_SIGBUS;
1668 	}
1669 	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
1670 			(write_fault && !vmf->cow_page) ?
1671 				&xfs_dax_write_iomap_ops :
1672 				&xfs_read_iomap_ops);
1673 	if (ret & VM_FAULT_NEEDDSYNC)
1674 		ret = dax_finish_sync_fault(vmf, order, pfn);
1675 	return ret;
1676 }
1677 
1678 static vm_fault_t
1679 xfs_dax_read_fault(
1680 	struct vm_fault		*vmf,
1681 	unsigned int		order)
1682 {
1683 	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
1684 	vm_fault_t		ret;
1685 
1686 	trace_xfs_read_fault(ip, order);
1687 
1688 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1689 	ret = xfs_dax_fault_locked(vmf, order, false);
1690 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1691 
1692 	return ret;
1693 }
1694 
1695 /*
1696  * Locking for serialisation of IO during page faults. This results in a lock
1697  * ordering of:
1698  *
1699  * mmap_lock (MM)
1700  *   sb_start_pagefault(vfs, freeze)
1701  *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1702  *       page_lock (MM)
1703  *         i_lock (XFS - extent map serialisation)
1704  */
1705 static vm_fault_t
1706 __xfs_write_fault(
1707 	struct vm_fault		*vmf,
1708 	unsigned int		order,
1709 	struct xfs_zone_alloc_ctx *ac)
1710 {
1711 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1712 	struct xfs_inode	*ip = XFS_I(inode);
1713 	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
1714 	vm_fault_t		ret;
1715 
1716 	trace_xfs_write_fault(ip, order);
1717 
1718 	sb_start_pagefault(inode->i_sb);
1719 	file_update_time(vmf->vma->vm_file);
1720 
1721 	/*
1722 	 * Normally we only need the shared mmaplock, but if a reflink remap is
1723 	 * in progress we take the exclusive lock to wait for the remap to
1724 	 * finish before taking a write fault.
1725 	 */
1726 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1727 	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
1728 		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1729 		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1730 		lock_mode = XFS_MMAPLOCK_EXCL;
1731 	}
1732 
1733 	if (IS_DAX(inode))
1734 		ret = xfs_dax_fault_locked(vmf, order, true);
1735 	else
1736 		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
1737 				ac);
1738 	xfs_iunlock(ip, lock_mode);
1739 
1740 	sb_end_pagefault(inode->i_sb);
1741 	return ret;
1742 }
1743 
1744 static vm_fault_t
1745 xfs_write_fault_zoned(
1746 	struct vm_fault		*vmf,
1747 	unsigned int		order)
1748 {
1749 	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
1750 	unsigned int		len = folio_size(page_folio(vmf->page));
1751 	struct xfs_zone_alloc_ctx ac = { };
1752 	int			error;
1753 	vm_fault_t		ret;
1754 
1755 	/*
1756 	 * This could over-allocate as it doesn't check for truncation.
1757 	 *
 1758 	 * But as the over-allocation is limited to less than a folio and will be
 1759 	 * released instantly, that's just fine.
1760 	 */
1761 	error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
1762 			&ac);
1763 	if (error < 0)
1764 		return vmf_fs_error(error);
1765 	ret = __xfs_write_fault(vmf, order, &ac);
1766 	xfs_zoned_space_unreserve(ip, &ac);
1767 	return ret;
1768 }
1769 
1770 static vm_fault_t
1771 xfs_write_fault(
1772 	struct vm_fault		*vmf,
1773 	unsigned int		order)
1774 {
1775 	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
1776 		return xfs_write_fault_zoned(vmf, order);
1777 	return __xfs_write_fault(vmf, order, NULL);
1778 }
1779 
1780 static inline bool
1781 xfs_is_write_fault(
1782 	struct vm_fault		*vmf)
1783 {
1784 	return (vmf->flags & FAULT_FLAG_WRITE) &&
1785 	       (vmf->vma->vm_flags & VM_SHARED);
1786 }
1787 
1788 static vm_fault_t
1789 xfs_filemap_fault(
1790 	struct vm_fault		*vmf)
1791 {
1792 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1793 
1794 	/* DAX can shortcut the normal fault path on write faults! */
1795 	if (IS_DAX(inode)) {
1796 		if (xfs_is_write_fault(vmf))
1797 			return xfs_write_fault(vmf, 0);
1798 		return xfs_dax_read_fault(vmf, 0);
1799 	}
1800 
1801 	trace_xfs_read_fault(XFS_I(inode), 0);
1802 	return filemap_fault(vmf);
1803 }
1804 
1805 static vm_fault_t
1806 xfs_filemap_huge_fault(
1807 	struct vm_fault		*vmf,
1808 	unsigned int		order)
1809 {
1810 	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1811 		return VM_FAULT_FALLBACK;
1812 
1813 	/* DAX can shortcut the normal fault path on write faults! */
1814 	if (xfs_is_write_fault(vmf))
1815 		return xfs_write_fault(vmf, order);
1816 	return xfs_dax_read_fault(vmf, order);
1817 }
1818 
1819 static vm_fault_t
1820 xfs_filemap_page_mkwrite(
1821 	struct vm_fault		*vmf)
1822 {
1823 	return xfs_write_fault(vmf, 0);
1824 }
1825 
1826 /*
1827  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1828  * on write faults. In reality, it needs to serialise against truncate and
1829  * prepare memory for writing so handle is as standard write fault.
 1830  * prepare memory for writing, so handle it as a standard write fault.
1831 static vm_fault_t
1832 xfs_filemap_pfn_mkwrite(
1833 	struct vm_fault		*vmf)
1834 {
1835 	return xfs_write_fault(vmf, 0);
1836 }
1837 
1838 static const struct vm_operations_struct xfs_file_vm_ops = {
1839 	.fault		= xfs_filemap_fault,
1840 	.huge_fault	= xfs_filemap_huge_fault,
1841 	.map_pages	= filemap_map_pages,
1842 	.page_mkwrite	= xfs_filemap_page_mkwrite,
1843 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
1844 };
1845 
1846 STATIC int
1847 xfs_file_mmap(
1848 	struct file		*file,
1849 	struct vm_area_struct	*vma)
1850 {
1851 	struct inode		*inode = file_inode(file);
1852 	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));
1853 
1854 	/*
1855 	 * We don't support synchronous mappings for non-DAX files and
 1856 	 * for DAX files if the underlying dax_device is not synchronous.
1857 	 */
1858 	if (!daxdev_mapping_supported(vma, target->bt_daxdev))
1859 		return -EOPNOTSUPP;
1860 
1861 	file_accessed(file);
1862 	vma->vm_ops = &xfs_file_vm_ops;
1863 	if (IS_DAX(inode))
1864 		vm_flags_set(vma, VM_HUGEPAGE);
1865 	return 0;
1866 }
1867 
1868 const struct file_operations xfs_file_operations = {
1869 	.llseek		= xfs_file_llseek,
1870 	.read_iter	= xfs_file_read_iter,
1871 	.write_iter	= xfs_file_write_iter,
1872 	.splice_read	= xfs_file_splice_read,
1873 	.splice_write	= iter_file_splice_write,
1874 	.iopoll		= iocb_bio_iopoll,
1875 	.unlocked_ioctl	= xfs_file_ioctl,
1876 #ifdef CONFIG_COMPAT
1877 	.compat_ioctl	= xfs_file_compat_ioctl,
1878 #endif
1879 	.mmap		= xfs_file_mmap,
1880 	.open		= xfs_file_open,
1881 	.release	= xfs_file_release,
1882 	.fsync		= xfs_file_fsync,
1883 	.get_unmapped_area = thp_get_unmapped_area,
1884 	.fallocate	= xfs_file_fallocate,
1885 	.fadvise	= xfs_file_fadvise,
1886 	.remap_file_range = xfs_file_remap_range,
1887 	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
1888 			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
1889 			  FOP_DONTCACHE,
1890 };
1891 
1892 const struct file_operations xfs_dir_file_operations = {
1893 	.open		= xfs_dir_open,
1894 	.read		= generic_read_dir,
1895 	.iterate_shared	= xfs_file_readdir,
1896 	.llseek		= generic_file_llseek,
1897 	.unlocked_ioctl	= xfs_file_ioctl,
1898 #ifdef CONFIG_COMPAT
1899 	.compat_ioctl	= xfs_file_compat_ioctl,
1900 #endif
1901 	.fsync		= xfs_dir_fsync,
1902 };
1903