xref: /linux/fs/xfs/xfs_file.c (revision 160b8e75932fd51a49607d32dbfa1d417977b79c)
1 /*
2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_shared.h"
21 #include "xfs_format.h"
22 #include "xfs_log_format.h"
23 #include "xfs_trans_resv.h"
24 #include "xfs_mount.h"
25 #include "xfs_da_format.h"
26 #include "xfs_da_btree.h"
27 #include "xfs_inode.h"
28 #include "xfs_trans.h"
29 #include "xfs_inode_item.h"
30 #include "xfs_bmap.h"
31 #include "xfs_bmap_util.h"
32 #include "xfs_error.h"
33 #include "xfs_dir2.h"
34 #include "xfs_dir2_priv.h"
35 #include "xfs_ioctl.h"
36 #include "xfs_trace.h"
37 #include "xfs_log.h"
38 #include "xfs_icache.h"
39 #include "xfs_pnfs.h"
40 #include "xfs_iomap.h"
41 #include "xfs_reflink.h"
42 
43 #include <linux/dcache.h>
44 #include <linux/falloc.h>
45 #include <linux/pagevec.h>
46 #include <linux/backing-dev.h>
47 #include <linux/mman.h>
48 
49 static const struct vm_operations_struct xfs_file_vm_ops;
50 
51 /*
52  * Clear the specified ranges to zero through either the pagecache or DAX.
53  * Holes and unwritten extents will be left as-is as they already are zeroed.
54  */
55 int
56 xfs_zero_range(
57 	struct xfs_inode	*ip,
58 	xfs_off_t		pos,
59 	xfs_off_t		count,
60 	bool			*did_zero)
61 {
62 	return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops);
63 }
64 
65 int
66 xfs_update_prealloc_flags(
67 	struct xfs_inode	*ip,
68 	enum xfs_prealloc_flags	flags)
69 {
70 	struct xfs_trans	*tp;
71 	int			error;
72 
73 	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
74 			0, 0, 0, &tp);
75 	if (error)
76 		return error;
77 
78 	xfs_ilock(ip, XFS_ILOCK_EXCL);
79 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
80 
81 	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
82 		VFS_I(ip)->i_mode &= ~S_ISUID;
83 		if (VFS_I(ip)->i_mode & S_IXGRP)
84 			VFS_I(ip)->i_mode &= ~S_ISGID;
85 		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
86 	}
87 
88 	if (flags & XFS_PREALLOC_SET)
89 		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
90 	if (flags & XFS_PREALLOC_CLEAR)
91 		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
92 
93 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
94 	if (flags & XFS_PREALLOC_SYNC)
95 		xfs_trans_set_sync(tp);
96 	return xfs_trans_commit(tp);
97 }
98 
99 /*
100  * Fsync operations on directories are much simpler than on regular files,
101  * as there is no file data to flush, and thus also no need for explicit
102  * cache flush operations, and there are no non-transaction metadata updates
103  * on directories either.
104  */
105 STATIC int
106 xfs_dir_fsync(
107 	struct file		*file,
108 	loff_t			start,
109 	loff_t			end,
110 	int			datasync)
111 {
112 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
113 	struct xfs_mount	*mp = ip->i_mount;
114 	xfs_lsn_t		lsn = 0;
115 
116 	trace_xfs_dir_fsync(ip);
117 
118 	xfs_ilock(ip, XFS_ILOCK_SHARED);
119 	if (xfs_ipincount(ip))
120 		lsn = ip->i_itemp->ili_last_lsn;
121 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
122 
123 	if (!lsn)
124 		return 0;
125 	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
126 }
127 
/*
 * fsync/fdatasync for regular files.
 *
 * Writes back and waits on the pagecache range [@start, @end], flushes the
 * data device write cache where required, and forces the log up to the last
 * LSN that touched the inode.  For @datasync the log is only forced when
 * fields other than the timestamps are dirty.
 *
 * Returns 0 on success, -EIO if the filesystem has been shut down, or the
 * error from writeback or from the log force.
 */
STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	int			error = 0;
	int			log_flushed = 0;
	xfs_lsn_t		lsn = 0;

	trace_xfs_file_fsync(ip);

	error = file_write_and_wait_range(file, start, end);
	if (error)
		return error;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache the device used for file data first.  This is to
	 * ensure newly written file data make it to disk before logging the new
	 * inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip))
		xfs_blkdev_issue_flush(mp->m_rtdev_targp);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		xfs_blkdev_issue_flush(mp->m_ddev_targp);

	/*
	 * All metadata updates are logged, which means that we just have to
	 * flush the log up to the latest LSN that touched the inode. If we have
	 * concurrent fsync/fdatasync() calls, we need them to all block on the
	 * log force before we clear the ili_fsync_fields field. This ensures
	 * that we don't get a racing sync operation that does not wait for the
	 * metadata to hit the journal before returning. If we race with
	 * clearing the ili_fsync_fields, then all that will happen is the log
	 * force will do nothing as the lsn will already be on disk. We can't
	 * race with setting ili_fsync_fields because that is done under
	 * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
	 * until after the ili_fsync_fields is cleared.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip)) {
		/* fdatasync skips the force when only timestamps are dirty. */
		if (!datasync ||
		    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
			lsn = ip->i_itemp->ili_last_lsn;
	}

	if (lsn) {
		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
		ip->i_itemp->ili_fsync_fields = 0;
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp)
		xfs_blkdev_issue_flush(mp->m_ddev_targp);

	return error;
}
203 
204 STATIC ssize_t
205 xfs_file_dio_aio_read(
206 	struct kiocb		*iocb,
207 	struct iov_iter		*to)
208 {
209 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
210 	size_t			count = iov_iter_count(to);
211 	ssize_t			ret;
212 
213 	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
214 
215 	if (!count)
216 		return 0; /* skip atime */
217 
218 	file_accessed(iocb->ki_filp);
219 
220 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
221 	ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
222 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
223 
224 	return ret;
225 }
226 
227 static noinline ssize_t
228 xfs_file_dax_read(
229 	struct kiocb		*iocb,
230 	struct iov_iter		*to)
231 {
232 	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
233 	size_t			count = iov_iter_count(to);
234 	ssize_t			ret = 0;
235 
236 	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
237 
238 	if (!count)
239 		return 0; /* skip atime */
240 
241 	if (iocb->ki_flags & IOCB_NOWAIT) {
242 		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
243 			return -EAGAIN;
244 	} else {
245 		xfs_ilock(ip, XFS_IOLOCK_SHARED);
246 	}
247 
248 	ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
249 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
250 
251 	file_accessed(iocb->ki_filp);
252 	return ret;
253 }
254 
255 STATIC ssize_t
256 xfs_file_buffered_aio_read(
257 	struct kiocb		*iocb,
258 	struct iov_iter		*to)
259 {
260 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
261 	ssize_t			ret;
262 
263 	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
264 
265 	if (iocb->ki_flags & IOCB_NOWAIT) {
266 		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
267 			return -EAGAIN;
268 	} else {
269 		xfs_ilock(ip, XFS_IOLOCK_SHARED);
270 	}
271 	ret = generic_file_read_iter(iocb, to);
272 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
273 
274 	return ret;
275 }
276 
277 STATIC ssize_t
278 xfs_file_read_iter(
279 	struct kiocb		*iocb,
280 	struct iov_iter		*to)
281 {
282 	struct inode		*inode = file_inode(iocb->ki_filp);
283 	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
284 	ssize_t			ret = 0;
285 
286 	XFS_STATS_INC(mp, xs_read_calls);
287 
288 	if (XFS_FORCED_SHUTDOWN(mp))
289 		return -EIO;
290 
291 	if (IS_DAX(inode))
292 		ret = xfs_file_dax_read(iocb, to);
293 	else if (iocb->ki_flags & IOCB_DIRECT)
294 		ret = xfs_file_dio_aio_read(iocb, to);
295 	else
296 		ret = xfs_file_buffered_aio_read(iocb, to);
297 
298 	if (ret > 0)
299 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
300 	return ret;
301 }
302 
303 /*
304  * Zero any on disk space between the current EOF and the new, larger EOF.
305  *
306  * This handles the normal case of zeroing the remainder of the last block in
307  * the file and the unusual case of zeroing blocks out beyond the size of the
308  * file.  This second case only happens with fixed size extents and when the
309  * system crashes before the inode size was updated but after blocks were
310  * allocated.
311  *
312  * Expects the iolock to be held exclusive, and will take the ilock internally.
313  */
314 int					/* error (positive) */
315 xfs_zero_eof(
316 	struct xfs_inode	*ip,
317 	xfs_off_t		offset,		/* starting I/O offset */
318 	xfs_fsize_t		isize,		/* current inode size */
319 	bool			*did_zeroing)
320 {
321 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
322 	ASSERT(offset > isize);
323 
324 	trace_xfs_zero_eof(ip, isize, offset - isize);
325 	return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
326 }
327 
/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size, or if security information
 * needs to be removed; *@iolock is updated to reflect the mode actually held.
 *
 * Returns 0 if the write may proceed (or if generic_write_checks() found
 * nothing to write), otherwise a negative error.
 */
STATIC ssize_t
xfs_file_aio_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	int			*iolock)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			error = 0;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;

	/* All checks are redone from here whenever the iolock was cycled. */
restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	error = xfs_break_layouts(inode, iolock);
	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(ip, *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		xfs_ilock(ip, *iolock);
		goto restart;
	}
	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write.  If zeroing is needed and we are currently holding the
	 * iolock shared, we need to update it to exclusive which implies
	 * having to redo all checks before.
	 *
	 * We need to serialise against EOF updates that occur in IO
	 * completions here. We want to make sure that nobody is changing the
	 * size while we do this check until we have placed an IO barrier (i.e.
	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
	 * The spinlock effectively forms a memory barrier once we have the
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
	 * and hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	if (iocb->ki_pos > i_size_read(inode)) {
		spin_unlock(&ip->i_flags_lock);
		if (!drained_dio) {
			if (*iolock == XFS_IOLOCK_SHARED) {
				xfs_iunlock(ip, *iolock);
				*iolock = XFS_IOLOCK_EXCL;
				xfs_ilock(ip, *iolock);
				/* Undo any truncation of the iter before rechecking. */
				iov_iter_reexpand(from, count);
			}
			/*
			 * We now have an IO submission barrier in place, but
			 * AIO can do EOF updates during IO completion and hence
			 * we now need to wait for all of them to drain. Non-AIO
			 * DIO will have drained before we are given the
			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
			 * no-op.
			 */
			inode_dio_wait(inode);
			drained_dio = true;
			goto restart;
		}
		/* Second pass: DIO has drained, so the EOF value is stable. */
		error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), NULL);
		if (error)
			return error;
	} else
		spin_unlock(&ip->i_flags_lock);

	/*
	 * Updating the timestamps will grab the ilock again from
	 * xfs_fs_dirty_inode, so we have to call it after dropping the
	 * lock above.  Eventually we should look into a way to avoid
	 * the pointless lock roundtrip.
	 */
	if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
		error = file_update_time(file);
		if (error)
			return error;
	}

	/*
	 * If we're writing the file then make sure to clear the setuid and
	 * setgid bits if the process is not being run by root.  This keeps
	 * people from modifying setuid and setgid binaries.
	 */
	if (!IS_NOSEC(inode))
		return file_remove_privs(file);
	return 0;
}
431 
432 static int
433 xfs_dio_write_end_io(
434 	struct kiocb		*iocb,
435 	ssize_t			size,
436 	unsigned		flags)
437 {
438 	struct inode		*inode = file_inode(iocb->ki_filp);
439 	struct xfs_inode	*ip = XFS_I(inode);
440 	loff_t			offset = iocb->ki_pos;
441 	int			error = 0;
442 
443 	trace_xfs_end_io_direct_write(ip, offset, size);
444 
445 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
446 		return -EIO;
447 
448 	if (size <= 0)
449 		return size;
450 
451 	if (flags & IOMAP_DIO_COW) {
452 		error = xfs_reflink_end_cow(ip, offset, size);
453 		if (error)
454 			return error;
455 	}
456 
457 	/*
458 	 * Unwritten conversion updates the in-core isize after extent
459 	 * conversion but before updating the on-disk size. Updating isize any
460 	 * earlier allows a racing dio read to find unwritten extents before
461 	 * they are converted.
462 	 */
463 	if (flags & IOMAP_DIO_UNWRITTEN)
464 		return xfs_iomap_write_unwritten(ip, offset, size, true);
465 
466 	/*
467 	 * We need to update the in-core inode size here so that we don't end up
468 	 * with the on-disk inode size being outside the in-core inode size. We
469 	 * have no other method of updating EOF for AIO, so always do it here
470 	 * if necessary.
471 	 *
472 	 * We need to lock the test/set EOF update as we can be racing with
473 	 * other IO completions here to update the EOF. Failing to serialise
474 	 * here can result in EOF moving backwards and Bad Things Happen when
475 	 * that occurs.
476 	 */
477 	spin_lock(&ip->i_flags_lock);
478 	if (offset + size > i_size_read(inode)) {
479 		i_size_write(inode, offset + size);
480 		spin_unlock(&ip->i_flags_lock);
481 		error = xfs_setfilesize(ip, offset, size);
482 	} else {
483 		spin_unlock(&ip->i_flags_lock);
484 	}
485 
486 	return error;
487 }
488 
489 /*
490  * xfs_file_dio_aio_write - handle direct IO writes
491  *
492  * Lock the inode appropriately to prepare for and issue a direct IO write.
493  * By separating it from the buffered write path we remove all the tricky to
494  * follow locking changes and looping.
495  *
496  * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
497  * until we're sure the bytes at the new EOF have been zeroed and/or the cached
498  * pages are flushed out.
499  *
500  * In most cases the direct IO writes will be done holding IOLOCK_SHARED
501  * allowing them to be done in parallel with reads and other direct IO writes.
502  * However, if the IO is not aligned to filesystem blocks, the direct IO layer
503  * needs to do sub-block zeroing and that requires serialisation against other
504  * direct IOs to the same block. In this case we need to serialise the
505  * submission of the unaligned IOs so that we don't get racing block zeroing in
506  * the dio layer.  To avoid the problem with aio, we also need to wait for
507  * outstanding IOs to complete so that unwritten extent conversion is completed
508  * before we try to map the overlapping block. This is currently implemented by
509  * hitting it with a big hammer (i.e. inode_dio_wait()).
510  *
511  * Returns with locks held indicated by @iolock and errors indicated by
512  * negative return values.
513  */
514 STATIC ssize_t
515 xfs_file_dio_aio_write(
516 	struct kiocb		*iocb,
517 	struct iov_iter		*from)
518 {
519 	struct file		*file = iocb->ki_filp;
520 	struct address_space	*mapping = file->f_mapping;
521 	struct inode		*inode = mapping->host;
522 	struct xfs_inode	*ip = XFS_I(inode);
523 	struct xfs_mount	*mp = ip->i_mount;
524 	ssize_t			ret = 0;
525 	int			unaligned_io = 0;
526 	int			iolock;
527 	size_t			count = iov_iter_count(from);
528 	struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
529 					mp->m_rtdev_targp : mp->m_ddev_targp;
530 
531 	/* DIO must be aligned to device logical sector size */
532 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
533 		return -EINVAL;
534 
535 	/*
536 	 * Don't take the exclusive iolock here unless the I/O is unaligned to
537 	 * the file system block size.  We don't need to consider the EOF
538 	 * extension case here because xfs_file_aio_write_checks() will relock
539 	 * the inode as necessary for EOF zeroing cases and fill out the new
540 	 * inode size as appropriate.
541 	 */
542 	if ((iocb->ki_pos & mp->m_blockmask) ||
543 	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
544 		unaligned_io = 1;
545 
546 		/*
547 		 * We can't properly handle unaligned direct I/O to reflink
548 		 * files yet, as we can't unshare a partial block.
549 		 */
550 		if (xfs_is_reflink_inode(ip)) {
551 			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
552 			return -EREMCHG;
553 		}
554 		iolock = XFS_IOLOCK_EXCL;
555 	} else {
556 		iolock = XFS_IOLOCK_SHARED;
557 	}
558 
559 	if (iocb->ki_flags & IOCB_NOWAIT) {
560 		if (!xfs_ilock_nowait(ip, iolock))
561 			return -EAGAIN;
562 	} else {
563 		xfs_ilock(ip, iolock);
564 	}
565 
566 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
567 	if (ret)
568 		goto out;
569 	count = iov_iter_count(from);
570 
571 	/*
572 	 * If we are doing unaligned IO, wait for all other IO to drain,
573 	 * otherwise demote the lock if we had to take the exclusive lock
574 	 * for other reasons in xfs_file_aio_write_checks.
575 	 */
576 	if (unaligned_io) {
577 		/* If we are going to wait for other DIO to finish, bail */
578 		if (iocb->ki_flags & IOCB_NOWAIT) {
579 			if (atomic_read(&inode->i_dio_count))
580 				return -EAGAIN;
581 		} else {
582 			inode_dio_wait(inode);
583 		}
584 	} else if (iolock == XFS_IOLOCK_EXCL) {
585 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
586 		iolock = XFS_IOLOCK_SHARED;
587 	}
588 
589 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
590 	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
591 out:
592 	xfs_iunlock(ip, iolock);
593 
594 	/*
595 	 * No fallback to buffered IO on errors for XFS, direct IO will either
596 	 * complete fully or fail.
597 	 */
598 	ASSERT(ret < 0 || ret == count);
599 	return ret;
600 }
601 
602 static noinline ssize_t
603 xfs_file_dax_write(
604 	struct kiocb		*iocb,
605 	struct iov_iter		*from)
606 {
607 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
608 	struct xfs_inode	*ip = XFS_I(inode);
609 	int			iolock = XFS_IOLOCK_EXCL;
610 	ssize_t			ret, error = 0;
611 	size_t			count;
612 	loff_t			pos;
613 
614 	if (iocb->ki_flags & IOCB_NOWAIT) {
615 		if (!xfs_ilock_nowait(ip, iolock))
616 			return -EAGAIN;
617 	} else {
618 		xfs_ilock(ip, iolock);
619 	}
620 
621 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
622 	if (ret)
623 		goto out;
624 
625 	pos = iocb->ki_pos;
626 	count = iov_iter_count(from);
627 
628 	trace_xfs_file_dax_write(ip, count, pos);
629 	ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
630 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
631 		i_size_write(inode, iocb->ki_pos);
632 		error = xfs_setfilesize(ip, pos, ret);
633 	}
634 out:
635 	xfs_iunlock(ip, iolock);
636 	return error ? error : ret;
637 }
638 
/*
 * Buffered (pagecache) write.
 *
 * Takes the iolock exclusive, runs the common write checks and writes through
 * iomap_file_buffered_write().  On -EDQUOT or -ENOSPC the write is retried
 * once after trying to free speculative preallocation / COW blocks.
 * IOCB_NOWAIT is not supported on this path and yields -EOPNOTSUPP.
 *
 * Returns the number of bytes written or a negative error.
 */
STATIC ssize_t
xfs_file_buffered_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	int			enospc = 0;	/* non-zero once a retry was attempted */
	int			iolock;

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EOPNOTSUPP;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	xfs_ilock(ip, iolock);

	/*
	 * NOTE(review): if this is a retry and the checks fail, the "out" path
	 * skips the reset of current->backing_dev_info that was set on the
	 * first pass -- confirm whether that matters.
	 */
	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);

	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
	if (likely(ret >= 0))
		iocb->ki_pos += ret;

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error. In the case of ENOSPC, first try to
	 * write back all dirty inodes to free up some of the excess reserved
	 * metadata space. This reduces the chances that the eofblocks scan
	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
	 * also behaves as a filter to prevent too many eofblocks scans from
	 * running at the same time.
	 */
	if (ret == -EDQUOT && !enospc) {
		xfs_iunlock(ip, iolock);
		enospc = xfs_inode_free_quota_eofblocks(ip);
		if (enospc)
			goto write_retry;
		enospc = xfs_inode_free_quota_cowblocks(ip);
		if (enospc)
			goto write_retry;
		/* Nothing was freed: the lock is already dropped, don't unlock again. */
		iolock = 0;
	} else if (ret == -ENOSPC && !enospc) {
		struct xfs_eofblocks eofb = {0};

		enospc = 1;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);
		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
		xfs_icache_free_cowblocks(ip->i_mount, &eofb);
		goto write_retry;
	}

	current->backing_dev_info = NULL;
out:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}
708 
709 STATIC ssize_t
710 xfs_file_write_iter(
711 	struct kiocb		*iocb,
712 	struct iov_iter		*from)
713 {
714 	struct file		*file = iocb->ki_filp;
715 	struct address_space	*mapping = file->f_mapping;
716 	struct inode		*inode = mapping->host;
717 	struct xfs_inode	*ip = XFS_I(inode);
718 	ssize_t			ret;
719 	size_t			ocount = iov_iter_count(from);
720 
721 	XFS_STATS_INC(ip->i_mount, xs_write_calls);
722 
723 	if (ocount == 0)
724 		return 0;
725 
726 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
727 		return -EIO;
728 
729 	if (IS_DAX(inode))
730 		ret = xfs_file_dax_write(iocb, from);
731 	else if (iocb->ki_flags & IOCB_DIRECT) {
732 		/*
733 		 * Allow a directio write to fall back to a buffered
734 		 * write *only* in the case that we're doing a reflink
735 		 * CoW.  In all other directio scenarios we do not
736 		 * allow an operation to fall back to buffered mode.
737 		 */
738 		ret = xfs_file_dio_aio_write(iocb, from);
739 		if (ret == -EREMCHG)
740 			goto buffered;
741 	} else {
742 buffered:
743 		ret = xfs_file_buffered_aio_write(iocb, from);
744 	}
745 
746 	if (ret > 0) {
747 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
748 
749 		/* Handle various SYNC-type writes */
750 		ret = generic_write_sync(iocb, ret);
751 	}
752 	return ret;
753 }
754 
/*
 * fallocate() mode bits accepted by xfs_file_fallocate(); a request with any
 * other bit set is rejected with -EOPNOTSUPP.
 */
#define	XFS_FALLOC_FL_SUPPORTED						\
		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
759 
760 STATIC long
761 xfs_file_fallocate(
762 	struct file		*file,
763 	int			mode,
764 	loff_t			offset,
765 	loff_t			len)
766 {
767 	struct inode		*inode = file_inode(file);
768 	struct xfs_inode	*ip = XFS_I(inode);
769 	long			error;
770 	enum xfs_prealloc_flags	flags = 0;
771 	uint			iolock = XFS_IOLOCK_EXCL;
772 	loff_t			new_size = 0;
773 	bool			do_file_insert = false;
774 
775 	if (!S_ISREG(inode->i_mode))
776 		return -EINVAL;
777 	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
778 		return -EOPNOTSUPP;
779 
780 	xfs_ilock(ip, iolock);
781 	error = xfs_break_layouts(inode, &iolock);
782 	if (error)
783 		goto out_unlock;
784 
785 	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
786 	iolock |= XFS_MMAPLOCK_EXCL;
787 
788 	if (mode & FALLOC_FL_PUNCH_HOLE) {
789 		error = xfs_free_file_space(ip, offset, len);
790 		if (error)
791 			goto out_unlock;
792 	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
793 		unsigned int blksize_mask = i_blocksize(inode) - 1;
794 
795 		if (offset & blksize_mask || len & blksize_mask) {
796 			error = -EINVAL;
797 			goto out_unlock;
798 		}
799 
800 		/*
801 		 * There is no need to overlap collapse range with EOF,
802 		 * in which case it is effectively a truncate operation
803 		 */
804 		if (offset + len >= i_size_read(inode)) {
805 			error = -EINVAL;
806 			goto out_unlock;
807 		}
808 
809 		new_size = i_size_read(inode) - len;
810 
811 		error = xfs_collapse_file_space(ip, offset, len);
812 		if (error)
813 			goto out_unlock;
814 	} else if (mode & FALLOC_FL_INSERT_RANGE) {
815 		unsigned int blksize_mask = i_blocksize(inode) - 1;
816 
817 		new_size = i_size_read(inode) + len;
818 		if (offset & blksize_mask || len & blksize_mask) {
819 			error = -EINVAL;
820 			goto out_unlock;
821 		}
822 
823 		/* check the new inode size does not wrap through zero */
824 		if (new_size > inode->i_sb->s_maxbytes) {
825 			error = -EFBIG;
826 			goto out_unlock;
827 		}
828 
829 		/* Offset should be less than i_size */
830 		if (offset >= i_size_read(inode)) {
831 			error = -EINVAL;
832 			goto out_unlock;
833 		}
834 		do_file_insert = true;
835 	} else {
836 		flags |= XFS_PREALLOC_SET;
837 
838 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
839 		    offset + len > i_size_read(inode)) {
840 			new_size = offset + len;
841 			error = inode_newsize_ok(inode, new_size);
842 			if (error)
843 				goto out_unlock;
844 		}
845 
846 		if (mode & FALLOC_FL_ZERO_RANGE)
847 			error = xfs_zero_file_space(ip, offset, len);
848 		else {
849 			if (mode & FALLOC_FL_UNSHARE_RANGE) {
850 				error = xfs_reflink_unshare(ip, offset, len);
851 				if (error)
852 					goto out_unlock;
853 			}
854 			error = xfs_alloc_file_space(ip, offset, len,
855 						     XFS_BMAPI_PREALLOC);
856 		}
857 		if (error)
858 			goto out_unlock;
859 	}
860 
861 	if (file->f_flags & O_DSYNC)
862 		flags |= XFS_PREALLOC_SYNC;
863 
864 	error = xfs_update_prealloc_flags(ip, flags);
865 	if (error)
866 		goto out_unlock;
867 
868 	/* Change file size if needed */
869 	if (new_size) {
870 		struct iattr iattr;
871 
872 		iattr.ia_valid = ATTR_SIZE;
873 		iattr.ia_size = new_size;
874 		error = xfs_vn_setattr_size(file_dentry(file), &iattr);
875 		if (error)
876 			goto out_unlock;
877 	}
878 
879 	/*
880 	 * Perform hole insertion now that the file size has been
881 	 * updated so that if we crash during the operation we don't
882 	 * leave shifted extents past EOF and hence losing access to
883 	 * the data that is contained within them.
884 	 */
885 	if (do_file_insert)
886 		error = xfs_insert_file_space(ip, offset, len);
887 
888 out_unlock:
889 	xfs_iunlock(ip, iolock);
890 	return error;
891 }
892 
893 STATIC int
894 xfs_file_clone_range(
895 	struct file	*file_in,
896 	loff_t		pos_in,
897 	struct file	*file_out,
898 	loff_t		pos_out,
899 	u64		len)
900 {
901 	return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
902 				     len, false);
903 }
904 
905 STATIC ssize_t
906 xfs_file_dedupe_range(
907 	struct file	*src_file,
908 	u64		loff,
909 	u64		len,
910 	struct file	*dst_file,
911 	u64		dst_loff)
912 {
913 	int		error;
914 
915 	error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
916 				     len, true);
917 	if (error)
918 		return error;
919 	return len;
920 }
921 
922 STATIC int
923 xfs_file_open(
924 	struct inode	*inode,
925 	struct file	*file)
926 {
927 	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
928 		return -EFBIG;
929 	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
930 		return -EIO;
931 	file->f_mode |= FMODE_NOWAIT;
932 	return 0;
933 }
934 
935 STATIC int
936 xfs_dir_open(
937 	struct inode	*inode,
938 	struct file	*file)
939 {
940 	struct xfs_inode *ip = XFS_I(inode);
941 	int		mode;
942 	int		error;
943 
944 	error = xfs_file_open(inode, file);
945 	if (error)
946 		return error;
947 
948 	/*
949 	 * If there are any blocks, read-ahead block 0 as we're almost
950 	 * certain to have the next operation be a read there.
951 	 */
952 	mode = xfs_ilock_data_map_shared(ip);
953 	if (ip->i_d.di_nextents > 0)
954 		error = xfs_dir3_data_readahead(ip, 0, -1);
955 	xfs_iunlock(ip, mode);
956 	return error;
957 }
958 
959 STATIC int
960 xfs_file_release(
961 	struct inode	*inode,
962 	struct file	*filp)
963 {
964 	return xfs_release(XFS_I(inode));
965 }
966 
967 STATIC int
968 xfs_file_readdir(
969 	struct file	*file,
970 	struct dir_context *ctx)
971 {
972 	struct inode	*inode = file_inode(file);
973 	xfs_inode_t	*ip = XFS_I(inode);
974 	size_t		bufsize;
975 
976 	/*
977 	 * The Linux API doesn't pass down the total size of the buffer
978 	 * we read into down to the filesystem.  With the filldir concept
979 	 * it's not needed for correct information, but the XFS dir2 leaf
980 	 * code wants an estimate of the buffer size to calculate it's
981 	 * readahead window and size the buffers used for mapping to
982 	 * physical blocks.
983 	 *
984 	 * Try to give it an estimate that's good enough, maybe at some
985 	 * point we can change the ->readdir prototype to include the
986 	 * buffer size.  For now we use the current glibc buffer size.
987 	 */
988 	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size);
989 
990 	return xfs_readdir(NULL, ip, ctx, bufsize);
991 }
992 
993 STATIC loff_t
994 xfs_file_llseek(
995 	struct file	*file,
996 	loff_t		offset,
997 	int		whence)
998 {
999 	struct inode		*inode = file->f_mapping->host;
1000 
1001 	if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
1002 		return -EIO;
1003 
1004 	switch (whence) {
1005 	default:
1006 		return generic_file_llseek(file, offset, whence);
1007 	case SEEK_HOLE:
1008 		offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
1009 		break;
1010 	case SEEK_DATA:
1011 		offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
1012 		break;
1013 	}
1014 
1015 	if (offset < 0)
1016 		return offset;
1017 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1018 }
1019 
1020 /*
1021  * Locking for serialisation of IO during page faults. This results in a lock
1022  * ordering of:
1023  *
1024  * mmap_sem (MM)
1025  *   sb_start_pagefault(vfs, freeze)
1026  *     i_mmaplock (XFS - truncate serialisation)
1027  *       page_lock (MM)
1028  *         i_lock (XFS - extent map serialisation)
1029  */
1030 static int
1031 __xfs_filemap_fault(
1032 	struct vm_fault		*vmf,
1033 	enum page_entry_size	pe_size,
1034 	bool			write_fault)
1035 {
1036 	struct inode		*inode = file_inode(vmf->vma->vm_file);
1037 	struct xfs_inode	*ip = XFS_I(inode);
1038 	int			ret;
1039 
1040 	trace_xfs_filemap_fault(ip, pe_size, write_fault);
1041 
1042 	if (write_fault) {
1043 		sb_start_pagefault(inode->i_sb);
1044 		file_update_time(vmf->vma->vm_file);
1045 	}
1046 
1047 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1048 	if (IS_DAX(inode)) {
1049 		pfn_t pfn;
1050 
1051 		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops);
1052 		if (ret & VM_FAULT_NEEDDSYNC)
1053 			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1054 	} else {
1055 		if (write_fault)
1056 			ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
1057 		else
1058 			ret = filemap_fault(vmf);
1059 	}
1060 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1061 
1062 	if (write_fault)
1063 		sb_end_pagefault(inode->i_sb);
1064 	return ret;
1065 }
1066 
1067 static int
1068 xfs_filemap_fault(
1069 	struct vm_fault		*vmf)
1070 {
1071 	/* DAX can shortcut the normal fault path on write faults! */
1072 	return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
1073 			IS_DAX(file_inode(vmf->vma->vm_file)) &&
1074 			(vmf->flags & FAULT_FLAG_WRITE));
1075 }
1076 
1077 static int
1078 xfs_filemap_huge_fault(
1079 	struct vm_fault		*vmf,
1080 	enum page_entry_size	pe_size)
1081 {
1082 	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1083 		return VM_FAULT_FALLBACK;
1084 
1085 	/* DAX can shortcut the normal fault path on write faults! */
1086 	return __xfs_filemap_fault(vmf, pe_size,
1087 			(vmf->flags & FAULT_FLAG_WRITE));
1088 }
1089 
1090 static int
1091 xfs_filemap_page_mkwrite(
1092 	struct vm_fault		*vmf)
1093 {
1094 	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1095 }
1096 
1097 /*
1098  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1099  * on write faults. In reality, it needs to serialise against truncate and
1100  * prepare memory for writing so handle is as standard write fault.
1101  */
1102 static int
1103 xfs_filemap_pfn_mkwrite(
1104 	struct vm_fault		*vmf)
1105 {
1106 
1107 	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1108 }
1109 
1110 static const struct vm_operations_struct xfs_file_vm_ops = {
1111 	.fault		= xfs_filemap_fault,
1112 	.huge_fault	= xfs_filemap_huge_fault,
1113 	.map_pages	= filemap_map_pages,
1114 	.page_mkwrite	= xfs_filemap_page_mkwrite,
1115 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
1116 };
1117 
1118 STATIC int
1119 xfs_file_mmap(
1120 	struct file	*filp,
1121 	struct vm_area_struct *vma)
1122 {
1123 	/*
1124 	 * We don't support synchronous mappings for non-DAX files. At least
1125 	 * until someone comes with a sensible use case.
1126 	 */
1127 	if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
1128 		return -EOPNOTSUPP;
1129 
1130 	file_accessed(filp);
1131 	vma->vm_ops = &xfs_file_vm_ops;
1132 	if (IS_DAX(file_inode(filp)))
1133 		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
1134 	return 0;
1135 }
1136 
1137 const struct file_operations xfs_file_operations = {
1138 	.llseek		= xfs_file_llseek,
1139 	.read_iter	= xfs_file_read_iter,
1140 	.write_iter	= xfs_file_write_iter,
1141 	.splice_read	= generic_file_splice_read,
1142 	.splice_write	= iter_file_splice_write,
1143 	.unlocked_ioctl	= xfs_file_ioctl,
1144 #ifdef CONFIG_COMPAT
1145 	.compat_ioctl	= xfs_file_compat_ioctl,
1146 #endif
1147 	.mmap		= xfs_file_mmap,
1148 	.mmap_supported_flags = MAP_SYNC,
1149 	.open		= xfs_file_open,
1150 	.release	= xfs_file_release,
1151 	.fsync		= xfs_file_fsync,
1152 	.get_unmapped_area = thp_get_unmapped_area,
1153 	.fallocate	= xfs_file_fallocate,
1154 	.clone_file_range = xfs_file_clone_range,
1155 	.dedupe_file_range = xfs_file_dedupe_range,
1156 };
1157 
1158 const struct file_operations xfs_dir_file_operations = {
1159 	.open		= xfs_dir_open,
1160 	.read		= generic_read_dir,
1161 	.iterate_shared	= xfs_file_readdir,
1162 	.llseek		= generic_file_llseek,
1163 	.unlocked_ioctl	= xfs_file_ioctl,
1164 #ifdef CONFIG_COMPAT
1165 	.compat_ioctl	= xfs_file_compat_ioctl,
1166 #endif
1167 	.fsync		= xfs_dir_fsync,
1168 };
1169