// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"

#include <linux/dax.h>
#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>
#include <linux/mount.h>

static const struct vm_operations_struct xfs_file_vm_ops;

/*
 * Decide if the given file range is aligned to the size of the fundamental
 * allocation unit for the file.
 */
static bool
xfs_is_falloc_aligned(
	struct xfs_inode	*ip,
	loff_t			pos,
	long long int		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	uint64_t		mask;

	if (XFS_IS_REALTIME_INODE(ip)) {
		if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
			u64	rextbytes;
			u32	mod;

			rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
			div_u64_rem(pos, rextbytes, &mod);
			if (mod)
				return false;
			div_u64_rem(len, rextbytes, &mod);
			return mod == 0;
		}
		mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
	} else {
		mask = mp->m_sb.sb_blocksize - 1;
	}

	return !((pos | len) & mask);
}

/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush, and thus also no need for explicit
 * cache flush operations, and there are no non-transaction metadata updates
 * on directories either.
 */
STATIC int
xfs_dir_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);

	trace_xfs_dir_fsync(ip);
	return xfs_log_force_inode(ip);
}
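
/*
 * Work out the committed sequence number that fsync needs to force the log
 * to.  Returns 0 if the inode is not pinned, or if this is a datasync and
 * only timestamps have changed, as there is nothing to force in those cases.
 */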
static xfs_csn_t
xfs_fsync_seq(
	struct xfs_inode	*ip,
	bool			datasync)
{
	if (!xfs_ipincount(ip))
		return 0;
	if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
		return 0;
	return ip->i_itemp->ili_commit_seq;
}

/*
 * All metadata updates are logged, which means that we just have to flush the
 * log up to the latest LSN that touched the inode.
 *
 * If we have concurrent fsync/fdatasync() calls, we need them to all block on
 * the log force before we clear the ili_fsync_fields field. This ensures that
 * we don't get a racing sync operation that does not wait for the metadata to
 * hit the journal before returning.  If we race with clearing
 * ili_fsync_fields, then all that will happen is the log force will do
 * nothing as the lsn will already be on disk.  We can't race with setting
 * ili_fsync_fields because that is done under XFS_ILOCK_EXCL, and that can't
 * happen because we hold the lock shared until after ili_fsync_fields is
 * cleared.
 */
static int
xfs_fsync_flush_log(
	struct xfs_inode	*ip,
	bool			datasync,
	int			*log_flushed)
{
	int			error = 0;
	xfs_csn_t		seq;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	seq = xfs_fsync_seq(ip, datasync);
	if (seq) {
		error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
					  log_flushed);

		spin_lock(&ip->i_itemp->ili_lock);
		ip->i_itemp->ili_fsync_fields = 0;
		spin_unlock(&ip->i_itemp->ili_lock);
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return error;
}

STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	int			error, err2;
	int			log_flushed = 0;

	trace_xfs_file_fsync(ip);

	error = file_write_and_wait_range(file, start, end);
	if (error)
		return error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache of the device used for file data first.  This is to
	 * ensure newly written file data make it to disk before logging the
	 * new inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip))
		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);

	/*
	 * Any inode that has dirty modifications in the log is pinned.  The
	 * racy check here for a pinned inode will not catch modifications
	 * that happen concurrently to the fsync call, but fsync semantics
	 * only require us to sync previously completed I/O.
	 */
	if (xfs_ipincount(ip)) {
		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
		if (err2 && !error)
			error = err2;
	}

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp) {
		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
		if (err2 && !error)
			error = err2;
	}

	return error;
}
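
/*
 * Lock the inode for an iocb-based operation, honouring IOCB_NOWAIT by
 * attempting a non-blocking lock and returning -EAGAIN if it cannot be
 * obtained immediately.
 */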
static int
xfs_ilock_iocb(
	struct kiocb		*iocb,
	unsigned int		lock_mode)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, lock_mode))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, lock_mode);
	}

	return 0;
}

static int
xfs_ilock_iocb_for_write(
	struct kiocb		*iocb,
	unsigned int		*lock_mode)
{
	ssize_t			ret;
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	ret = xfs_ilock_iocb(iocb, *lock_mode);
	if (ret)
		return ret;

	if (*lock_mode == XFS_IOLOCK_EXCL)
		return 0;
	if (!xfs_iflags_test(ip, XFS_IREMAPPING))
		return 0;

	xfs_iunlock(ip, *lock_mode);
	*lock_mode = XFS_IOLOCK_EXCL;
	return xfs_ilock_iocb(iocb, *lock_mode);
}

static unsigned int
xfs_ilock_for_write_fault(
	struct xfs_inode	*ip)
{
	/* get a shared lock if no remapping in progress */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	if (!xfs_iflags_test(ip, XFS_IREMAPPING))
		return XFS_MMAPLOCK_SHARED;

	/* wait for remapping to complete */
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	return XFS_MMAPLOCK_EXCL;
}

STATIC ssize_t
xfs_file_dio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_direct_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	file_accessed(iocb->ki_filp);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

static noinline ssize_t
xfs_file_dax_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	ssize_t			ret = 0;

	trace_xfs_file_dax_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	file_accessed(iocb->ki_filp);
	return ret;
}

STATIC ssize_t
xfs_file_buffered_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_buffered_read(iocb, to);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = generic_file_read_iter(iocb, to);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_read(iocb, to);
	else if (iocb->ki_flags & IOCB_DIRECT)
		ret = xfs_file_dio_read(iocb, to);
	else
		ret = xfs_file_buffered_read(iocb, to);

	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

STATIC ssize_t
xfs_file_splice_read(
	struct file		*in,
	loff_t			*ppos,
	struct pipe_inode_info	*pipe,
	size_t			len,
	unsigned int		flags)
{
	struct inode		*inode = file_inode(in);
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	trace_xfs_file_splice_read(ip, *ppos, len);

	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	ret = filemap_splice_read(in, ppos, pipe, len, flags);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			error = 0;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;
	loff_t			isize;

restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		error = break_layout(inode, false);
		if (error == -EWOULDBLOCK)
			error = -EAGAIN;
	} else {
		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
	}

	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(ip, *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		error = xfs_ilock_iocb(iocb, *iolock);
		if (error) {
			*iolock = 0;
			return error;
		}
		goto restart;
	}

	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write.  If zeroing is needed and we are currently holding the
	 * iolock shared, we need to update it to exclusive which implies
	 * having to redo all checks before.
	 *
	 * We need to serialise against EOF updates that occur in IO
	 * completions here. We want to make sure that nobody is changing the
	 * size while we do this check until we have placed an IO barrier
	 * (i.e. hold the XFS_IOLOCK_EXCL) that prevents new IO from being
	 * dispatched.  The spinlock effectively forms a memory barrier once
	 * we have the XFS_IOLOCK_EXCL so we are guaranteed to see the latest
	 * EOF value and hence be able to correctly determine if we need to
	 * run zeroing.
	 *
	 * We can do an unlocked check here safely as IO completion can only
	 * extend EOF. Truncate is locked out at this point, so the EOF can
	 * not move backwards, only forwards. Hence we only need to take the
	 * slow path and spin locks when we are at or beyond the current EOF.
	 */
	if (iocb->ki_pos <= i_size_read(inode))
		goto out;

	spin_lock(&ip->i_flags_lock);
	isize = i_size_read(inode);
	if (iocb->ki_pos > isize) {
		spin_unlock(&ip->i_flags_lock);

		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;

		if (!drained_dio) {
			if (*iolock == XFS_IOLOCK_SHARED) {
				xfs_iunlock(ip, *iolock);
				*iolock = XFS_IOLOCK_EXCL;
				xfs_ilock(ip, *iolock);
				iov_iter_reexpand(from, count);
			}
			/*
			 * We now have an IO submission barrier in place, but
			 * AIO can do EOF updates during IO completion and
			 * hence we now need to wait for all of them to drain.
			 * Non-AIO DIO will have drained before we are given
			 * the XFS_IOLOCK_EXCL, and so for most cases this
			 * wait is a no-op.
			 */
			inode_dio_wait(inode);
			drained_dio = true;
			goto restart;
		}

		trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
		error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
		if (error)
			return error;
	} else
		spin_unlock(&ip->i_flags_lock);

out:
	return kiocb_modified(iocb);
}
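
/*
 * Direct I/O write completion handler: finish any copy-on-write remapping,
 * convert unwritten extents that were written into, and update the on-disk
 * file size if this was an extending write.
 */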
static int
xfs_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,
	int			error,
	unsigned		flags)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			offset = iocb->ki_pos;
	unsigned int		nofs_flag;

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (error)
		return error;
	if (!size)
		return 0;

	/*
	 * Capture amount written on completion as we can't reliably account
	 * for it on submission.
	 */
	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim.  To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	if (flags & IOMAP_DIO_COW) {
		error = xfs_reflink_end_cow(ip, offset, size);
		if (error)
			goto out;
	}

	/*
	 * Unwritten conversion updates the in-core isize after extent
	 * conversion but before updating the on-disk size. Updating isize any
	 * earlier allows a racing dio read to find unwritten extents before
	 * they are converted.
	 */
	if (flags & IOMAP_DIO_UNWRITTEN) {
		error = xfs_iomap_write_unwritten(ip, offset, size, true);
		goto out;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end
	 * up with the on-disk inode size being outside the in-core inode
	 * size.  We have no other method of updating EOF for AIO, so always
	 * do it here if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 *
	 * As IO completion only ever extends EOF, we can do an unlocked check
	 * here to avoid taking the spinlock. If we land within the current
	 * EOF, then we do not need to do an extending update at all, and we
	 * don't need to take the lock to check this. If we race with an
	 * update moving EOF, then we'll either still be beyond EOF and need
	 * to take the lock, or we'll be within EOF and we don't need to take
	 * it at all.
	 */
	if (offset + size <= i_size_read(inode))
		goto out;

	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		spin_unlock(&ip->i_flags_lock);
		error = xfs_setfilesize(ip, offset, size);
	} else {
		spin_unlock(&ip->i_flags_lock);
	}

out:
	memalloc_nofs_restore(nofs_flag);
	return error;
}

static const struct iomap_dio_ops xfs_dio_write_ops = {
	.end_io		= xfs_dio_write_end_io,
};

/*
 * Handle block aligned direct I/O writes
 */
static noinline ssize_t
xfs_file_dio_write_aligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	ssize_t			ret;

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock);
	if (ret)
		goto out_unlock;

	/*
	 * We don't need to hold the IOLOCK exclusively across the IO, so
	 * demote the iolock back to shared if we had to take the exclusive
	 * lock in xfs_file_write_checks() for other reasons.
	 */
	if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}
	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			   &xfs_dio_write_ops, 0, NULL, 0);
out_unlock:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}

/*
 * Handle block unaligned direct I/O writes
 *
 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
 * them to be done in parallel with reads and other direct I/O writes. However,
 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
 * to do sub-block zeroing and that requires serialisation against other direct
 * I/O to the same block. In this case we need to serialise the submission of
 * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
 * In the case where sub-block zeroing is not required, we can do concurrent
 * sub-block dios to the same block successfully.
 *
 * Optimistically submit the I/O using the shared lock first, but use the
 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
 * if block allocation or partial block zeroing would be required.  In that case
 * we try again with the exclusive lock.
 */
static noinline ssize_t
xfs_file_dio_write_unaligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	size_t			isize = i_size_read(VFS_I(ip));
	size_t			count = iov_iter_count(from);
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
	ssize_t			ret;

	/*
	 * Extending writes need exclusivity because of the sub-block zeroing
	 * that the DIO code always does for partial tail blocks beyond EOF, so
	 * don't even bother trying the fast path in this case.
	 */
	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
retry_exclusive:
		iolock = XFS_IOLOCK_EXCL;
		flags = IOMAP_DIO_FORCE_WAIT;
	}

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;

	/*
	 * We can't properly handle unaligned direct I/O to reflink files yet,
	 * as we can't unshare a partial block.
	 */
	if (xfs_is_cow_inode(ip)) {
		trace_xfs_reflink_bounce_dio_write(iocb, from);
		ret = -ENOTBLK;
		goto out_unlock;
	}

	ret = xfs_file_write_checks(iocb, from, &iolock);
	if (ret)
		goto out_unlock;

	/*
	 * If we are doing exclusive unaligned I/O, this must be the only I/O
	 * in-flight.  Otherwise we risk data corruption due to unwritten
	 * extent conversions from the AIO end_io handler.  Wait for all other
	 * I/O to drain first.
	 */
	if (flags & IOMAP_DIO_FORCE_WAIT)
		inode_dio_wait(VFS_I(ip));

	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			   &xfs_dio_write_ops, flags, NULL, 0);

	/*
	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
	 * layer rejected it for mapping or locking reasons. If we are doing
	 * nonblocking user I/O, propagate the error.
	 */
	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
		xfs_iunlock(ip, iolock);
		goto retry_exclusive;
	}

out_unlock:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}

static ssize_t
xfs_file_dio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
	size_t			count = iov_iter_count(from);

	/* direct I/O must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
		return -EINVAL;
	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
		return xfs_file_dio_write_unaligned(ip, iocb, from);
	return xfs_file_dio_write_aligned(ip, iocb, from);
}
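
/*
 * DAX writes go straight to the backing store, so they are always performed
 * under the exclusive iolock, and an extending write updates the file size
 * inline here rather than from an I/O completion handler.
 */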
static noinline ssize_t
xfs_file_dax_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	ssize_t			ret, error = 0;
	loff_t			pos;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	pos = iocb->ki_pos;

	trace_xfs_file_dax_write(iocb, from);
	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
	}
out:
	if (iolock)
		xfs_iunlock(ip, iolock);
	if (error)
		return error;

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_buffered_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	bool			cleared_space = false;
	unsigned int		iolock;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops);

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error. In the case of ENOSPC, first try
	 * to write back all dirty inodes to free up some of the excess
	 * reserved metadata space. This reduces the chances that the
	 * eofblocks scan waits on dirty mappings. Since xfs_flush_inodes()
	 * is serialized, this also behaves as a filter to prevent too many
	 * eofblocks scans from running at the same time.  Use a synchronous
	 * scan to increase the effectiveness of the scan.
	 */
	if (ret == -EDQUOT && !cleared_space) {
		xfs_iunlock(ip, iolock);
		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
		cleared_space = true;
		goto write_retry;
	} else if (ret == -ENOSPC && !cleared_space) {
		struct xfs_icwalk	icw = {0};

		cleared_space = true;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);
		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
		xfs_blockgc_free_space(ip->i_mount, &icw);
		goto write_retry;
	}

out:
	if (iolock)
		xfs_iunlock(ip, iolock);

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (IS_DAX(inode))
		return xfs_file_dax_write(iocb, from);

	if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW.  In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_write(iocb, from);
		if (ret != -ENOTBLK)
			return ret;
	}

	return xfs_file_buffered_write(iocb, from);
}

static void
xfs_wait_dax_page(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);

	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
	schedule();
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}

int
xfs_break_dax_layouts(
	struct inode		*inode,
	bool			*retry)
{
	struct page		*page;

	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));

	page = dax_layout_busy_page(inode->i_mapping);
	if (!page)
		return 0;

	*retry = true;
	return ___wait_var_event(&page->_refcount,
			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
			0, 0, xfs_wait_dax_page(inode));
}

int
xfs_break_layouts(
	struct inode		*inode,
	uint			*iolock,
	enum layout_break_reason reason)
{
	bool			retry;
	int			error;

	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));

	do {
		retry = false;
		switch (reason) {
		case BREAK_UNMAP:
			error = xfs_break_dax_layouts(inode, &retry);
			if (error || retry)
				break;
			fallthrough;
		case BREAK_WRITE:
			error = xfs_break_leased_layouts(inode, iolock, &retry);
			break;
		default:
			WARN_ON_ONCE(1);
			error = -EINVAL;
		}
	} while (error == 0 && retry);

	return error;
}

/* Does this file, inode, or mount want synchronous writes? */
static inline bool xfs_file_sync_writes(struct file *filp)
{
	struct xfs_inode	*ip = XFS_I(file_inode(filp));

	if (xfs_has_wsync(ip->i_mount))
		return true;
	if (filp->f_flags & (__O_SYNC | O_DSYNC))
		return true;
	if (IS_SYNC(file_inode(filp)))
		return true;

	return false;
}

#define	XFS_FALLOC_FL_SUPPORTED						\
		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)

STATIC long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	loff_t			new_size = 0;
	bool			do_file_insert = false;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
	if (error)
		goto out_unlock;

	/*
	 * Must wait for all AIO to complete before we continue as AIO can
	 * change the file size on completion without holding any locks we
	 * currently hold. We must do this first because AIO can update both
	 * the on disk and in memory inode sizes, and the operations that
	 * follow require the in-memory size to be fully up-to-date.
	 */
	inode_dio_wait(inode);

	/*
	 * Now AIO and DIO have drained we flush and (if necessary) invalidate
	 * the cached range over the first operation we are about to run.
	 *
	 * We care about zero and collapse here because they both run a hole
	 * punch over the range first. Because that can zero data, and the
	 * range of invalidation for the shift operations is much larger, we
	 * still do the required flush for collapse in xfs_prepare_shift().
	 *
	 * Insert has the same range requirements as collapse, and we extend
	 * the file first which can zero data. Hence insert has the same
	 * flush/invalidate requirements as collapse and so they are both
	 * handled at the right time by xfs_prepare_shift().
	 */
	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
		    FALLOC_FL_COLLAPSE_RANGE)) {
		error = xfs_flush_unmap_range(ip, offset, len);
		if (error)
			goto out_unlock;
	}

	error = file_modified(file);
	if (error)
		goto out_unlock;

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		error = xfs_free_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
		if (!xfs_is_falloc_aligned(ip, offset, len)) {
			error = -EINVAL;
			goto out_unlock;
		}

		/*
		 * There is no need to support collapsing a range that
		 * overlaps EOF, as that is effectively a truncate operation.
		 */
		if (offset + len >= i_size_read(inode)) {
			error = -EINVAL;
			goto out_unlock;
		}

		new_size = i_size_read(inode) - len;

		error = xfs_collapse_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_INSERT_RANGE) {
		loff_t		isize = i_size_read(inode);

		if (!xfs_is_falloc_aligned(ip, offset, len)) {
			error = -EINVAL;
			goto out_unlock;
		}

		/*
		 * New inode size must not exceed ->s_maxbytes, accounting for
		 * possible signed overflow.
		 */
		if (inode->i_sb->s_maxbytes - isize < len) {
			error = -EFBIG;
			goto out_unlock;
		}
		new_size = isize + len;

		/* Offset should be less than i_size */
		if (offset >= isize) {
			error = -EINVAL;
			goto out_unlock;
		}
		do_file_insert = true;
	} else {
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    offset + len > i_size_read(inode)) {
			new_size = offset + len;
			error = inode_newsize_ok(inode, new_size);
			if (error)
				goto out_unlock;
		}

		if (mode & FALLOC_FL_ZERO_RANGE) {
			/*
			 * Punch a hole and prealloc the range.  We use a hole
			 * punch rather than unwritten extent conversion for
			 * two reasons:
			 *
			 *   1.) Hole punch handles partial block zeroing for
			 *       us.
			 *   2.) If prealloc returns ENOSPC, the file range is
			 *       still zero-valued by virtue of the hole punch.
			 */
			unsigned int blksize = i_blocksize(inode);

			trace_xfs_zero_file_space(ip);

			error = xfs_free_file_space(ip, offset, len);
			if (error)
				goto out_unlock;

			len = round_up(offset + len, blksize) -
			      round_down(offset, blksize);
			offset = round_down(offset, blksize);
		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
			error = xfs_reflink_unshare(ip, offset, len);
			if (error)
				goto out_unlock;
		} else {
			/*
			 * If always_cow mode we can't use preallocations and
			 * thus should not create them.
			 */
			if (xfs_is_always_cow_inode(ip)) {
				error = -EOPNOTSUPP;
				goto out_unlock;
			}
		}

		if (!xfs_is_always_cow_inode(ip)) {
			error = xfs_alloc_file_space(ip, offset, len);
			if (error)
				goto out_unlock;
		}
	}

	/* Change file size if needed */
	if (new_size) {
		struct iattr iattr;

		iattr.ia_valid = ATTR_SIZE;
		iattr.ia_size = new_size;
		error = xfs_vn_setattr_size(file_mnt_idmap(file),
					    file_dentry(file), &iattr);
		if (error)
			goto out_unlock;
	}

	/*
	 * Perform hole insertion now that the file size has been
	 * updated so that if we crash during the operation we don't
	 * leave shifted extents past EOF and hence lose access to
	 * the data that is contained within them.
	 */
	if (do_file_insert) {
		error = xfs_insert_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	}

	if (xfs_file_sync_writes(file))
		error = xfs_log_force_inode(ip);

out_unlock:
	xfs_iunlock(ip, iolock);
	return error;
}

STATIC int
xfs_file_fadvise(
	struct file	*file,
	loff_t		start,
	loff_t		end,
	int		advice)
{
	struct xfs_inode *ip = XFS_I(file_inode(file));
	int ret;
	int lockflags = 0;

	/*
	 * Operations creating pages in page cache need protection from hole
	 * punching and similar ops
	 */
	if (advice == POSIX_FADV_WILLNEED) {
		lockflags = XFS_IOLOCK_SHARED;
		xfs_ilock(ip, lockflags);
	}
	ret = generic_fadvise(file, start, end, advice);
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	return ret;
}

STATIC loff_t
xfs_file_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	loff_t			remapped = 0;
	xfs_extlen_t		cowextsize;
	int			ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (!xfs_has_reflink(mp))
		return -EOPNOTSUPP;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* Prepare and then clone file data. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, remap_flags);
	if (ret || len == 0)
		return ret;

	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
			&remapped);
	if (ret)
		goto out_unlock;

	/*
	 * Carry the cowextsize hint from src to dest if we're sharing the
	 * entire source file to the entire destination file, the source file
	 * has a cowextsize hint, and the destination file does not.
	 */
	cowextsize = 0;
	if (pos_in == 0 && len == i_size_read(inode_in) &&
	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
	    pos_out == 0 && len >= i_size_read(inode_out) &&
	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
		cowextsize = src->i_cowextsize;

	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
			remap_flags);
	if (ret)
		goto out_unlock;

	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
		xfs_log_force_inode(dest);
out_unlock:
	xfs_iunlock2_remapping(src, dest);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	return remapped > 0 ? remapped : ret;
}
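
/*
 * Opening a file advertises support for non-blocking (IOCB_NOWAIT) I/O,
 * async buffered I/O and concurrent direct I/O writes; the read and write
 * paths above honour IOCB_NOWAIT via xfs_ilock_iocb().
 */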
STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
		return -EIO;
	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
			FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
	return generic_file_open(inode, file);
}

STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	unsigned int	mode;
	int		error;

	error = xfs_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_df.if_nextents > 0)
		error = xfs_dir3_data_readahead(ip, 0, 0);
	xfs_iunlock(ip, mode);
	return error;
}

STATIC int
xfs_file_release(
	struct inode	*inode,
	struct file	*filp)
{
	return xfs_release(XFS_I(inode));
}

STATIC int
xfs_file_readdir(
	struct file	*file,
	struct dir_context *ctx)
{
	struct inode	*inode = file_inode(file);
	xfs_inode_t	*ip = XFS_I(inode);
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass the total size of the buffer we read
	 * into down to the filesystem.  With the filldir concept it's not
	 * needed for correct information, but the XFS dir2 leaf code wants
	 * an estimate of the buffer size to calculate its readahead window
	 * and size the buffers used for mapping to physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size.  For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);

	return xfs_readdir(NULL, ip, ctx, bufsize);
}

STATIC loff_t
xfs_file_llseek(
	struct file	*file,
	loff_t		offset,
	int		whence)
{
	struct inode		*inode = file->f_mapping->host;

	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
		return -EIO;

	switch (whence) {
	default:
		return generic_file_llseek(file, offset, whence);
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
		break;
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

#ifdef CONFIG_FS_DAX
static inline vm_fault_t
xfs_dax_fault(
	struct vm_fault		*vmf,
	unsigned int		order,
	bool			write_fault,
	pfn_t			*pfn)
{
	return dax_iomap_fault(vmf, order, pfn, NULL,
			(write_fault && !vmf->cow_page) ?
				&xfs_dax_write_iomap_ops :
				&xfs_read_iomap_ops);
}
#else
static inline vm_fault_t
xfs_dax_fault(
	struct vm_fault		*vmf,
	unsigned int		order,
	bool			write_fault,
	pfn_t			*pfn)
{
	ASSERT(0);
	return VM_FAULT_SIGBUS;
}
#endif

/*
 * Locking for serialisation of IO during page faults. This results in a lock
 * ordering of:
 *
 * mmap_lock (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
static vm_fault_t
__xfs_filemap_fault(
	struct vm_fault		*vmf,
	unsigned int		order,
	bool			write_fault)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	vm_fault_t		ret;
	unsigned int		lock_mode = 0;

	trace_xfs_filemap_fault(ip, order, write_fault);

	if (write_fault) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
	}

	if (IS_DAX(inode) || write_fault)
		lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));

	if (IS_DAX(inode)) {
		pfn_t pfn;

		ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
		if (ret & VM_FAULT_NEEDDSYNC)
			ret = dax_finish_sync_fault(vmf, order, pfn);
	} else if (write_fault) {
		ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
	} else {
		ret = filemap_fault(vmf);
	}

	if (lock_mode)
		xfs_iunlock(XFS_I(inode), lock_mode);

	if (write_fault)
		sb_end_pagefault(inode->i_sb);
	return ret;
}
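
/*
 * Only shared, writable mappings count as write faults here; write faults on
 * private (MAP_PRIVATE) mappings are satisfied by copying the page and never
 * write back to the file.
 */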
static inline bool
xfs_is_write_fault(
	struct vm_fault		*vmf)
{
	return (vmf->flags & FAULT_FLAG_WRITE) &&
	       (vmf->vma->vm_flags & VM_SHARED);
}

static vm_fault_t
xfs_filemap_fault(
	struct vm_fault		*vmf)
{
	/* DAX can shortcut the normal fault path on write faults! */
	return __xfs_filemap_fault(vmf, 0,
			IS_DAX(file_inode(vmf->vma->vm_file)) &&
			xfs_is_write_fault(vmf));
}

static vm_fault_t
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
		return VM_FAULT_FALLBACK;

	/* DAX can shortcut the normal fault path on write faults! */
	return __xfs_filemap_fault(vmf, order,
			xfs_is_write_fault(vmf));
}

static vm_fault_t
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return __xfs_filemap_fault(vmf, 0, true);
}

/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults. In reality, it needs to serialise against truncate and
 * prepare memory for writing, so handle it as a standard write fault.
 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{
	return __xfs_filemap_fault(vmf, 0, true);
}

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.huge_fault	= xfs_filemap_huge_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};

STATIC int
xfs_file_mmap(
	struct file	*file,
	struct vm_area_struct *vma)
{
	struct inode		*inode = file_inode(file);
	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));

	/*
	 * We don't support synchronous mappings for non-DAX files, nor for
	 * DAX files if the underlying dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(vma, target->bt_daxdev))
		return -EOPNOTSUPP;

	file_accessed(file);
	vma->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(inode))
		vm_flags_set(vma, VM_HUGEPAGE);
	return 0;
}

const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= xfs_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap		= xfs_file_mmap,
	.mmap_supported_flags = MAP_SYNC,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.fadvise	= xfs_file_fadvise,
	.remap_file_range = xfs_file_remap_range,
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
};