// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"

#include <linux/dax.h>
#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>
#include <linux/mount.h>

static const struct vm_operations_struct xfs_file_vm_ops;

/*
 * Decide if the given file range is aligned to the size of the fundamental
 * allocation unit for the file.
 */
bool
xfs_is_falloc_aligned(
	struct xfs_inode	*ip,
	loff_t			pos,
	long long int		len)
{
	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);

	if (!is_power_of_2(alloc_unit))
		return isaligned_64(pos, alloc_unit) &&
		       isaligned_64(len, alloc_unit);

	return !((pos | len) & (alloc_unit - 1));
}

/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush, and thus also no need for explicit
 * cache flush operations, and there are no non-transaction metadata updates
 * on directories either.
 */
STATIC int
xfs_dir_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);

	trace_xfs_dir_fsync(ip);
	return xfs_log_force_inode(ip);
}

static xfs_csn_t
xfs_fsync_seq(
	struct xfs_inode	*ip,
	bool			datasync)
{
	if (!xfs_ipincount(ip))
		return 0;
	if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
		return 0;
	return ip->i_itemp->ili_commit_seq;
}

/*
 * All metadata updates are logged, which means that we just have to flush the
 * log up to the latest LSN that touched the inode.
 *
 * If we have concurrent fsync/fdatasync() calls, we need them to all block on
 * the log force before we clear the ili_fsync_fields field. This ensures that
 * we don't get a racing sync operation that does not wait for the metadata to
 * hit the journal before returning. If we race with clearing ili_fsync_fields,
 * then all that will happen is the log force will do nothing as the lsn will
 * already be on disk. We can't race with setting ili_fsync_fields because that
 * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
 * shared until after the ili_fsync_fields is cleared.
 */
static int
xfs_fsync_flush_log(
	struct xfs_inode	*ip,
	bool			datasync,
	int			*log_flushed)
{
	int			error = 0;
	xfs_csn_t		seq;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	seq = xfs_fsync_seq(ip, datasync);
	if (seq) {
		error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
					  log_flushed);

		spin_lock(&ip->i_itemp->ili_lock);
		ip->i_itemp->ili_fsync_fields = 0;
		spin_unlock(&ip->i_itemp->ili_lock);
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return error;
}

STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	int			error, err2;
	int			log_flushed = 0;

	trace_xfs_file_fsync(ip);

	error = file_write_and_wait_range(file, start, end);
	if (error)
		return error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache of the device used for file data first. This is to
	 * ensure newly written file data makes it to disk before logging the
	 * new inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip))
		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);

	/*
	 * Any inode that has dirty modifications in the log is pinned. The
	 * racy check here for a pinned inode will not catch modifications
	 * that happen concurrently to the fsync call, but fsync semantics
	 * only require to sync previously completed I/O.
	 */
	if (xfs_ipincount(ip)) {
		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
		if (err2 && !error)
			error = err2;
	}

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp) {
		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
		if (err2 && !error)
			error = err2;
	}

	return error;
}

static int
xfs_ilock_iocb(
	struct kiocb		*iocb,
	unsigned int		lock_mode)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, lock_mode))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, lock_mode);
	}

	return 0;
}

static int
xfs_ilock_iocb_for_write(
	struct kiocb		*iocb,
	unsigned int		*lock_mode)
{
	ssize_t			ret;
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	ret = xfs_ilock_iocb(iocb, *lock_mode);
	if (ret)
		return ret;

	/*
	 * If a reflink remap is in progress we always need to take the iolock
	 * exclusively to wait for it to finish.
	 */
	if (*lock_mode == XFS_IOLOCK_SHARED &&
	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, *lock_mode);
		*lock_mode = XFS_IOLOCK_EXCL;
		return xfs_ilock_iocb(iocb, *lock_mode);
	}

	return 0;
}

STATIC ssize_t
xfs_file_dio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_direct_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	file_accessed(iocb->ki_filp);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

static noinline ssize_t
xfs_file_dax_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	ssize_t			ret = 0;

	trace_xfs_file_dax_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	file_accessed(iocb->ki_filp);
	return ret;
}

STATIC ssize_t
xfs_file_buffered_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_buffered_read(iocb, to);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = generic_file_read_iter(iocb, to);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_read(iocb, to);
	else if (iocb->ki_flags & IOCB_DIRECT)
		ret = xfs_file_dio_read(iocb, to);
	else
		ret = xfs_file_buffered_read(iocb, to);

	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

STATIC ssize_t
xfs_file_splice_read(
	struct file		*in,
	loff_t			*ppos,
	struct pipe_inode_info	*pipe,
	size_t			len,
	unsigned int		flags)
{
	struct inode		*inode = file_inode(in);
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	trace_xfs_file_splice_read(ip, *ppos, len);

	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	ret = filemap_splice_read(in, ppos, pipe, len, flags);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held. Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			error = 0;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;
	loff_t			isize;

restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		error = break_layout(inode, false);
		if (error == -EWOULDBLOCK)
			error = -EAGAIN;
	} else {
		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
	}

	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(ip, *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		error = xfs_ilock_iocb(iocb, *iolock);
		if (error) {
			*iolock = 0;
			return error;
		}
		goto restart;
	}

	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write. If zeroing is needed and we are currently holding the iolock
	 * shared, we need to update it to exclusive which implies having to
	 * redo all checks before.
	 *
	 * We need to serialise against EOF updates that occur in IO completions
	 * here. We want to make sure that nobody is changing the size while we
	 * do this check until we have placed an IO barrier (i.e. hold the
	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
	 * spinlock effectively forms a memory barrier once we have the
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
	 * hence be able to correctly determine if we need to run zeroing.
	 *
	 * We can do an unlocked check here safely as IO completion can only
	 * extend EOF. Truncate is locked out at this point, so the EOF can
	 * not move backwards, only forwards. Hence we only need to take the
	 * slow path and spin locks when we are at or beyond the current EOF.
	 */
	if (iocb->ki_pos <= i_size_read(inode))
		goto out;

	spin_lock(&ip->i_flags_lock);
	isize = i_size_read(inode);
	if (iocb->ki_pos > isize) {
		spin_unlock(&ip->i_flags_lock);

		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;

		if (!drained_dio) {
			if (*iolock == XFS_IOLOCK_SHARED) {
				xfs_iunlock(ip, *iolock);
				*iolock = XFS_IOLOCK_EXCL;
				xfs_ilock(ip, *iolock);
				iov_iter_reexpand(from, count);
			}
			/*
			 * We now have an IO submission barrier in place, but
			 * AIO can do EOF updates during IO completion and hence
			 * we now need to wait for all of them to drain. Non-AIO
			 * DIO will have drained before we are given the
			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
			 * no-op.
			 */
			inode_dio_wait(inode);
			drained_dio = true;
			goto restart;
		}

		trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
		error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
		if (error)
			return error;
	} else
		spin_unlock(&ip->i_flags_lock);

out:
	return kiocb_modified(iocb);
}

static int
xfs_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,
	int			error,
	unsigned		flags)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			offset = iocb->ki_pos;
	unsigned int		nofs_flag;

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (error)
		return error;
	if (!size)
		return 0;

	/*
	 * Capture amount written on completion as we can't reliably account
	 * for it on submission.
	 */
	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim. To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	if (flags & IOMAP_DIO_COW) {
		error = xfs_reflink_end_cow(ip, offset, size);
		if (error)
			goto out;
	}

	/*
	 * Unwritten conversion updates the in-core isize after extent
	 * conversion but before updating the on-disk size. Updating isize any
	 * earlier allows a racing dio read to find unwritten extents before
	 * they are converted.
	 */
	if (flags & IOMAP_DIO_UNWRITTEN) {
		error = xfs_iomap_write_unwritten(ip, offset, size, true);
		goto out;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size. We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 *
	 * As IO completion only ever extends EOF, we can do an unlocked check
	 * here to avoid taking the spinlock. If we land within the current EOF,
	 * then we do not need to do an extending update at all, and we don't
	 * need to take the lock to check this. If we race with an update moving
	 * EOF, then we'll either still be beyond EOF and need to take the lock,
	 * or we'll be within EOF and we don't need to take it at all.
	 */
	if (offset + size <= i_size_read(inode))
		goto out;

	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		spin_unlock(&ip->i_flags_lock);
		error = xfs_setfilesize(ip, offset, size);
	} else {
		spin_unlock(&ip->i_flags_lock);
	}

out:
	memalloc_nofs_restore(nofs_flag);
	return error;
}

static const struct iomap_dio_ops xfs_dio_write_ops = {
	.end_io		= xfs_dio_write_end_io,
};

/*
 * Handle block aligned direct I/O writes
 */
static noinline ssize_t
xfs_file_dio_write_aligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	ssize_t			ret;

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock);
	if (ret)
		goto out_unlock;

	/*
	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
	 * the iolock back to shared if we had to take the exclusive lock in
	 * xfs_file_write_checks() for other reasons.
	 */
	if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}
	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			   &xfs_dio_write_ops, 0, NULL, 0);
out_unlock:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}

/*
 * Handle block unaligned direct I/O writes
 *
 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
 * them to be done in parallel with reads and other direct I/O writes. However,
 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
 * to do sub-block zeroing and that requires serialisation against other direct
 * I/O to the same block. In this case we need to serialise the submission of
 * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
 * In the case where sub-block zeroing is not required, we can do concurrent
 * sub-block dios to the same block successfully.
 *
 * Optimistically submit the I/O using the shared lock first, but use the
 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
 * if block allocation or partial block zeroing would be required. In that case
 * we try again with the exclusive lock.
 */
static noinline ssize_t
xfs_file_dio_write_unaligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	size_t			isize = i_size_read(VFS_I(ip));
	size_t			count = iov_iter_count(from);
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
	ssize_t			ret;

	/*
	 * Extending writes need exclusivity because of the sub-block zeroing
	 * that the DIO code always does for partial tail blocks beyond EOF, so
	 * don't even bother trying the fast path in this case.
	 */
	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
retry_exclusive:
		iolock = XFS_IOLOCK_EXCL;
		flags = IOMAP_DIO_FORCE_WAIT;
	}

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;

	/*
	 * We can't properly handle unaligned direct I/O to reflink files yet,
	 * as we can't unshare a partial block.
	 */
	if (xfs_is_cow_inode(ip)) {
		trace_xfs_reflink_bounce_dio_write(iocb, from);
		ret = -ENOTBLK;
		goto out_unlock;
	}

	ret = xfs_file_write_checks(iocb, from, &iolock);
	if (ret)
		goto out_unlock;

	/*
	 * If we are doing exclusive unaligned I/O, this must be the only I/O
	 * in-flight. Otherwise we risk data corruption due to unwritten extent
	 * conversions from the AIO end_io handler. Wait for all other I/O to
	 * drain first.
	 */
	if (flags & IOMAP_DIO_FORCE_WAIT)
		inode_dio_wait(VFS_I(ip));

	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			   &xfs_dio_write_ops, flags, NULL, 0);

	/*
	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
	 * layer rejected it for mapping or locking reasons. If we are doing
	 * nonblocking user I/O, propagate the error.
	 */
	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
		xfs_iunlock(ip, iolock);
		goto retry_exclusive;
	}

out_unlock:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}

static ssize_t
xfs_file_dio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
	size_t			count = iov_iter_count(from);

	/* direct I/O must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
		return -EINVAL;
	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
		return xfs_file_dio_write_unaligned(ip, iocb, from);
	return xfs_file_dio_write_aligned(ip, iocb, from);
}

static noinline ssize_t
xfs_file_dax_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	ssize_t			ret, error = 0;
	loff_t			pos;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	pos = iocb->ki_pos;

	trace_xfs_file_dax_write(iocb, from);
	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
	}
out:
	if (iolock)
		xfs_iunlock(ip, iolock);
	if (error)
		return error;

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_buffered_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	bool			cleared_space = false;
	unsigned int		iolock;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops);

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error.
	 * In the case of ENOSPC, first try to write back all dirty inodes to
	 * free up some of the excess reserved metadata space. This reduces the
	 * chances that the eofblocks scan waits on dirty mappings. Since
	 * xfs_flush_inodes() is serialized, this also behaves as a filter to
	 * prevent too many eofblocks scans from running at the same time. Use
	 * a synchronous scan to increase the effectiveness of the scan.
	 */
	if (ret == -EDQUOT && !cleared_space) {
		xfs_iunlock(ip, iolock);
		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
		cleared_space = true;
		goto write_retry;
	} else if (ret == -ENOSPC && !cleared_space) {
		struct xfs_icwalk	icw = {0};

		cleared_space = true;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);
		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
		xfs_blockgc_free_space(ip->i_mount, &icw);
		goto write_retry;
	}

out:
	if (iolock)
		xfs_iunlock(ip, iolock);

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (IS_DAX(inode))
		return xfs_file_dax_write(iocb, from);

	if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW. In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_write(iocb, from);
		if (ret != -ENOTBLK)
			return ret;
	}

	return xfs_file_buffered_write(iocb, from);
}

/* Does this file, inode, or mount want synchronous writes? */
static inline bool xfs_file_sync_writes(struct file *filp)
{
	struct xfs_inode	*ip = XFS_I(file_inode(filp));

	if (xfs_has_wsync(ip->i_mount))
		return true;
	if (filp->f_flags & (__O_SYNC | O_DSYNC))
		return true;
	if (IS_SYNC(file_inode(filp)))
		return true;

	return false;
}

#define	XFS_FALLOC_FL_SUPPORTED						\
		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)

STATIC long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	loff_t			new_size = 0;
	bool			do_file_insert = false;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
	if (error)
		goto out_unlock;

	/*
	 * Must wait for all AIO to complete before we continue as AIO can
	 * change the file size on completion without holding any locks we
	 * currently hold.
	 * We must do this first because AIO can update both the on-disk and
	 * in-memory inode sizes, and the operations that follow require the
	 * in-memory size to be fully up-to-date.
	 */
	inode_dio_wait(inode);

	/*
	 * Now AIO and DIO have drained we flush and (if necessary) invalidate
	 * the cached range over the first operation we are about to run.
	 *
	 * We care about zero and collapse here because they both run a hole
	 * punch over the range first. Because that can zero data, and the range
	 * of invalidation for the shift operations is much larger, we still do
	 * the required flush for collapse in xfs_prepare_shift().
	 *
	 * Insert has the same range requirements as collapse, and we extend the
	 * file first which can zero data. Hence insert has the same
	 * flush/invalidate requirements as collapse and so they are both
	 * handled at the right time by xfs_prepare_shift().
	 */
	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
		    FALLOC_FL_COLLAPSE_RANGE)) {
		error = xfs_flush_unmap_range(ip, offset, len);
		if (error)
			goto out_unlock;
	}

	error = file_modified(file);
	if (error)
		goto out_unlock;

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		error = xfs_free_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
		if (!xfs_is_falloc_aligned(ip, offset, len)) {
			error = -EINVAL;
			goto out_unlock;
		}

		/*
		 * There is no need for a collapse range to overlap EOF; such
		 * a request is effectively a truncate operation.
		 */
		if (offset + len >= i_size_read(inode)) {
			error = -EINVAL;
			goto out_unlock;
		}

		new_size = i_size_read(inode) - len;

		error = xfs_collapse_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_INSERT_RANGE) {
		loff_t		isize = i_size_read(inode);

		if (!xfs_is_falloc_aligned(ip, offset, len)) {
			error = -EINVAL;
			goto out_unlock;
		}

		/*
		 * New inode size must not exceed ->s_maxbytes, accounting for
		 * possible signed overflow.
		 */
		if (inode->i_sb->s_maxbytes - isize < len) {
			error = -EFBIG;
			goto out_unlock;
		}
		new_size = isize + len;

		/* Offset should be less than i_size */
		if (offset >= isize) {
			error = -EINVAL;
			goto out_unlock;
		}
		do_file_insert = true;
	} else {
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    offset + len > i_size_read(inode)) {
			new_size = offset + len;
			error = inode_newsize_ok(inode, new_size);
			if (error)
				goto out_unlock;
		}

		if (mode & FALLOC_FL_ZERO_RANGE) {
			/*
			 * Punch a hole and prealloc the range. We use a hole
			 * punch rather than unwritten extent conversion for two
			 * reasons:
			 *
			 * 1.) Hole punch handles partial block zeroing for us.
			 * 2.) If prealloc returns ENOSPC, the file range is
			 * still zero-valued by virtue of the hole punch.
			 */
			unsigned int blksize = i_blocksize(inode);

			trace_xfs_zero_file_space(ip);

			error = xfs_free_file_space(ip, offset, len);
			if (error)
				goto out_unlock;

			len = round_up(offset + len, blksize) -
			      round_down(offset, blksize);
			offset = round_down(offset, blksize);
		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
			error = xfs_reflink_unshare(ip, offset, len);
			if (error)
				goto out_unlock;
		} else {
			/*
			 * If always_cow mode we can't use preallocations and
			 * thus should not create them.
			 */
			if (xfs_is_always_cow_inode(ip)) {
				error = -EOPNOTSUPP;
				goto out_unlock;
			}
		}

		if (!xfs_is_always_cow_inode(ip)) {
			error = xfs_alloc_file_space(ip, offset, len);
			if (error)
				goto out_unlock;
		}
	}

	/* Change file size if needed */
	if (new_size) {
		struct iattr iattr;

		iattr.ia_valid = ATTR_SIZE;
		iattr.ia_size = new_size;
		error = xfs_vn_setattr_size(file_mnt_idmap(file),
					    file_dentry(file), &iattr);
		if (error)
			goto out_unlock;
	}

	/*
	 * Perform hole insertion now that the file size has been updated so
	 * that if we crash during the operation we don't leave shifted extents
	 * past EOF and hence lose access to the data that is contained within
	 * them.
	 */
	if (do_file_insert) {
		error = xfs_insert_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	}

	if (xfs_file_sync_writes(file))
		error = xfs_log_force_inode(ip);

out_unlock:
	xfs_iunlock(ip, iolock);
	return error;
}

STATIC int
xfs_file_fadvise(
	struct file	*file,
	loff_t		start,
	loff_t		end,
	int		advice)
{
	struct xfs_inode *ip = XFS_I(file_inode(file));
	int ret;
	int lockflags = 0;

	/*
	 * Operations creating pages in page cache need protection from hole
	 * punching and similar ops
	 */
	if (advice == POSIX_FADV_WILLNEED) {
		lockflags = XFS_IOLOCK_SHARED;
		xfs_ilock(ip, lockflags);
	}
	ret = generic_fadvise(file, start, end, advice);
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	return ret;
}

STATIC loff_t
xfs_file_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	loff_t			remapped = 0;
	xfs_extlen_t		cowextsize;
	int			ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (!xfs_has_reflink(mp))
		return -EOPNOTSUPP;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* Prepare and then clone file data. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, remap_flags);
	if (ret || len == 0)
		return ret;

	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
			&remapped);
	if (ret)
		goto out_unlock;

	/*
	 * Carry the cowextsize hint from src to dest if we're sharing the
	 * entire source file to the entire destination file, the source file
	 * has a cowextsize hint, and the destination file does not.
	 */
	cowextsize = 0;
	if (pos_in == 0 && len == i_size_read(inode_in) &&
	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
	    pos_out == 0 && len >= i_size_read(inode_out) &&
	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
		cowextsize = src->i_cowextsize;

	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
			remap_flags);
	if (ret)
		goto out_unlock;

	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
		xfs_log_force_inode(dest);
out_unlock:
	xfs_iunlock2_remapping(src, dest);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	return remapped > 0 ? remapped : ret;
}

STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
		return -EIO;
	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
	return generic_file_open(inode, file);
}

STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	unsigned int	mode;
	int		error;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;
	error = generic_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_df.if_nextents > 0)
		error = xfs_dir3_data_readahead(ip, 0, 0);
	xfs_iunlock(ip, mode);
	return error;
}

STATIC int
xfs_file_release(
	struct inode	*inode,
	struct file	*filp)
{
	return xfs_release(XFS_I(inode));
}

STATIC int
xfs_file_readdir(
	struct file	*file,
	struct dir_context *ctx)
{
	struct inode	*inode = file_inode(file);
	xfs_inode_t	*ip = XFS_I(inode);
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass the total size of the buffer we read
	 * into down to the filesystem. With the filldir concept it's not
	 * needed for correct information, but the XFS dir2 leaf code wants an
	 * estimate of the buffer size to calculate its readahead window and
	 * size the buffers used for mapping to physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size. For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);

	return xfs_readdir(NULL, ip, ctx, bufsize);
}

STATIC loff_t
xfs_file_llseek(
	struct file	*file,
	loff_t		offset,
	int		whence)
{
	struct inode		*inode = file->f_mapping->host;

	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
		return -EIO;

	switch (whence) {
	default:
		return generic_file_llseek(file, offset, whence);
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
		break;
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

static inline vm_fault_t
xfs_dax_fault_locked(
	struct vm_fault		*vmf,
	unsigned int		order,
	bool			write_fault)
{
	vm_fault_t		ret;
	pfn_t			pfn;

	if (!IS_ENABLED(CONFIG_FS_DAX)) {
		ASSERT(0);
		return VM_FAULT_SIGBUS;
	}
	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
			(write_fault && !vmf->cow_page) ?
				&xfs_dax_write_iomap_ops :
				&xfs_read_iomap_ops);
	if (ret & VM_FAULT_NEEDDSYNC)
		ret = dax_finish_sync_fault(vmf, order, pfn);
	return ret;
}

static vm_fault_t
xfs_dax_read_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	vm_fault_t		ret;

	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	ret = xfs_dax_fault_locked(vmf, order, false);
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);

	return ret;
}

static vm_fault_t
xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
	vm_fault_t		ret;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/*
	 * Normally we only need the shared mmaplock, but if a reflink remap is
	 * in progress we take the exclusive lock to wait for the remap to
	 * finish before taking a write fault.
	 */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
		lock_mode = XFS_MMAPLOCK_EXCL;
	}

	if (IS_DAX(inode))
		ret = xfs_dax_fault_locked(vmf, order, true);
	else
		ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
	xfs_iunlock(ip, lock_mode);

	sb_end_pagefault(inode->i_sb);
	return ret;
}

/*
 * Locking for serialisation of IO during page faults.
 * This results in a lock ordering of:
 *
 * mmap_lock (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
static vm_fault_t
__xfs_filemap_fault(
	struct vm_fault		*vmf,
	unsigned int		order,
	bool			write_fault)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);

	trace_xfs_filemap_fault(XFS_I(inode), order, write_fault);

	if (write_fault)
		return xfs_write_fault(vmf, order);
	if (IS_DAX(inode))
		return xfs_dax_read_fault(vmf, order);
	return filemap_fault(vmf);
}

static inline bool
xfs_is_write_fault(
	struct vm_fault		*vmf)
{
	return (vmf->flags & FAULT_FLAG_WRITE) &&
	       (vmf->vma->vm_flags & VM_SHARED);
}

static vm_fault_t
xfs_filemap_fault(
	struct vm_fault		*vmf)
{
	/* DAX can shortcut the normal fault path on write faults! */
	return __xfs_filemap_fault(vmf, 0,
			IS_DAX(file_inode(vmf->vma->vm_file)) &&
			xfs_is_write_fault(vmf));
}

static vm_fault_t
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
		return VM_FAULT_FALLBACK;

	/* DAX can shortcut the normal fault path on write faults! */
	return __xfs_filemap_fault(vmf, order,
			xfs_is_write_fault(vmf));
}

static vm_fault_t
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return __xfs_filemap_fault(vmf, 0, true);
}

/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults. In reality, it needs to serialise against truncate and
 * prepare memory for writing so handle it as a standard write fault.
 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{
	return __xfs_filemap_fault(vmf, 0, true);
}

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.huge_fault	= xfs_filemap_huge_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};

STATIC int
xfs_file_mmap(
	struct file	*file,
	struct vm_area_struct *vma)
{
	struct inode		*inode = file_inode(file);
	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));

	/*
	 * We don't support synchronous mappings for non-DAX files and
	 * for DAX files if the underlying dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(vma, target->bt_daxdev))
		return -EOPNOTSUPP;

	file_accessed(file);
	vma->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(inode))
		vm_flags_set(vma, VM_HUGEPAGE);
	return 0;
}

const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= xfs_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap		= xfs_file_mmap,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.fadvise	= xfs_file_fadvise,
	.remap_file_range = xfs_file_remap_range,
	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE,
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
};