1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs_platform.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_inode.h" 14 #include "xfs_trans.h" 15 #include "xfs_inode_item.h" 16 #include "xfs_bmap.h" 17 #include "xfs_bmap_util.h" 18 #include "xfs_dir2.h" 19 #include "xfs_dir2_priv.h" 20 #include "xfs_ioctl.h" 21 #include "xfs_trace.h" 22 #include "xfs_log.h" 23 #include "xfs_icache.h" 24 #include "xfs_pnfs.h" 25 #include "xfs_iomap.h" 26 #include "xfs_reflink.h" 27 #include "xfs_file.h" 28 #include "xfs_aops.h" 29 #include "xfs_zone_alloc.h" 30 #include "xfs_error.h" 31 #include "xfs_errortag.h" 32 33 #include <linux/dax.h> 34 #include <linux/falloc.h> 35 #include <linux/backing-dev.h> 36 #include <linux/mman.h> 37 #include <linux/fadvise.h> 38 #include <linux/mount.h> 39 #include <linux/filelock.h> 40 41 static const struct vm_operations_struct xfs_file_vm_ops; 42 43 /* 44 * Decide if the given file range is aligned to the size of the fundamental 45 * allocation unit for the file. 46 */ 47 bool 48 xfs_is_falloc_aligned( 49 struct xfs_inode *ip, 50 loff_t pos, 51 long long int len) 52 { 53 unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip); 54 55 if (!is_power_of_2(alloc_unit)) 56 return isaligned_64(pos, alloc_unit) && 57 isaligned_64(len, alloc_unit); 58 59 return !((pos | len) & (alloc_unit - 1)); 60 } 61 62 /* 63 * Fsync operations on directories are much simpler than on regular files, 64 * as there is no file data to flush, and thus also no need for explicit 65 * cache flush operations, and there are no non-transaction metadata updates 66 * on directories either. 
67 */ 68 STATIC int 69 xfs_dir_fsync( 70 struct file *file, 71 loff_t start, 72 loff_t end, 73 int datasync) 74 { 75 struct xfs_inode *ip = XFS_I(file->f_mapping->host); 76 77 trace_xfs_dir_fsync(ip); 78 return xfs_log_force_inode(ip); 79 } 80 81 /* 82 * All metadata updates are logged, which means that we just have to push the 83 * journal to the required sequence number than holds the updates. We track 84 * datasync commits separately to full sync commits, and hence only need to 85 * select the correct sequence number for the log force here. 86 * 87 * We don't have to serialise against concurrent modifications, as we do not 88 * have to wait for modifications that have not yet completed. We define a 89 * transaction commit as completing when the commit sequence number is updated, 90 * hence if the sequence number has not updated, the sync operation has been 91 * run before the commit completed and we don't have to wait for it. 92 * 93 * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain 94 * set on the log item until - at least - the journal flush completes. In 95 * reality, they are only cleared when the inode is fully unpinned (i.e. 96 * persistent in the journal and not dirty in the CIL), and so we rely on 97 * xfs_log_force_seq() either skipping sequences that have been persisted or 98 * waiting on sequences that are still in flight to correctly order concurrent 99 * sync operations. 
100 */ 101 static int 102 xfs_fsync_flush_log( 103 struct xfs_inode *ip, 104 bool datasync, 105 int *log_flushed) 106 { 107 struct xfs_inode_log_item *iip = ip->i_itemp; 108 xfs_csn_t seq = 0; 109 110 spin_lock(&iip->ili_lock); 111 if (datasync) 112 seq = iip->ili_datasync_seq; 113 else 114 seq = iip->ili_commit_seq; 115 spin_unlock(&iip->ili_lock); 116 117 if (!seq) 118 return 0; 119 120 return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, 121 log_flushed); 122 } 123 124 STATIC int 125 xfs_file_fsync( 126 struct file *file, 127 loff_t start, 128 loff_t end, 129 int datasync) 130 { 131 struct xfs_inode *ip = XFS_I(file->f_mapping->host); 132 struct xfs_mount *mp = ip->i_mount; 133 int error, err2; 134 int log_flushed = 0; 135 136 trace_xfs_file_fsync(ip); 137 138 error = file_write_and_wait_range(file, start, end); 139 if (error) 140 return error; 141 142 if (xfs_is_shutdown(mp)) 143 return -EIO; 144 145 xfs_iflags_clear(ip, XFS_ITRUNCATED); 146 147 /* 148 * If we have an RT and/or log subvolume we need to make sure to flush 149 * the write cache the device used for file data first. This is to 150 * ensure newly written file data make it to disk before logging the new 151 * inode size in case of an extending write. 152 */ 153 if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp) 154 error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); 155 else if (mp->m_logdev_targp != mp->m_ddev_targp) 156 error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); 157 158 /* 159 * If the inode has a inode log item attached, it may need the journal 160 * flushed to persist any changes the log item might be tracking. 161 */ 162 if (ip->i_itemp) { 163 err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed); 164 if (err2 && !error) 165 error = err2; 166 } 167 168 /* 169 * If we only have a single device, and the log force about was 170 * a no-op we might have to flush the data device cache here. 
171 * This can only happen for fdatasync/O_DSYNC if we were overwriting 172 * an already allocated file and thus do not have any metadata to 173 * commit. 174 */ 175 if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && 176 mp->m_logdev_targp == mp->m_ddev_targp) { 177 err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); 178 if (err2 && !error) 179 error = err2; 180 } 181 182 return error; 183 } 184 185 static int 186 xfs_ilock_iocb( 187 struct kiocb *iocb, 188 unsigned int lock_mode) 189 { 190 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); 191 192 if (iocb->ki_flags & IOCB_NOWAIT) { 193 if (!xfs_ilock_nowait(ip, lock_mode)) 194 return -EAGAIN; 195 } else { 196 xfs_ilock(ip, lock_mode); 197 } 198 199 return 0; 200 } 201 202 static int 203 xfs_ilock_iocb_for_write( 204 struct kiocb *iocb, 205 unsigned int *lock_mode) 206 { 207 ssize_t ret; 208 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); 209 210 ret = xfs_ilock_iocb(iocb, *lock_mode); 211 if (ret) 212 return ret; 213 214 /* 215 * If a reflink remap is in progress we always need to take the iolock 216 * exclusively to wait for it to finish. 217 */ 218 if (*lock_mode == XFS_IOLOCK_SHARED && 219 xfs_iflags_test(ip, XFS_IREMAPPING)) { 220 xfs_iunlock(ip, *lock_mode); 221 *lock_mode = XFS_IOLOCK_EXCL; 222 return xfs_ilock_iocb(iocb, *lock_mode); 223 } 224 225 return 0; 226 } 227 228 /* 229 * Bounce buffering dio reads need a user context to copy back the data. 230 * Use an ioend to provide that. 
231 */ 232 static void 233 xfs_dio_read_bounce_submit_io( 234 const struct iomap_iter *iter, 235 struct bio *bio, 236 loff_t file_offset) 237 { 238 iomap_init_ioend(iter->inode, bio, file_offset, IOMAP_IOEND_DIRECT); 239 bio->bi_end_io = xfs_end_bio; 240 submit_bio(bio); 241 } 242 243 static const struct iomap_dio_ops xfs_dio_read_bounce_ops = { 244 .submit_io = xfs_dio_read_bounce_submit_io, 245 .bio_set = &iomap_ioend_bioset, 246 }; 247 248 STATIC ssize_t 249 xfs_file_dio_read( 250 struct kiocb *iocb, 251 struct iov_iter *to) 252 { 253 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); 254 unsigned int dio_flags = 0; 255 const struct iomap_dio_ops *dio_ops = NULL; 256 ssize_t ret; 257 258 trace_xfs_file_direct_read(iocb, to); 259 260 if (!iov_iter_count(to)) 261 return 0; /* skip atime */ 262 263 file_accessed(iocb->ki_filp); 264 265 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); 266 if (ret) 267 return ret; 268 if (mapping_stable_writes(iocb->ki_filp->f_mapping)) { 269 dio_ops = &xfs_dio_read_bounce_ops; 270 dio_flags |= IOMAP_DIO_BOUNCE; 271 } 272 ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, dio_ops, dio_flags, 273 NULL, 0); 274 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 275 276 return ret; 277 } 278 279 static noinline ssize_t 280 xfs_file_dax_read( 281 struct kiocb *iocb, 282 struct iov_iter *to) 283 { 284 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); 285 ssize_t ret = 0; 286 287 trace_xfs_file_dax_read(iocb, to); 288 289 if (!iov_iter_count(to)) 290 return 0; /* skip atime */ 291 292 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); 293 if (ret) 294 return ret; 295 ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops); 296 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 297 298 file_accessed(iocb->ki_filp); 299 return ret; 300 } 301 302 STATIC ssize_t 303 xfs_file_buffered_read( 304 struct kiocb *iocb, 305 struct iov_iter *to) 306 { 307 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); 308 ssize_t ret; 309 310 trace_xfs_file_buffered_read(iocb, to); 
311 312 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); 313 if (ret) 314 return ret; 315 ret = generic_file_read_iter(iocb, to); 316 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 317 318 return ret; 319 } 320 321 STATIC ssize_t 322 xfs_file_read_iter( 323 struct kiocb *iocb, 324 struct iov_iter *to) 325 { 326 struct inode *inode = file_inode(iocb->ki_filp); 327 struct xfs_mount *mp = XFS_I(inode)->i_mount; 328 ssize_t ret = 0; 329 330 XFS_STATS_INC(mp, xs_read_calls); 331 332 if (xfs_is_shutdown(mp)) 333 return -EIO; 334 335 if (IS_DAX(inode)) 336 ret = xfs_file_dax_read(iocb, to); 337 else if (iocb->ki_flags & IOCB_DIRECT) 338 ret = xfs_file_dio_read(iocb, to); 339 else 340 ret = xfs_file_buffered_read(iocb, to); 341 342 if (ret > 0) 343 XFS_STATS_ADD(mp, xs_read_bytes, ret); 344 return ret; 345 } 346 347 STATIC ssize_t 348 xfs_file_splice_read( 349 struct file *in, 350 loff_t *ppos, 351 struct pipe_inode_info *pipe, 352 size_t len, 353 unsigned int flags) 354 { 355 struct inode *inode = file_inode(in); 356 struct xfs_inode *ip = XFS_I(inode); 357 struct xfs_mount *mp = ip->i_mount; 358 ssize_t ret = 0; 359 360 XFS_STATS_INC(mp, xs_read_calls); 361 362 if (xfs_is_shutdown(mp)) 363 return -EIO; 364 365 trace_xfs_file_splice_read(ip, *ppos, len); 366 367 xfs_ilock(ip, XFS_IOLOCK_SHARED); 368 ret = filemap_splice_read(in, ppos, pipe, len, flags); 369 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 370 if (ret > 0) 371 XFS_STATS_ADD(mp, xs_read_bytes, ret); 372 return ret; 373 } 374 375 /* 376 * Take care of zeroing post-EOF blocks when they might exist. 377 * 378 * Returns 0 if successfully, a negative error for a failure, or 1 if this 379 * function dropped the iolock and reacquired it exclusively and the caller 380 * needs to restart the write sanity checks. 
381 */ 382 static ssize_t 383 xfs_file_write_zero_eof( 384 struct kiocb *iocb, 385 struct iov_iter *from, 386 unsigned int *iolock, 387 size_t count, 388 bool *drained_dio, 389 struct xfs_zone_alloc_ctx *ac) 390 { 391 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); 392 loff_t isize; 393 int error; 394 395 /* 396 * We need to serialise against EOF updates that occur in IO completions 397 * here. We want to make sure that nobody is changing the size while 398 * we do this check until we have placed an IO barrier (i.e. hold 399 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The 400 * spinlock effectively forms a memory barrier once we have 401 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and 402 * hence be able to correctly determine if we need to run zeroing. 403 */ 404 spin_lock(&ip->i_flags_lock); 405 isize = i_size_read(VFS_I(ip)); 406 if (iocb->ki_pos <= isize) { 407 spin_unlock(&ip->i_flags_lock); 408 return 0; 409 } 410 spin_unlock(&ip->i_flags_lock); 411 412 if (iocb->ki_flags & IOCB_NOWAIT) 413 return -EAGAIN; 414 415 if (!*drained_dio) { 416 /* 417 * If zeroing is needed and we are currently holding the iolock 418 * shared, we need to update it to exclusive which implies 419 * having to redo all checks before. 420 */ 421 if (*iolock == XFS_IOLOCK_SHARED) { 422 xfs_iunlock(ip, *iolock); 423 *iolock = XFS_IOLOCK_EXCL; 424 xfs_ilock(ip, *iolock); 425 iov_iter_reexpand(from, count); 426 } 427 428 /* 429 * We now have an IO submission barrier in place, but AIO can do 430 * EOF updates during IO completion and hence we now need to 431 * wait for all of them to drain. Non-AIO DIO will have drained 432 * before we are given the XFS_IOLOCK_EXCL, and so for most 433 * cases this wait is a no-op. 
434 */ 435 inode_dio_wait(VFS_I(ip)); 436 *drained_dio = true; 437 return 1; 438 } 439 440 trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); 441 442 xfs_ilock(ip, XFS_MMAPLOCK_EXCL); 443 error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL); 444 xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); 445 446 return error; 447 } 448 449 /* 450 * Common pre-write limit and setup checks. 451 * 452 * Called with the iolock held either shared and exclusive according to 453 * @iolock, and returns with it held. Might upgrade the iolock to exclusive 454 * if called for a direct write beyond i_size. 455 */ 456 STATIC ssize_t 457 xfs_file_write_checks( 458 struct kiocb *iocb, 459 struct iov_iter *from, 460 unsigned int *iolock, 461 struct xfs_zone_alloc_ctx *ac) 462 { 463 struct inode *inode = iocb->ki_filp->f_mapping->host; 464 size_t count = iov_iter_count(from); 465 bool drained_dio = false; 466 ssize_t error; 467 468 restart: 469 error = generic_write_checks(iocb, from); 470 if (error <= 0) 471 return error; 472 473 if (iocb->ki_flags & IOCB_NOWAIT) { 474 error = break_layout(inode, false); 475 if (error == -EWOULDBLOCK) 476 error = -EAGAIN; 477 } else { 478 error = xfs_break_layouts(inode, iolock, BREAK_WRITE); 479 } 480 481 if (error) 482 return error; 483 484 /* 485 * For changing security info in file_remove_privs() we need i_rwsem 486 * exclusively. 487 */ 488 if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { 489 xfs_iunlock(XFS_I(inode), *iolock); 490 *iolock = XFS_IOLOCK_EXCL; 491 error = xfs_ilock_iocb(iocb, *iolock); 492 if (error) { 493 *iolock = 0; 494 return error; 495 } 496 goto restart; 497 } 498 499 /* 500 * If the offset is beyond the size of the file, we need to zero all 501 * blocks that fall between the existing EOF and the start of this 502 * write. 503 * 504 * We can do an unlocked check for i_size here safely as I/O completion 505 * can only extend EOF. Truncate is locked out at this point, so the 506 * EOF can not move backwards, only forwards. 
Hence we only need to take 507 * the slow path when we are at or beyond the current EOF. 508 */ 509 if (iocb->ki_pos > i_size_read(inode)) { 510 error = xfs_file_write_zero_eof(iocb, from, iolock, count, 511 &drained_dio, ac); 512 if (error == 1) 513 goto restart; 514 if (error) 515 return error; 516 } 517 518 return kiocb_modified(iocb); 519 } 520 521 static ssize_t 522 xfs_zoned_write_space_reserve( 523 struct xfs_mount *mp, 524 struct kiocb *iocb, 525 struct iov_iter *from, 526 unsigned int flags, 527 struct xfs_zone_alloc_ctx *ac) 528 { 529 loff_t count = iov_iter_count(from); 530 int error; 531 532 if (iocb->ki_flags & IOCB_NOWAIT) 533 flags |= XFS_ZR_NOWAIT; 534 535 /* 536 * Check the rlimit and LFS boundary first so that we don't over-reserve 537 * by possibly a lot. 538 * 539 * The generic write path will redo this check later, and it might have 540 * changed by then. If it got expanded we'll stick to our earlier 541 * smaller limit, and if it is decreased the new smaller limit will be 542 * used and our extra space reservation will be returned after finishing 543 * the write. 544 */ 545 error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count); 546 if (error) 547 return error; 548 549 /* 550 * Sloppily round up count to file system blocks. 551 * 552 * This will often reserve an extra block, but that avoids having to look 553 * at the start offset, which isn't stable for O_APPEND until taking the 554 * iolock. Also we need to reserve a block each for zeroing the old 555 * EOF block and the new start block if they are unaligned. 556 * 557 * Any remaining block will be returned after the write. 
558 */ 559 return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2, 560 flags, ac); 561 } 562 563 static int 564 xfs_dio_write_end_io( 565 struct kiocb *iocb, 566 ssize_t size, 567 int error, 568 unsigned flags) 569 { 570 struct inode *inode = file_inode(iocb->ki_filp); 571 struct xfs_inode *ip = XFS_I(inode); 572 loff_t offset = iocb->ki_pos; 573 unsigned int nofs_flag; 574 575 ASSERT(!xfs_is_zoned_inode(ip) || 576 !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); 577 578 trace_xfs_end_io_direct_write(ip, offset, size); 579 580 if (xfs_is_shutdown(ip->i_mount)) 581 return -EIO; 582 583 if (error) 584 return error; 585 if (!size) 586 return 0; 587 588 /* 589 * Capture amount written on completion as we can't reliably account 590 * for it on submission. 591 */ 592 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size); 593 594 /* 595 * We can allocate memory here while doing writeback on behalf of 596 * memory reclaim. To avoid memory allocation deadlocks set the 597 * task-wide nofs context for the following operations. 598 */ 599 nofs_flag = memalloc_nofs_save(); 600 601 if (flags & IOMAP_DIO_COW) { 602 if (iocb->ki_flags & IOCB_ATOMIC) 603 error = xfs_reflink_end_atomic_cow(ip, offset, size); 604 else 605 error = xfs_reflink_end_cow(ip, offset, size); 606 if (error) 607 goto out; 608 } 609 610 /* 611 * Unwritten conversion updates the in-core isize after extent 612 * conversion but before updating the on-disk size. Updating isize any 613 * earlier allows a racing dio read to find unwritten extents before 614 * they are converted. 615 */ 616 if (flags & IOMAP_DIO_UNWRITTEN) { 617 error = xfs_iomap_write_unwritten(ip, offset, size, true); 618 goto out; 619 } 620 621 /* 622 * We need to update the in-core inode size here so that we don't end up 623 * with the on-disk inode size being outside the in-core inode size. We 624 * have no other method of updating EOF for AIO, so always do it here 625 * if necessary. 
626 * 627 * We need to lock the test/set EOF update as we can be racing with 628 * other IO completions here to update the EOF. Failing to serialise 629 * here can result in EOF moving backwards and Bad Things Happen when 630 * that occurs. 631 * 632 * As IO completion only ever extends EOF, we can do an unlocked check 633 * here to avoid taking the spinlock. If we land within the current EOF, 634 * then we do not need to do an extending update at all, and we don't 635 * need to take the lock to check this. If we race with an update moving 636 * EOF, then we'll either still be beyond EOF and need to take the lock, 637 * or we'll be within EOF and we don't need to take it at all. 638 */ 639 if (offset + size <= i_size_read(inode)) 640 goto out; 641 642 spin_lock(&ip->i_flags_lock); 643 if (offset + size > i_size_read(inode)) { 644 i_size_write(inode, offset + size); 645 spin_unlock(&ip->i_flags_lock); 646 error = xfs_setfilesize(ip, offset, size); 647 } else { 648 spin_unlock(&ip->i_flags_lock); 649 } 650 651 out: 652 memalloc_nofs_restore(nofs_flag); 653 return error; 654 } 655 656 static const struct iomap_dio_ops xfs_dio_write_ops = { 657 .end_io = xfs_dio_write_end_io, 658 }; 659 660 static void 661 xfs_dio_zoned_submit_io( 662 const struct iomap_iter *iter, 663 struct bio *bio, 664 loff_t file_offset) 665 { 666 struct xfs_mount *mp = XFS_I(iter->inode)->i_mount; 667 struct xfs_zone_alloc_ctx *ac = iter->private; 668 xfs_filblks_t count_fsb; 669 struct iomap_ioend *ioend; 670 671 count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size); 672 if (count_fsb > ac->reserved_blocks) { 673 xfs_err(mp, 674 "allocation (%lld) larger than reservation (%lld).", 675 count_fsb, ac->reserved_blocks); 676 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 677 bio_io_error(bio); 678 return; 679 } 680 ac->reserved_blocks -= count_fsb; 681 682 bio->bi_end_io = xfs_end_bio; 683 ioend = iomap_init_ioend(iter->inode, bio, file_offset, 684 IOMAP_IOEND_DIRECT); 685 
xfs_zone_alloc_and_submit(ioend, &ac->open_zone); 686 } 687 688 static const struct iomap_dio_ops xfs_dio_zoned_write_ops = { 689 .bio_set = &iomap_ioend_bioset, 690 .submit_io = xfs_dio_zoned_submit_io, 691 .end_io = xfs_dio_write_end_io, 692 }; 693 694 /* 695 * Handle block aligned direct I/O writes. 696 */ 697 static noinline ssize_t 698 xfs_file_dio_write_aligned( 699 struct xfs_inode *ip, 700 struct kiocb *iocb, 701 struct iov_iter *from, 702 const struct iomap_ops *ops, 703 const struct iomap_dio_ops *dops, 704 struct xfs_zone_alloc_ctx *ac) 705 { 706 unsigned int iolock = XFS_IOLOCK_SHARED; 707 unsigned int dio_flags = 0; 708 ssize_t ret; 709 710 /* 711 * For always COW inodes, each bio must be aligned to the file system 712 * block size and not just the device sector size because we need to 713 * allocate a block-aligned amount of space for each write. 714 */ 715 if (xfs_is_always_cow_inode(ip)) 716 dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED; 717 718 ret = xfs_ilock_iocb_for_write(iocb, &iolock); 719 if (ret) 720 return ret; 721 ret = xfs_file_write_checks(iocb, from, &iolock, ac); 722 if (ret) 723 goto out_unlock; 724 725 /* 726 * We don't need to hold the IOLOCK exclusively across the IO, so demote 727 * the iolock back to shared if we had to take the exclusive lock in 728 * xfs_file_write_checks() for other reasons. 729 */ 730 if (iolock == XFS_IOLOCK_EXCL) { 731 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 732 iolock = XFS_IOLOCK_SHARED; 733 } 734 if (mapping_stable_writes(iocb->ki_filp->f_mapping)) 735 dio_flags |= IOMAP_DIO_BOUNCE; 736 trace_xfs_file_direct_write(iocb, from); 737 ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0); 738 out_unlock: 739 xfs_iunlock(ip, iolock); 740 return ret; 741 } 742 743 /* 744 * Handle block aligned direct I/O writes to zoned devices. 
745 */ 746 static noinline ssize_t 747 xfs_file_dio_write_zoned( 748 struct xfs_inode *ip, 749 struct kiocb *iocb, 750 struct iov_iter *from) 751 { 752 struct xfs_zone_alloc_ctx ac = { }; 753 ssize_t ret; 754 755 ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac); 756 if (ret < 0) 757 return ret; 758 ret = xfs_file_dio_write_aligned(ip, iocb, from, 759 &xfs_zoned_direct_write_iomap_ops, 760 &xfs_dio_zoned_write_ops, &ac); 761 xfs_zoned_space_unreserve(ip->i_mount, &ac); 762 return ret; 763 } 764 765 /* 766 * Handle block atomic writes 767 * 768 * Two methods of atomic writes are supported: 769 * - REQ_ATOMIC-based, which would typically use some form of HW offload in the 770 * disk 771 * - COW-based, which uses a COW fork as a staging extent for data updates 772 * before atomically updating extent mappings for the range being written 773 * 774 */ 775 static noinline ssize_t 776 xfs_file_dio_write_atomic( 777 struct xfs_inode *ip, 778 struct kiocb *iocb, 779 struct iov_iter *from) 780 { 781 unsigned int iolock = XFS_IOLOCK_SHARED; 782 ssize_t ret, ocount = iov_iter_count(from); 783 unsigned int dio_flags = 0; 784 const struct iomap_ops *dops; 785 786 /* 787 * HW offload should be faster, so try that first if it is already 788 * known that the write length is not too large. 
789 */ 790 if (ocount > xfs_inode_buftarg(ip)->bt_awu_max) 791 dops = &xfs_atomic_write_cow_iomap_ops; 792 else 793 dops = &xfs_direct_write_iomap_ops; 794 795 retry: 796 ret = xfs_ilock_iocb_for_write(iocb, &iolock); 797 if (ret) 798 return ret; 799 800 ret = xfs_file_write_checks(iocb, from, &iolock, NULL); 801 if (ret) 802 goto out_unlock; 803 804 /* Demote similar to xfs_file_dio_write_aligned() */ 805 if (iolock == XFS_IOLOCK_EXCL) { 806 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 807 iolock = XFS_IOLOCK_SHARED; 808 } 809 810 trace_xfs_file_direct_write(iocb, from); 811 if (mapping_stable_writes(iocb->ki_filp->f_mapping)) 812 dio_flags |= IOMAP_DIO_BOUNCE; 813 ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, dio_flags, 814 NULL, 0); 815 816 /* 817 * The retry mechanism is based on the ->iomap_begin method returning 818 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not 819 * possible. The REQ_ATOMIC-based method typically not be possible if 820 * the write spans multiple extents or the disk blocks are misaligned. 821 */ 822 if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) { 823 xfs_iunlock(ip, iolock); 824 dops = &xfs_atomic_write_cow_iomap_ops; 825 goto retry; 826 } 827 828 out_unlock: 829 if (iolock) 830 xfs_iunlock(ip, iolock); 831 return ret; 832 } 833 834 /* 835 * Handle block unaligned direct I/O writes 836 * 837 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing 838 * them to be done in parallel with reads and other direct I/O writes. However, 839 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need 840 * to do sub-block zeroing and that requires serialisation against other direct 841 * I/O to the same block. In this case we need to serialise the submission of 842 * the unaligned I/O so that we don't get racing block zeroing in the dio layer. 
843 * In the case where sub-block zeroing is not required, we can do concurrent 844 * sub-block dios to the same block successfully. 845 * 846 * Optimistically submit the I/O using the shared lock first, but use the 847 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN 848 * if block allocation or partial block zeroing would be required. In that case 849 * we try again with the exclusive lock. 850 */ 851 static noinline ssize_t 852 xfs_file_dio_write_unaligned( 853 struct xfs_inode *ip, 854 struct kiocb *iocb, 855 struct iov_iter *from) 856 { 857 size_t isize = i_size_read(VFS_I(ip)); 858 size_t count = iov_iter_count(from); 859 unsigned int iolock = XFS_IOLOCK_SHARED; 860 unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY; 861 ssize_t ret; 862 863 /* 864 * Extending writes need exclusivity because of the sub-block zeroing 865 * that the DIO code always does for partial tail blocks beyond EOF, so 866 * don't even bother trying the fast path in this case. 867 */ 868 if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) { 869 if (iocb->ki_flags & IOCB_NOWAIT) 870 return -EAGAIN; 871 retry_exclusive: 872 iolock = XFS_IOLOCK_EXCL; 873 flags = IOMAP_DIO_FORCE_WAIT; 874 } 875 876 ret = xfs_ilock_iocb_for_write(iocb, &iolock); 877 if (ret) 878 return ret; 879 880 /* 881 * We can't properly handle unaligned direct I/O to reflink files yet, 882 * as we can't unshare a partial block. 883 */ 884 if (xfs_is_cow_inode(ip)) { 885 trace_xfs_reflink_bounce_dio_write(iocb, from); 886 ret = -ENOTBLK; 887 goto out_unlock; 888 } 889 890 ret = xfs_file_write_checks(iocb, from, &iolock, NULL); 891 if (ret) 892 goto out_unlock; 893 894 /* 895 * If we are doing exclusive unaligned I/O, this must be the only I/O 896 * in-flight. Otherwise we risk data corruption due to unwritten extent 897 * conversions from the AIO end_io handler. Wait for all other I/O to 898 * drain first. 
899 */ 900 if (flags & IOMAP_DIO_FORCE_WAIT) 901 inode_dio_wait(VFS_I(ip)); 902 903 if (mapping_stable_writes(iocb->ki_filp->f_mapping)) 904 flags |= IOMAP_DIO_BOUNCE; 905 906 trace_xfs_file_direct_write(iocb, from); 907 ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, 908 &xfs_dio_write_ops, flags, NULL, 0); 909 910 /* 911 * Retry unaligned I/O with exclusive blocking semantics if the DIO 912 * layer rejected it for mapping or locking reasons. If we are doing 913 * nonblocking user I/O, propagate the error. 914 */ 915 if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) { 916 ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY); 917 xfs_iunlock(ip, iolock); 918 goto retry_exclusive; 919 } 920 921 out_unlock: 922 if (iolock) 923 xfs_iunlock(ip, iolock); 924 return ret; 925 } 926 927 static ssize_t 928 xfs_file_dio_write( 929 struct kiocb *iocb, 930 struct iov_iter *from) 931 { 932 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); 933 struct xfs_buftarg *target = xfs_inode_buftarg(ip); 934 size_t count = iov_iter_count(from); 935 936 /* direct I/O must be aligned to device logical sector size */ 937 if ((iocb->ki_pos | count) & target->bt_logical_sectormask) 938 return -EINVAL; 939 940 if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) 941 return xfs_file_dio_write_unaligned(ip, iocb, from); 942 if (xfs_is_zoned_inode(ip)) 943 return xfs_file_dio_write_zoned(ip, iocb, from); 944 if (iocb->ki_flags & IOCB_ATOMIC) 945 return xfs_file_dio_write_atomic(ip, iocb, from); 946 return xfs_file_dio_write_aligned(ip, iocb, from, 947 &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL); 948 } 949 950 static noinline ssize_t 951 xfs_file_dax_write( 952 struct kiocb *iocb, 953 struct iov_iter *from) 954 { 955 struct inode *inode = iocb->ki_filp->f_mapping->host; 956 struct xfs_inode *ip = XFS_I(inode); 957 unsigned int iolock = XFS_IOLOCK_EXCL; 958 ssize_t ret, error = 0; 959 loff_t pos; 960 961 ret = xfs_ilock_iocb(iocb, iolock); 962 if (ret) 963 return ret; 964 
	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

	pos = iocb->ki_pos;

	trace_xfs_file_dax_write(iocb, from);
	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		/*
		 * The write extended the file: push the new in-core size and
		 * log the on-disk size update.  NOTE(review): 'error' is
		 * declared before this chunk; presumably initialised to 0 so
		 * the check below only fires on a setfilesize failure.
		 */
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
	}
out:
	if (iolock)
		xfs_iunlock(ip, iolock);
	if (error)
		return error;

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

/*
 * Buffered write path for regular (non-zoned) files.
 *
 * Takes the IOLOCK exclusively for the duration of the copy-in.  On EDQUOT
 * or ENOSPC we drop the lock, try to reclaim lingering preallocated space,
 * and retry the whole write exactly once (tracked by cleared_space).
 *
 * Returns the number of bytes written, or a negative errno.
 */
STATIC ssize_t
xfs_file_buffered_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	bool			cleared_space = false;
	unsigned int		iolock;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
			NULL);

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error. In the case of ENOSPC, first try to
	 * write back all dirty inodes to free up some of the excess reserved
	 * metadata space. This reduces the chances that the eofblocks scan
	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
	 * also behaves as a filter to prevent too many eofblocks scans from
	 * running at the same time. Use a synchronous scan to increase the
	 * effectiveness of the scan.
	 */
	if (ret == -EDQUOT && !cleared_space) {
		xfs_iunlock(ip, iolock);
		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
		cleared_space = true;
		goto write_retry;
	} else if (ret == -ENOSPC && !cleared_space) {
		struct xfs_icwalk	icw = {0};

		cleared_space = true;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);
		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
		xfs_blockgc_free_space(ip->i_mount, &icw);
		goto write_retry;
	}

out:
	if (iolock)
		xfs_iunlock(ip, iolock);

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

/*
 * Buffered write path for zoned RT files.
 *
 * Unlike the regular buffered path, space for the write is reserved up front
 * (greedily - we may get less than asked for), and the iov_iter is truncated
 * to what the reservation actually covers.  On ENOSPC we retry once after
 * kicking writeback to release overly pessimistic indirect reservations.
 */
STATIC ssize_t
xfs_file_buffered_write_zoned(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	bool			cleared_space = false;
	struct xfs_zone_alloc_ctx ac = { };
	ssize_t			ret;

	ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
	if (ret < 0)
		return ret;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		goto out_unreserve;

	ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
	if (ret)
		goto out_unlock;

	/*
	 * Truncate the iter to the length that we were actually able to
	 * allocate blocks for. This needs to happen after
	 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
	 * writes.
	 */
	iov_iter_truncate(from,
			XFS_FSB_TO_B(mp, ac.reserved_blocks) -
			(iocb->ki_pos & mp->m_blockmask));
	if (!iov_iter_count(from))
		goto out_unlock;

retry:
	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
			&ac);
	if (ret == -ENOSPC && !cleared_space) {
		/*
		 * Kick off writeback to convert delalloc space and release the
		 * usually too pessimistic indirect block reservations.
		 */
		xfs_flush_inodes(mp);
		cleared_space = true;
		goto retry;
	}

out_unlock:
	xfs_iunlock(ip, iolock);
out_unreserve:
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	if (ret > 0) {
		XFS_STATS_ADD(mp, xs_write_bytes, ret);
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

/*
 * Top-level ->write_iter entry point: validate atomic-write bounds if
 * requested, then dispatch to the DAX, direct I/O, zoned-buffered or plain
 * buffered write path.
 */
STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (iocb->ki_flags & IOCB_ATOMIC) {
		/* Atomic writes must fit the inode's advertised min/max. */
		if (ocount < xfs_get_atomic_write_min(ip))
			return -EINVAL;

		if (ocount > xfs_get_atomic_write_max(ip))
			return -EINVAL;

		ret = generic_atomic_write_valid(iocb, from);
		if (ret)
			return ret;
	}

	if (IS_DAX(inode))
		return xfs_file_dax_write(iocb, from);

	if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW. In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_write(iocb, from);
		if (ret != -ENOTBLK)
			return ret;
		/* -ENOTBLK: fall through to the buffered path below. */
	}

	if (xfs_is_zoned_inode(ip))
		return xfs_file_buffered_write_zoned(iocb, from);
	return xfs_file_buffered_write(iocb, from);
}

/* Does this file, inode, or mount want synchronous writes? */
static inline bool xfs_file_sync_writes(struct file *filp)
{
	struct xfs_inode	*ip = XFS_I(file_inode(filp));

	if (xfs_has_wsync(ip->i_mount))
		return true;
	if (filp->f_flags & (__O_SYNC | O_DSYNC))
		return true;
	if (IS_SYNC(file_inode(filp)))
		return true;

	return false;
}

/*
 * Compute the new inode size implied by a fallocate request.  *new_size is
 * only set when the operation would extend the file past the current EOF and
 * FALLOC_FL_KEEP_SIZE is not set; otherwise it is left untouched (callers
 * initialise it to 0).  Returns 0 or the inode_newsize_ok() error.
 */
static int
xfs_falloc_newsize(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	loff_t			*new_size)
{
	struct inode		*inode = file_inode(file);

	if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
		return 0;
	*new_size = offset + len;
	return inode_newsize_ok(inode, *new_size);
}

/*
 * Apply the size update computed by xfs_falloc_newsize().  A new_size of 0
 * means "no extension needed" and is a no-op.
 */
static int
xfs_falloc_setsize(
	struct file		*file,
	loff_t			new_size)
{
	struct iattr iattr = {
		.ia_valid	= ATTR_SIZE,
		.ia_size	= new_size,
	};

	if (!new_size)
		return 0;
	return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
			&iattr);
}

/*
 * FALLOC_FL_COLLAPSE_RANGE: remove the byte range and shift everything after
 * it down, then shrink the file size accordingly.  Offset and length must be
 * allocation-unit aligned, and the range must end before EOF.
 */
static int
xfs_falloc_collapse_range(
	struct file		*file,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = i_size_read(inode) - len;
	int			error;

	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
		return -EINVAL;

	/*
	 * There is no need to overlap collapse range with EOF, in which case
	 * it is effectively a truncate operation
	 */
	if (offset + len >= i_size_read(inode))
		return -EINVAL;

	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

/*
 * FALLOC_FL_INSERT_RANGE: grow the file by len and shift the data from
 * offset upwards, opening an aligned hole inside the file.
 */
static int
xfs_falloc_insert_range(
	struct file		*file,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			isize = i_size_read(inode);
	int			error;

	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
		return -EINVAL;

	/*
	 * New inode size must not exceed ->s_maxbytes, accounting for
	 * possible signed overflow.
	 */
	if (inode->i_sb->s_maxbytes - isize < len)
		return -EFBIG;

	/* Offset should be less than i_size */
	if (offset >= isize)
		return -EINVAL;

	error = xfs_falloc_setsize(file, isize + len);
	if (error)
		return error;

	/*
	 * Perform hole insertion now that the file size has been updated so
	 * that if we crash during the operation we don't leave shifted extents
	 * past EOF and hence losing access to the data that is contained
	 * within them.
	 */
	return xfs_insert_file_space(XFS_I(inode), offset, len);
}

/*
 * For various operations we need to zero up to one block at each end of
 * the affected range. For zoned file systems this will require a space
 * allocation, for which we need a reservation ahead of time.
 */
#define XFS_ZONED_ZERO_EDGE_SPACE_RES	2

/*
 * Zero range implements a full zeroing mechanism but is only used in limited
 * situations. It is more efficient to allocate unwritten extents than to
 * perform zeroing here, so use an errortag to randomly force zeroing on DEBUG
 * kernels for added test coverage.
 *
 * On zoned file systems, the error is already injected by
 * xfs_file_zoned_fallocate, which then reserves the additional space needed.
 * We only check for this extra space reservation here.
 */
static inline bool
xfs_falloc_force_zero(
	struct xfs_inode	*ip,
	struct xfs_zone_alloc_ctx *ac)
{
	if (xfs_is_zoned_inode(ip)) {
		/*
		 * A reservation larger than the two edge blocks means the
		 * error injection in xfs_file_zoned_fallocate already decided
		 * to force full zeroing (DEBUG kernels only).
		 */
		if (ac->reserved_blocks > XFS_ZONED_ZERO_EDGE_SPACE_RES) {
			ASSERT(IS_ENABLED(CONFIG_XFS_DEBUG));
			return true;
		}
		return false;
	}
	return XFS_TEST_ERROR(ip->i_mount, XFS_ERRTAG_FORCE_ZERO_RANGE);
}

/*
 * Punch a hole and prealloc the range. We use a hole punch rather than
 * unwritten extent conversion for two reasons:
 *
 * 1.) Hole punch handles partial block zeroing for us.
 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by
 * virtue of the hole punch.
 */
static int
xfs_falloc_zero_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		blksize = i_blocksize(inode);
	loff_t			new_size = 0;
	int			error;

	trace_xfs_zero_file_space(ip);

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	if (xfs_falloc_force_zero(ip, ac)) {
		error = xfs_zero_range(ip, offset, len, ac, NULL);
	} else {
		/* Punch, then preallocate the block-aligned covering range. */
		error = xfs_free_file_space(ip, offset, len, ac);
		if (error)
			return error;

		len = round_up(offset + len, blksize) -
			round_down(offset, blksize);
		offset = round_down(offset, blksize);
		error = xfs_alloc_file_space(ip, offset, len);
	}
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

/*
 * FALLOC_FL_UNSHARE_RANGE: break any shared (reflinked) extents in the range
 * and make sure the range is fully allocated, extending the file if needed.
 */
static int
xfs_falloc_unshare_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = 0;
	int			error;

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
	if (error)
		return error;

	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

/*
 * Plain FALLOC_FL_ALLOCATE_RANGE: preallocate the range, extending the file
 * size when the request goes past EOF and KEEP_SIZE is not set.
 */
static int
xfs_falloc_allocate_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = 0;
	int			error;

	/*
	 * If always_cow mode we can't use preallocations and thus should not
	 * create them.
	 */
	if (xfs_is_always_cow_inode(XFS_I(inode)))
		return -EOPNOTSUPP;

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

#define	XFS_FALLOC_FL_SUPPORTED						\
		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
		 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |	\
		 FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE |	\
		 FALLOC_FL_UNSHARE_RANGE)

/*
 * Common fallocate implementation.  Called with a zone allocation context
 * (ac) on zoned file systems and NULL otherwise.  Takes both the IOLOCK and
 * MMAPLOCK exclusively, breaks layout leases, and dispatches on the
 * FALLOC_FL mode.
 */
STATIC long
__xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
	if (error)
		goto out_unlock;

	/*
	 * Must wait for all AIO to complete before we continue as AIO can
	 * change the file size on completion without holding any locks we
	 * currently hold.
	 * We must do this first because AIO can update both
	 * the on disk and in memory inode sizes, and the operations that
	 * follow require the in-memory size to be fully up-to-date.
	 */
	inode_dio_wait(inode);

	error = file_modified(file);
	if (error)
		goto out_unlock;

	switch (mode & FALLOC_FL_MODE_MASK) {
	case FALLOC_FL_PUNCH_HOLE:
		error = xfs_free_file_space(ip, offset, len, ac);
		break;
	case FALLOC_FL_COLLAPSE_RANGE:
		error = xfs_falloc_collapse_range(file, offset, len, ac);
		break;
	case FALLOC_FL_INSERT_RANGE:
		error = xfs_falloc_insert_range(file, offset, len);
		break;
	case FALLOC_FL_ZERO_RANGE:
		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
		break;
	case FALLOC_FL_UNSHARE_RANGE:
		error = xfs_falloc_unshare_range(file, mode, offset, len);
		break;
	case FALLOC_FL_ALLOCATE_RANGE:
		error = xfs_falloc_allocate_range(file, mode, offset, len);
		break;
	default:
		error = -EOPNOTSUPP;
		break;
	}

	/* Honour O_[D]SYNC / wsync / S_SYNC semantics for fallocate too. */
	if (!error && xfs_file_sync_writes(file))
		error = xfs_log_force_inode(ip);

out_unlock:
	xfs_iunlock(ip, iolock);
	return error;
}

/*
 * Fallocate wrapper for zoned file systems: reserve the zone space the
 * operation needs before taking the iolock, then hand off to the common
 * implementation.
 */
static long
xfs_file_zoned_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct xfs_zone_alloc_ctx ac = { };
	struct xfs_inode	*ip = XFS_I(file_inode(file));
	struct xfs_mount	*mp = ip->i_mount;
	xfs_filblks_t		count_fsb;
	int			error;

	/*
	 * If full zeroing is forced by the error injection knob, we need a
	 * space reservation that covers the entire range. See the comment in
	 * xfs_zoned_write_space_reserve for the rationale for the calculation.
	 * Otherwise just reserve space for the two boundary blocks.
	 */
	count_fsb = XFS_ZONED_ZERO_EDGE_SPACE_RES;
	if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ZERO_RANGE &&
	    XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_ZERO_RANGE))
		count_fsb += XFS_B_TO_FSB(mp, len) + 1;

	error = xfs_zoned_space_reserve(mp, count_fsb, XFS_ZR_RESERVED, &ac);
	if (error)
		return error;
	error = __xfs_file_fallocate(file, mode, offset, len, &ac);
	xfs_zoned_space_unreserve(mp, &ac);
	return error;
}

/*
 * ->fallocate entry point: validate the file type and requested mode, then
 * route zoned hole-punch style operations through the space-reserving
 * wrapper.
 */
static long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	/*
	 * For zoned file systems, zeroing the first and last block of a hole
	 * punch requires allocating a new block to rewrite the remaining data
	 * and new zeroes out of place. Get a reservation for those before
	 * taking the iolock. Dip into the reserved pool because we are
	 * expected to be able to punch a hole even on a completely full
	 * file system.
	 */
	if (xfs_is_zoned_inode(XFS_I(inode)) &&
	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
		     FALLOC_FL_COLLAPSE_RANGE)))
		return xfs_file_zoned_fallocate(file, mode, offset, len);
	return __xfs_file_fallocate(file, mode, offset, len, NULL);
}

/*
 * ->fadvise: pass through to generic_fadvise(), but take the IOLOCK shared
 * around WILLNEED so readahead page-cache instantiation can't race with hole
 * punching and similar operations.
 */
STATIC int
xfs_file_fadvise(
	struct file	*file,
	loff_t		start,
	loff_t		end,
	int		advice)
{
	struct xfs_inode *ip = XFS_I(file_inode(file));
	int ret;
	int lockflags = 0;

	/*
	 * Operations creating pages in page cache need protection from hole
	 * punching and similar ops
	 */
	if (advice == POSIX_FADV_WILLNEED) {
		lockflags = XFS_IOLOCK_SHARED;
		xfs_ilock(ip, lockflags);
	}
	ret = generic_fadvise(file, start, end, advice);
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	return ret;
}

/*
 * ->remap_file_range: reflink/dedupe a byte range from file_in to file_out.
 * Returns the number of bytes remapped, or a negative errno.  Both inodes
 * are locked by xfs_reflink_remap_prep() and unlocked on the way out.
 */
STATIC loff_t
xfs_file_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	loff_t			remapped = 0;
	xfs_extlen_t		cowextsize;
	int			ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (!xfs_has_reflink(mp))
		return -EOPNOTSUPP;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* Prepare and then clone file data. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, remap_flags);
	if (ret || len == 0)
		return ret;

	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
			&remapped);
	if (ret)
		goto out_unlock;

	/*
	 * Carry the cowextsize hint from src to dest if we're sharing the
	 * entire source file to the entire destination file, the source file
	 * has a cowextsize hint, and the destination file does not.
	 */
	cowextsize = 0;
	if (pos_in == 0 && len == i_size_read(inode_in) &&
	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
	    pos_out == 0 && len >= i_size_read(inode_out) &&
	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
		cowextsize = src->i_cowextsize;

	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
			remap_flags);
	if (ret)
		goto out_unlock;

	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
		xfs_log_force_inode(dest);
out_unlock:
	xfs_iunlock2_remapping(src, dest);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	/*
	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
	 * handle partial results -- either the whole remap succeeds, or we
	 * must say why it did not. In this case, any error should be returned
	 * to the caller.
	 */
	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
		return ret;
	return remapped > 0 ?
		remapped : ret;
}

/*
 * ->open for regular files: reject on shutdown, advertise NOWAIT/O_DIRECT
 * (and atomic write support where available) capabilities.
 */
STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
		return -EIO;
	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
	if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
	return generic_file_open(inode, file);
}

/*
 * ->open for directories: kick off readahead of the first data block, as
 * the next operation is almost certainly a readdir.
 */
STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	unsigned int	mode;
	int		error;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;
	error = generic_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_df.if_nextents > 0)
		error = xfs_dir3_data_readahead(ip, 0, 0);
	xfs_iunlock(ip, mode);
	return error;
}

/*
 * Don't bother propagating errors. We're just doing cleanup, and the caller
 * ignores the return value anyway.
 */
STATIC int
xfs_file_release(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	/*
	 * If this is a read-only mount or the file system has been shut down,
	 * don't generate I/O.
	 */
	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
		return 0;

	/*
	 * If we previously truncated this file and removed old data in the
	 * process, we want to initiate "early" writeout on the last close.
	 * This is an attempt to combat the notorious NULL files problem which
	 * is particularly noticeable from a truncate down, buffered (re-)write
	 * (delalloc), followed by a crash. What we are effectively doing here
	 * is significantly reducing the time window where we'd otherwise be
	 * exposed to that problem.
	 */
	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
		if (ip->i_delayed_blks > 0)
			filemap_flush(inode->i_mapping);
	}

	/*
	 * XFS aggressively preallocates post-EOF space to generate contiguous
	 * allocations for writers that append to the end of the file.
	 *
	 * To support workloads that close and reopen the file frequently,
	 * these preallocations usually persist after a close unless it is the
	 * first close for the inode. This is a tradeoff to generate tightly
	 * packed data layouts for unpacking tarballs or similar archives that
	 * write one file after another without going back to it while keeping
	 * the preallocation for files that have recurring open/write/close
	 * cycles.
	 *
	 * This heuristic is skipped for inodes with the append-only flag as
	 * that flag is rather pointless for inodes written only once.
	 *
	 * There is no point in freeing blocks here for open but unlinked files
	 * as they will be taken care of by the inactivation path soon.
	 *
	 * When releasing a read-only context, don't flush data or trim
	 * post-EOF blocks. This avoids open/read/close workloads from removing
	 * EOF blocks that other writers depend upon to reduce fragmentation.
	 *
	 * Inodes on the zoned RT device never have preallocations, so skip
	 * taking the locks below.
	 */
	if (!inode->i_nlink ||
	    !(file->f_mode & FMODE_WRITE) ||
	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
	    xfs_is_zoned_inode(ip))
		return 0;

	/*
	 * If we can't get the iolock just skip truncating the blocks past EOF
	 * because we could deadlock with the mmap_lock otherwise. We'll get
	 * another chance to drop them once the last reference to the inode is
	 * dropped, so we'll never leak blocks permanently.
	 */
	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (xfs_can_free_eofblocks(ip) &&
		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
			xfs_free_eofblocks(ip);
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	}

	return 0;
}

/* ->iterate_shared: feed directory entries to the VFS dir_context. */
STATIC int
xfs_file_readdir(
	struct file	*file,
	struct dir_context *ctx)
{
	struct inode	*inode = file_inode(file);
	xfs_inode_t	*ip = XFS_I(inode);
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass down the total size of the buffer
	 * we read into down to the filesystem. With the filldir concept
	 * it's not needed for correct information, but the XFS dir2 leaf
	 * code wants an estimate of the buffer size to calculate its
	 * readahead window and size the buffers used for mapping to
	 * physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size. For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);

	return xfs_readdir(NULL, ip, ctx, bufsize);
}

/*
 * ->llseek: SEEK_HOLE/SEEK_DATA go through the iomap seek helpers; all other
 * whence values fall through to the generic implementation.
 */
STATIC loff_t
xfs_file_llseek(
	struct file	*file,
	loff_t		offset,
	int		whence)
{
	struct inode		*inode = file->f_mapping->host;

	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
		return -EIO;

	switch (whence) {
	default:
		return generic_file_llseek(file, offset, whence);
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
		break;
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

/*
 * Service a DAX fault with the appropriate iomap ops; the caller holds the
 * MMAPLOCK.  Write faults that are not CoW faults use the DAX write ops.
 */
static inline vm_fault_t
xfs_dax_fault_locked(
	struct vm_fault		*vmf,
	unsigned int		order,
	bool			write_fault)
{
	vm_fault_t		ret;
	unsigned long		pfn;

	if (!IS_ENABLED(CONFIG_FS_DAX)) {
		ASSERT(0);
		return VM_FAULT_SIGBUS;
	}
	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
			(write_fault && !vmf->cow_page) ?
				&xfs_dax_write_iomap_ops :
				&xfs_read_iomap_ops);
	if (ret & VM_FAULT_NEEDDSYNC)
		ret = dax_finish_sync_fault(vmf, order, pfn);
	return ret;
}

/* DAX read fault: take the MMAPLOCK shared around the locked fault helper. */
static vm_fault_t
xfs_dax_read_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	vm_fault_t		ret;

	trace_xfs_read_fault(ip, order);

	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	ret = xfs_dax_fault_locked(vmf, order, false);
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);

	return ret;
}

/*
 * Locking for serialisation of IO during page faults. This results in a lock
 * ordering of:
 *
 * mmap_lock (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
static vm_fault_t
__xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
	vm_fault_t		ret;

	trace_xfs_write_fault(ip, order);

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/*
	 * Normally we only need the shared mmaplock, but if a reflink remap is
	 * in progress we take the exclusive lock to wait for the remap to
	 * finish before taking a write fault.
	 */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
		lock_mode = XFS_MMAPLOCK_EXCL;
	}

	if (IS_DAX(inode))
		ret = xfs_dax_fault_locked(vmf, order, true);
	else
		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
				ac);
	xfs_iunlock(ip, lock_mode);

	sb_end_pagefault(inode->i_sb);
	return ret;
}

/*
 * Write fault on a zoned RT file: reserve zone space covering the faulting
 * folio before taking the fault locks.
 */
static vm_fault_t
xfs_write_fault_zoned(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	unsigned int		len = folio_size(page_folio(vmf->page));
	struct xfs_zone_alloc_ctx ac = { };
	int			error;
	vm_fault_t		ret;

	/*
	 * This could over-allocate as it doesn't check for truncation.
	 *
	 * But as the overallocation is limited to less than a folio and will
	 * be released instantly that's just fine.
	 */
	error = xfs_zoned_space_reserve(ip->i_mount,
			XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
	if (error < 0)
		return vmf_fs_error(error);
	ret = __xfs_write_fault(vmf, order, &ac);
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	return ret;
}

/* Dispatch a write fault to the zoned or regular implementation. */
static vm_fault_t
xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
		return xfs_write_fault_zoned(vmf, order);
	return __xfs_write_fault(vmf, order, NULL);
}

/* A write fault to a shared mapping dirties the file - treat it as a write. */
static inline bool
xfs_is_write_fault(
	struct vm_fault		*vmf)
{
	return (vmf->flags & FAULT_FLAG_WRITE) &&
	       (vmf->vma->vm_flags & VM_SHARED);
}

/* ->fault: only DAX needs the write-fault path here; see comment below. */
static vm_fault_t
xfs_filemap_fault(
	struct vm_fault		*vmf)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);

	/* DAX can shortcut the normal fault path on write faults! */
	if (IS_DAX(inode)) {
		if (xfs_is_write_fault(vmf))
			return xfs_write_fault(vmf, 0);
		return xfs_dax_read_fault(vmf, 0);
	}

	trace_xfs_read_fault(XFS_I(inode), 0);
	return filemap_fault(vmf);
}

/* ->huge_fault: only supported for DAX mappings. */
static vm_fault_t
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
		return VM_FAULT_FALLBACK;

	/* DAX can shortcut the normal fault path on write faults! */
	if (xfs_is_write_fault(vmf))
		return xfs_write_fault(vmf, order);
	return xfs_dax_read_fault(vmf, order);
}

/* ->page_mkwrite: making a present page writable is a write fault. */
static vm_fault_t
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}

/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults. In reality, it needs to serialise against truncate and
 * prepare memory for writing so handle it as a standard write fault.
 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.huge_fault	= xfs_filemap_huge_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};

/*
 * ->mmap_prepare: validate MAP_SYNC support against the backing dax device,
 * install xfs_file_vm_ops, and enable huge pages for DAX mappings.
 */
STATIC int
xfs_file_mmap_prepare(
	struct vm_area_desc	*desc)
{
	struct file		*file = desc->file;
	struct inode		*inode = file_inode(file);
	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));

	/*
	 * We don't support synchronous mappings for non-DAX files and
	 * for DAX files if underneath dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
			target->bt_daxdev))
		return -EOPNOTSUPP;

	file_accessed(file);
	desc->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(inode))
		desc->vm_flags |= VM_HUGEPAGE;
	return 0;
}

const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= xfs_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap_prepare	= xfs_file_mmap_prepare,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.fadvise	= xfs_file_fadvise,
	.remap_file_range = xfs_file_remap_range,
	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
			  FOP_DONTCACHE,
	.setlease	= generic_setlease,
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
	.setlease	= generic_setlease,
};