// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
#include "xfs_aops.h"
#include "xfs_zone_alloc.h"

#include <linux/dax.h>
#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>
#include <linux/mount.h>

static const struct vm_operations_struct xfs_file_vm_ops;

/*
 * Decide if the given file range is aligned to the size of the fundamental
 * allocation unit for the file.
 */
bool
xfs_is_falloc_aligned(
	struct xfs_inode	*ip,
	loff_t			pos,
	long long int		len)
{
	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);

	if (!is_power_of_2(alloc_unit))
		return isaligned_64(pos, alloc_unit) &&
		       isaligned_64(len, alloc_unit);

	return !((pos | len) & (alloc_unit - 1));
}

/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush, and thus also no need for explicit
 * cache flush operations, and there are no non-transaction metadata updates
 * on directories either.
 */
STATIC int
xfs_dir_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);

	trace_xfs_dir_fsync(ip);
	return xfs_log_force_inode(ip);
}

/*
 * All metadata updates are logged, which means that we just have to push the
 * journal to the required sequence number that holds the updates. We track
 * datasync commits separately from full sync commits, and hence only need to
 * select the correct sequence number for the log force here.
 *
 * We don't have to serialise against concurrent modifications, as we do not
 * have to wait for modifications that have not yet completed. We define a
 * transaction commit as completing when the commit sequence number is updated,
 * hence if the sequence number has not updated, the sync operation has been
 * run before the commit completed and we don't have to wait for it.
 *
 * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
 * set on the log item until - at least - the journal flush completes. In
 * reality, they are only cleared when the inode is fully unpinned (i.e.
 * persistent in the journal and not dirty in the CIL), and so we rely on
 * xfs_log_force_seq() either skipping sequences that have been persisted or
 * waiting on sequences that are still in flight to correctly order concurrent
 * sync operations.
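 *
 * For example, an update that only touches the inode timestamps typically
 * bumps ili_commit_seq but not ili_datasync_seq, so a subsequent fdatasync()
 * sees a zero (or already persisted) datasync sequence and can return without
 * forcing the log, while a full fsync() still forces the log up to
 * ili_commit_seq.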
 */
static int
xfs_fsync_flush_log(
	struct xfs_inode	*ip,
	bool			datasync,
	int			*log_flushed)
{
	struct xfs_inode_log_item *iip = ip->i_itemp;
	xfs_csn_t		seq = 0;

	spin_lock(&iip->ili_lock);
	if (datasync)
		seq = iip->ili_datasync_seq;
	else
		seq = iip->ili_commit_seq;
	spin_unlock(&iip->ili_lock);

	if (!seq)
		return 0;

	return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
				 log_flushed);
}

STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	int			error, err2;
	int			log_flushed = 0;

	trace_xfs_file_fsync(ip);

	error = file_write_and_wait_range(file, start, end);
	if (error)
		return error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache of the device used for file data first.  This is to
	 * ensure newly written file data makes it to disk before logging the
	 * new inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);

	/*
	 * If the inode has an inode log item attached, it may need the journal
	 * flushed to persist any changes the log item might be tracking.
	 */
	if (ip->i_itemp) {
		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
		if (err2 && !error)
			error = err2;
	}

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp) {
		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
		if (err2 && !error)
			error = err2;
	}

	return error;
}

static int
xfs_ilock_iocb(
	struct kiocb		*iocb,
	unsigned int		lock_mode)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, lock_mode))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, lock_mode);
	}

	return 0;
}

static int
xfs_ilock_iocb_for_write(
	struct kiocb		*iocb,
	unsigned int		*lock_mode)
{
	ssize_t			ret;
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	ret = xfs_ilock_iocb(iocb, *lock_mode);
	if (ret)
		return ret;

	/*
	 * If a reflink remap is in progress we always need to take the iolock
	 * exclusively to wait for it to finish.
	 */
	if (*lock_mode == XFS_IOLOCK_SHARED &&
	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, *lock_mode);
		*lock_mode = XFS_IOLOCK_EXCL;
		return xfs_ilock_iocb(iocb, *lock_mode);
	}

	return 0;
}

STATIC ssize_t
xfs_file_dio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_direct_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	file_accessed(iocb->ki_filp);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

static noinline ssize_t
xfs_file_dax_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	ssize_t			ret = 0;

	trace_xfs_file_dax_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	file_accessed(iocb->ki_filp);
	return ret;
}

STATIC ssize_t
xfs_file_buffered_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_buffered_read(iocb, to);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = generic_file_read_iter(iocb, to);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_read(iocb, to);
	else if (iocb->ki_flags & IOCB_DIRECT)
		ret = xfs_file_dio_read(iocb, to);
	else
		ret = xfs_file_buffered_read(iocb, to);

	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

STATIC ssize_t
xfs_file_splice_read(
	struct file		*in,
	loff_t			*ppos,
	struct pipe_inode_info	*pipe,
	size_t			len,
	unsigned int		flags)
{
	struct inode		*inode = file_inode(in);
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	trace_xfs_file_splice_read(ip, *ppos, len);

	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	ret = filemap_splice_read(in, ppos, pipe, len, flags);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

/*
 * Take care of zeroing post-EOF blocks when they might exist.
 *
 * Returns 0 on success, a negative error for a failure, or 1 if this
 * function dropped the iolock and reacquired it exclusively and the caller
 * needs to restart the write sanity checks.
 */
static ssize_t
xfs_file_write_zero_eof(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock,
	size_t			count,
	bool			*drained_dio,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	loff_t			isize;
	int			error;

	/*
	 * We need to serialise against EOF updates that occur in IO
	 * completions here. We want to make sure that nobody is changing the
	 * size while we do this check until we have placed an IO barrier (i.e.
	 * hold XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
	 * The spinlock effectively forms a memory barrier once we have
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
	 * hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	isize = i_size_read(VFS_I(ip));
	if (iocb->ki_pos <= isize) {
		spin_unlock(&ip->i_flags_lock);
		return 0;
	}
	spin_unlock(&ip->i_flags_lock);

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EAGAIN;

	if (!*drained_dio) {
		/*
		 * If zeroing is needed and we are currently holding the iolock
		 * shared, we need to update it to exclusive which implies
		 * having to redo all checks before.
		 */
		if (*iolock == XFS_IOLOCK_SHARED) {
			xfs_iunlock(ip, *iolock);
			*iolock = XFS_IOLOCK_EXCL;
			xfs_ilock(ip, *iolock);
			iov_iter_reexpand(from, count);
		}

		/*
		 * We now have an IO submission barrier in place, but AIO can
		 * do EOF updates during IO completion and hence we now need to
		 * wait for all of them to drain.  Non-AIO DIO will have
		 * drained before we are given the XFS_IOLOCK_EXCL, and so for
		 * most cases this wait is a no-op.
		 */
		inode_dio_wait(VFS_I(ip));
		*drained_dio = true;
		return 1;
	}

	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);

	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);

	return error;
}

/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;
	ssize_t			error;

restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		error = break_layout(inode, false);
		if (error == -EWOULDBLOCK)
			error = -EAGAIN;
	} else {
		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
	}

	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(XFS_I(inode), *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		error = xfs_ilock_iocb(iocb, *iolock);
		if (error) {
			*iolock = 0;
			return error;
		}
		goto restart;
	}

	/*
	 * If the offset is beyond the size of the file, we need to zero all
	 * blocks that fall between the existing EOF and the start of this
	 * write.
	 *
	 * We can do an unlocked check for i_size here safely as I/O completion
	 * can only extend EOF.  Truncate is locked out at this point, so the
	 * EOF can not move backwards, only forwards.  Hence we only need to
	 * take the slow path when we are at or beyond the current EOF.
	 */
	if (iocb->ki_pos > i_size_read(inode)) {
		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
				&drained_dio, ac);
		if (error == 1)
			goto restart;
		if (error)
			return error;
	}

	return kiocb_modified(iocb);
}

static ssize_t
xfs_zoned_write_space_reserve(
	struct xfs_mount	*mp,
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		flags,
	struct xfs_zone_alloc_ctx *ac)
{
	loff_t			count = iov_iter_count(from);
	int			error;

	if (iocb->ki_flags & IOCB_NOWAIT)
		flags |= XFS_ZR_NOWAIT;

	/*
	 * Check the rlimit and LFS boundary first so that we don't
	 * over-reserve by possibly a lot.
	 *
	 * The generic write path will redo this check later, and it might have
	 * changed by then.  If it got expanded we'll stick to our earlier
	 * smaller limit, and if it is decreased the new smaller limit will be
	 * used and our extra space reservation will be returned after
	 * finishing the write.
	 */
	error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
	if (error)
		return error;

	/*
	 * Sloppily round up count to file system blocks.
	 *
	 * This will often reserve an extra block, but that avoids having to
	 * look at the start offset, which isn't stable for O_APPEND until
	 * taking the iolock.  Also we need to reserve a block each for zeroing
	 * the old EOF block and the new start block if they are unaligned.
	 *
	 * Any remaining blocks will be returned after the write.
	 */
	return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2,
			flags, ac);
}

static int
xfs_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,
	int			error,
	unsigned		flags)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			offset = iocb->ki_pos;
	unsigned int		nofs_flag;

	ASSERT(!xfs_is_zoned_inode(ip) ||
	       !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (error)
		return error;
	if (!size)
		return 0;

	/*
	 * Capture amount written on completion as we can't reliably account
	 * for it on submission.
	 */
	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim.  To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	if (flags & IOMAP_DIO_COW) {
		if (iocb->ki_flags & IOCB_ATOMIC)
			error = xfs_reflink_end_atomic_cow(ip, offset, size);
		else
			error = xfs_reflink_end_cow(ip, offset, size);
		if (error)
			goto out;
	}

	/*
	 * Unwritten conversion updates the in-core isize after extent
	 * conversion but before updating the on-disk size.  Updating isize any
	 * earlier allows a racing dio read to find unwritten extents before
	 * they are converted.
	 */
	if (flags & IOMAP_DIO_UNWRITTEN) {
		error = xfs_iomap_write_unwritten(ip, offset, size, true);
		goto out;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end
	 * up with the on-disk inode size being outside the in-core inode size.
	 * We have no other method of updating EOF for AIO, so always do it
	 * here if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF.  Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 *
	 * As IO completion only ever extends EOF, we can do an unlocked check
	 * here to avoid taking the spinlock.  If we land within the current
	 * EOF, then we do not need to do an extending update at all, and we
	 * don't need to take the lock to check this.  If we race with an
	 * update moving EOF, then we'll either still be beyond EOF and need to
	 * take the lock, or we'll be within EOF and we don't need to take it
	 * at all.
	 */
	if (offset + size <= i_size_read(inode))
		goto out;

	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		spin_unlock(&ip->i_flags_lock);
		error = xfs_setfilesize(ip, offset, size);
	} else {
		spin_unlock(&ip->i_flags_lock);
	}

out:
	memalloc_nofs_restore(nofs_flag);
	return error;
}

static const struct iomap_dio_ops xfs_dio_write_ops = {
	.end_io		= xfs_dio_write_end_io,
};

static void
xfs_dio_zoned_submit_io(
	const struct iomap_iter	*iter,
	struct bio		*bio,
	loff_t			file_offset)
{
	struct xfs_mount	*mp = XFS_I(iter->inode)->i_mount;
	struct xfs_zone_alloc_ctx *ac = iter->private;
	xfs_filblks_t		count_fsb;
	struct iomap_ioend	*ioend;

	count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
	if (count_fsb > ac->reserved_blocks) {
		xfs_err(mp,
			"allocation (%lld) larger than reservation (%lld).",
			count_fsb, ac->reserved_blocks);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		bio_io_error(bio);
		return;
	}
	ac->reserved_blocks -= count_fsb;

	bio->bi_end_io = xfs_end_bio;
	ioend = iomap_init_ioend(iter->inode, bio, file_offset,
			IOMAP_IOEND_DIRECT);
	xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
}

static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
	.bio_set	= &iomap_ioend_bioset,
	.submit_io	= xfs_dio_zoned_submit_io,
	.end_io		= xfs_dio_write_end_io,
};

/*
 * Handle block aligned direct I/O writes.
 */
static noinline ssize_t
xfs_file_dio_write_aligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from,
	const struct iomap_ops	*ops,
	const struct iomap_dio_ops *dops,
	struct xfs_zone_alloc_ctx *ac)
{
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	ssize_t			ret;

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock, ac);
	if (ret)
		goto out_unlock;

	/*
	 * We don't need to hold the IOLOCK exclusively across the IO, so
	 * demote the iolock back to shared if we had to take the exclusive
	 * lock in xfs_file_write_checks() for other reasons.
	 */
	if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}
	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
out_unlock:
	xfs_iunlock(ip, iolock);
	return ret;
}

/*
 * Handle block aligned direct I/O writes to zoned devices.
 */
static noinline ssize_t
xfs_file_dio_write_zoned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_zone_alloc_ctx ac = { };
	ssize_t			ret;

	ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac);
	if (ret < 0)
		return ret;
	ret = xfs_file_dio_write_aligned(ip, iocb, from,
			&xfs_zoned_direct_write_iomap_ops,
			&xfs_dio_zoned_write_ops, &ac);
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	return ret;
}

/*
 * Handle block atomic writes
 *
 * Two methods of atomic writes are supported:
 * - REQ_ATOMIC-based, which would typically use some form of HW offload in
 *   the disk
 * - COW-based, which uses a COW fork as a staging extent for data updates
 *   before atomically updating extent mappings for the range being written
 */
static noinline ssize_t
xfs_file_dio_write_atomic(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	ssize_t			ret, ocount = iov_iter_count(from);
	const struct iomap_ops	*dops;

	/*
	 * HW offload should be faster, so try that first if it is already
	 * known that the write length is not too large.
	 */
	if (ocount > xfs_inode_buftarg(ip)->bt_awu_max)
		dops = &xfs_atomic_write_cow_iomap_ops;
	else
		dops = &xfs_direct_write_iomap_ops;

retry:
	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out_unlock;

	/* Demote similar to xfs_file_dio_write_aligned() */
	if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
			0, NULL, 0);

	/*
	 * The retry mechanism is based on the ->iomap_begin method returning
	 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
	 * possible.  The REQ_ATOMIC-based method will typically not be
	 * possible if the write spans multiple extents or the disk blocks are
	 * misaligned.
	 */
	if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
		xfs_iunlock(ip, iolock);
		dops = &xfs_atomic_write_cow_iomap_ops;
		goto retry;
	}

out_unlock:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}

/*
 * Handle block unaligned direct I/O writes
 *
 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
 * them to be done in parallel with reads and other direct I/O writes. However,
 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may
 * need to do sub-block zeroing and that requires serialisation against other
 * direct I/O to the same block.  In this case we need to serialise the
 * submission of the unaligned I/O so that we don't get racing block zeroing in
 * the dio layer.  In the case where sub-block zeroing is not required, we can
 * do concurrent sub-block dios to the same block successfully.
 *
 * Optimistically submit the I/O using the shared lock first, but use the
 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
 * if block allocation or partial block zeroing would be required.  In that
 * case we try again with the exclusive lock.
 */
static noinline ssize_t
xfs_file_dio_write_unaligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	size_t			isize = i_size_read(VFS_I(ip));
	size_t			count = iov_iter_count(from);
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
	ssize_t			ret;

	/*
	 * Extending writes need exclusivity because of the sub-block zeroing
	 * that the DIO code always does for partial tail blocks beyond EOF, so
	 * don't even bother trying the fast path in this case.
	 */
	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
retry_exclusive:
		iolock = XFS_IOLOCK_EXCL;
		flags = IOMAP_DIO_FORCE_WAIT;
	}

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;

	/*
	 * We can't properly handle unaligned direct I/O to reflink files yet,
	 * as we can't unshare a partial block.
	 */
	if (xfs_is_cow_inode(ip)) {
		trace_xfs_reflink_bounce_dio_write(iocb, from);
		ret = -ENOTBLK;
		goto out_unlock;
	}

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out_unlock;

	/*
	 * If we are doing exclusive unaligned I/O, this must be the only I/O
	 * in-flight.  Otherwise we risk data corruption due to unwritten
	 * extent conversions from the AIO end_io handler.  Wait for all other
	 * I/O to drain first.
	 */
	if (flags & IOMAP_DIO_FORCE_WAIT)
		inode_dio_wait(VFS_I(ip));

	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			&xfs_dio_write_ops, flags, NULL, 0);

	/*
	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
	 * layer rejected it for mapping or locking reasons.  If we are doing
	 * nonblocking user I/O, propagate the error.
	 */
	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
		xfs_iunlock(ip, iolock);
		goto retry_exclusive;
	}

out_unlock:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}

static ssize_t
xfs_file_dio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
	size_t			count = iov_iter_count(from);

	/* direct I/O must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
		return -EINVAL;

	/*
	 * For always COW inodes we also must check the alignment of each
	 * individual iovec segment, as they could end up with different
	 * I/Os due to the way bio_iov_iter_get_pages works, and we'd
	 * then overwrite an already written block.
	 */
	if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
	    (xfs_is_always_cow_inode(ip) &&
	     (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
		return xfs_file_dio_write_unaligned(ip, iocb, from);
	if (xfs_is_zoned_inode(ip))
		return xfs_file_dio_write_zoned(ip, iocb, from);
	if (iocb->ki_flags & IOCB_ATOMIC)
		return xfs_file_dio_write_atomic(ip, iocb, from);
	return xfs_file_dio_write_aligned(ip, iocb, from,
			&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}

static noinline ssize_t
xfs_file_dax_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	ssize_t			ret, error = 0;
	loff_t			pos;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

	pos = iocb->ki_pos;

	trace_xfs_file_dax_write(iocb, from);
	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
	}
out:
	if (iolock)
		xfs_iunlock(ip, iolock);
	if (error)
		return error;

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_buffered_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	bool			cleared_space = false;
	unsigned int		iolock;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
			NULL);

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error. In the case of ENOSPC, first try to
	 * write back all dirty inodes to free up some of the excess reserved
	 * metadata space. This reduces the chances that the eofblocks scan
	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
	 * also behaves as a filter to prevent too many eofblocks scans from
	 * running at the same time.  Use a synchronous scan to increase the
	 * effectiveness of the scan.
	 */
	if (ret == -EDQUOT && !cleared_space) {
		xfs_iunlock(ip, iolock);
		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
		cleared_space = true;
		goto write_retry;
	} else if (ret == -ENOSPC && !cleared_space) {
		struct xfs_icwalk	icw = {0};

		cleared_space = true;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);
		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
		xfs_blockgc_free_space(ip->i_mount, &icw);
		goto write_retry;
	}

out:
	if (iolock)
		xfs_iunlock(ip, iolock);

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_buffered_write_zoned(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	bool			cleared_space = false;
	struct xfs_zone_alloc_ctx ac = { };
	ssize_t			ret;

	ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
	if (ret < 0)
		return ret;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		goto out_unreserve;

	ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
	if (ret)
		goto out_unlock;

	/*
	 * Truncate the iter to the length that we were actually able to
	 * allocate blocks for.  This needs to happen after
	 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
	 * writes.
	 */
	iov_iter_truncate(from,
			XFS_FSB_TO_B(mp, ac.reserved_blocks) -
			(iocb->ki_pos & mp->m_blockmask));
	if (!iov_iter_count(from))
		goto out_unlock;

retry:
	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
			&ac);
	if (ret == -ENOSPC && !cleared_space) {
		/*
		 * Kick off writeback to convert delalloc space and release the
		 * usually too pessimistic indirect block reservations.
		 */
		xfs_flush_inodes(mp);
		cleared_space = true;
		goto retry;
	}

out_unlock:
	xfs_iunlock(ip, iolock);
out_unreserve:
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	if (ret > 0) {
		XFS_STATS_ADD(mp, xs_write_bytes, ret);
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (iocb->ki_flags & IOCB_ATOMIC) {
		if (ocount < xfs_get_atomic_write_min(ip))
			return -EINVAL;

		if (ocount > xfs_get_atomic_write_max(ip))
			return -EINVAL;

		ret = generic_atomic_write_valid(iocb, from);
		if (ret)
			return ret;
	}

	if (IS_DAX(inode))
		return xfs_file_dax_write(iocb, from);

	if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW.  In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_write(iocb, from);
		if (ret != -ENOTBLK)
			return ret;
	}

	if (xfs_is_zoned_inode(ip))
		return xfs_file_buffered_write_zoned(iocb, from);
	return xfs_file_buffered_write(iocb, from);
}

/* Does this file, inode, or mount want synchronous writes? */
static inline bool xfs_file_sync_writes(struct file *filp)
{
	struct xfs_inode	*ip = XFS_I(file_inode(filp));

	if (xfs_has_wsync(ip->i_mount))
		return true;
	if (filp->f_flags & (__O_SYNC | O_DSYNC))
		return true;
	if (IS_SYNC(file_inode(filp)))
		return true;

	return false;
}

static int
xfs_falloc_newsize(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	loff_t			*new_size)
{
	struct inode		*inode = file_inode(file);

	if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
		return 0;
	*new_size = offset + len;
	return inode_newsize_ok(inode, *new_size);
}

static int
xfs_falloc_setsize(
	struct file		*file,
	loff_t			new_size)
{
	struct iattr		iattr = {
		.ia_valid	= ATTR_SIZE,
		.ia_size	= new_size,
	};

	if (!new_size)
		return 0;
	return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
			&iattr);
}

static int
xfs_falloc_collapse_range(
	struct file		*file,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = i_size_read(inode) - len;
	int			error;

	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
		return -EINVAL;

	/*
	 * There is no need for a collapse range to overlap EOF; in that case
	 * it is effectively a truncate operation.
	 */
	if (offset + len >= i_size_read(inode))
		return -EINVAL;

	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

static int
xfs_falloc_insert_range(
	struct file		*file,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			isize = i_size_read(inode);
	int			error;

	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
		return -EINVAL;

	/*
	 * New inode size must not exceed ->s_maxbytes, accounting for
	 * possible signed overflow.
	 */
	if (inode->i_sb->s_maxbytes - isize < len)
		return -EFBIG;

	/* Offset should be less than i_size */
	if (offset >= isize)
		return -EINVAL;

	error = xfs_falloc_setsize(file, isize + len);
	if (error)
		return error;

	/*
	 * Perform hole insertion now that the file size has been updated so
	 * that if we crash during the operation we don't leave shifted extents
	 * past EOF and hence lose access to the data that is contained within
	 * them.
	 */
	return xfs_insert_file_space(XFS_I(inode), offset, len);
}

/*
 * Punch a hole and prealloc the range.  We use a hole punch rather than
 * unwritten extent conversion for two reasons:
 *
 * 1.) Hole punch handles partial block zeroing for us.
 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by
 *     virtue of the hole punch.
 */
static int
xfs_falloc_zero_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	unsigned int		blksize = i_blocksize(inode);
	loff_t			new_size = 0;
	int			error;

	trace_xfs_zero_file_space(XFS_I(inode));

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
	if (error)
		return error;

	len = round_up(offset + len, blksize) - round_down(offset, blksize);
	offset = round_down(offset, blksize);
	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

static int
xfs_falloc_unshare_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = 0;
	int			error;

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
	if (error)
		return error;

	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

static int
xfs_falloc_allocate_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = 0;
	int			error;

	/*
	 * In always_cow mode we can't use preallocations and thus should not
	 * create them.
	 */
	if (xfs_is_always_cow_inode(XFS_I(inode)))
		return -EOPNOTSUPP;

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

#define	XFS_FALLOC_FL_SUPPORTED						\
	(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |		\
	 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |		\
	 FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE |		\
	 FALLOC_FL_UNSHARE_RANGE)

STATIC long
__xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
	if (error)
		goto out_unlock;

	/*
	 * Must wait for all AIO to complete before we continue as AIO can
	 * change the file size on completion without holding any locks we
	 * currently hold.  We must do this first because AIO can update both
	 * the on disk and in memory inode sizes, and the operations that
	 * follow require the in-memory size to be fully up-to-date.
	 */
	inode_dio_wait(inode);

	error = file_modified(file);
	if (error)
		goto out_unlock;

	switch (mode & FALLOC_FL_MODE_MASK) {
	case FALLOC_FL_PUNCH_HOLE:
		error = xfs_free_file_space(ip, offset, len, ac);
		break;
	case FALLOC_FL_COLLAPSE_RANGE:
		error = xfs_falloc_collapse_range(file, offset, len, ac);
		break;
	case FALLOC_FL_INSERT_RANGE:
		error = xfs_falloc_insert_range(file, offset, len);
		break;
	case FALLOC_FL_ZERO_RANGE:
		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
		break;
	case FALLOC_FL_UNSHARE_RANGE:
		error = xfs_falloc_unshare_range(file, mode, offset, len);
		break;
	case FALLOC_FL_ALLOCATE_RANGE:
		error = xfs_falloc_allocate_range(file, mode, offset, len);
		break;
	default:
		error = -EOPNOTSUPP;
		break;
	}

	if (!error && xfs_file_sync_writes(file))
		error = xfs_log_force_inode(ip);

out_unlock:
	xfs_iunlock(ip, iolock);
	return error;
}

static long
xfs_file_zoned_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct xfs_zone_alloc_ctx ac = { };
	struct xfs_inode	*ip = XFS_I(file_inode(file));
	int			error;

	error = xfs_zoned_space_reserve(ip->i_mount, 2, XFS_ZR_RESERVED, &ac);
	if (error)
		return error;
	error = __xfs_file_fallocate(file, mode, offset, len, &ac);
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	return error;
}

static long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	/*
	 * For zoned file systems, zeroing the first and last block of a hole
	 * punch requires allocating a new block to rewrite the remaining data
	 * and new zeroes out of place.  Get a reservation for those before
	 * taking the iolock.
	 * Dip into the reserved pool because we are expected to be able to
	 * punch a hole even on a completely full file system.
	 */
	if (xfs_is_zoned_inode(XFS_I(inode)) &&
	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
		     FALLOC_FL_COLLAPSE_RANGE)))
		return xfs_file_zoned_fallocate(file, mode, offset, len);
	return __xfs_file_fallocate(file, mode, offset, len, NULL);
}

STATIC int
xfs_file_fadvise(
	struct file	*file,
	loff_t		start,
	loff_t		end,
	int		advice)
{
	struct xfs_inode *ip = XFS_I(file_inode(file));
	int		ret;
	int		lockflags = 0;

	/*
	 * Operations creating pages in page cache need protection from hole
	 * punching and similar ops
	 */
	if (advice == POSIX_FADV_WILLNEED) {
		lockflags = XFS_IOLOCK_SHARED;
		xfs_ilock(ip, lockflags);
	}
	ret = generic_fadvise(file, start, end, advice);
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	return ret;
}

STATIC loff_t
xfs_file_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	loff_t			remapped = 0;
	xfs_extlen_t		cowextsize;
	int			ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (!xfs_has_reflink(mp))
		return -EOPNOTSUPP;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* Prepare and then clone file data. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, remap_flags);
	if (ret || len == 0)
		return ret;

	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
			&remapped);
	if (ret)
		goto out_unlock;

	/*
	 * Carry the cowextsize hint from src to dest if we're sharing the
	 * entire source file to the entire destination file, the source file
	 * has a cowextsize hint, and the destination file does not.
	 */
	cowextsize = 0;
	if (pos_in == 0 && len == i_size_read(inode_in) &&
	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
	    pos_out == 0 && len >= i_size_read(inode_out) &&
	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
		cowextsize = src->i_cowextsize;

	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
			remap_flags);
	if (ret)
		goto out_unlock;

	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
		xfs_log_force_inode(dest);
out_unlock:
	xfs_iunlock2_remapping(src, dest);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	/*
	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
	 * handle partial results -- either the whole remap succeeds, or we
	 * must say why it did not.  In this case, any error should be returned
	 * to the caller.
	 */
	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
		return ret;
	return remapped > 0 ? remapped : ret;
}

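/*
 * A brief, illustrative note on the f_mode flags set up below (generic VFS
 * semantics, not specific to XFS): FMODE_NOWAIT enables RWF_NOWAIT I/O,
 * FMODE_CAN_ODIRECT enables O_DIRECT on this file, and
 * FMODE_CAN_ATOMIC_WRITE lets userspace issue untorn writes, e.g.:
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = len };
 *	pwritev2(fd, &iov, 1, offset, RWF_ATOMIC);
 *
 * with len between the minimum and maximum atomic write sizes reported by
 * statx().
 */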
STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
		return -EIO;
	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
	if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
	return generic_file_open(inode, file);
}

STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	unsigned int	mode;
	int		error;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;
	error = generic_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_df.if_nextents > 0)
		error = xfs_dir3_data_readahead(ip, 0, 0);
	xfs_iunlock(ip, mode);
	return error;
}

/*
 * Don't bother propagating errors.  We're just doing cleanup, and the caller
 * ignores the return value anyway.
 */
STATIC int
xfs_file_release(
	struct inode		*inode,
	struct file		*file)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	/*
	 * If this is a read-only mount or the file system has been shut down,
	 * don't generate I/O.
	 */
	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
		return 0;

	/*
	 * If we previously truncated this file and removed old data in the
	 * process, we want to initiate "early" writeout on the last close.
	 * This is an attempt to combat the notorious NULL files problem which
	 * is particularly noticeable from a truncate down, buffered (re-)write
	 * (delalloc), followed by a crash.  What we are effectively doing here
	 * is significantly reducing the time window where we'd otherwise be
	 * exposed to that problem.
	 */
	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
		if (ip->i_delayed_blks > 0)
			filemap_flush(inode->i_mapping);
	}

	/*
	 * XFS aggressively preallocates post-EOF space to generate contiguous
	 * allocations for writers that append to the end of the file.
	 *
	 * To support workloads that close and reopen the file frequently,
	 * these preallocations usually persist after a close unless it is the
	 * first close for the inode.  This is a tradeoff to generate tightly
	 * packed data layouts for unpacking tarballs or similar archives that
	 * write one file after another without going back to it while keeping
	 * the preallocation for files that have recurring open/write/close
	 * cycles.
	 *
	 * This heuristic is skipped for inodes with the append-only flag as
	 * that flag is rather pointless for inodes written only once.
	 *
	 * There is no point in freeing blocks here for open but unlinked files
	 * as they will be taken care of by the inactivation path soon.
	 *
	 * When releasing a read-only context, don't flush data or trim
	 * post-EOF blocks.  This avoids open/read/close workloads removing
	 * EOF blocks that other writers depend upon to reduce fragmentation.
	 *
	 * Inodes on the zoned RT device never have preallocations, so skip
	 * taking the locks below.
	 */
	if (!inode->i_nlink ||
	    !(file->f_mode & FMODE_WRITE) ||
	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
	    xfs_is_zoned_inode(ip))
		return 0;

	/*
	 * If we can't get the iolock just skip truncating the blocks past EOF
	 * because we could deadlock with the mmap_lock otherwise.  We'll get
	 * another chance to drop them once the last reference to the inode is
	 * dropped, so we'll never leak blocks permanently.
	 */
	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (xfs_can_free_eofblocks(ip) &&
		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
			xfs_free_eofblocks(ip);
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	}

	return 0;
}

STATIC int
xfs_file_readdir(
	struct file	*file,
	struct dir_context *ctx)
{
	struct inode	*inode = file_inode(file);
	xfs_inode_t	*ip = XFS_I(inode);
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass the total size of the buffer we read
	 * into down to the filesystem.  With the filldir concept it's not
	 * needed for correct information, but the XFS dir2 leaf code wants an
	 * estimate of the buffer size to calculate its readahead window and
	 * size the buffers used for mapping to physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size.  For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);

	return xfs_readdir(NULL, ip, ctx, bufsize);
}

STATIC loff_t
xfs_file_llseek(
	struct file	*file,
	loff_t		offset,
	int		whence)
{
	struct inode		*inode = file->f_mapping->host;

	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
		return -EIO;

	switch (whence) {
	default:
		return generic_file_llseek(file, offset, whence);
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
		break;
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

static inline vm_fault_t
xfs_dax_fault_locked(
	struct vm_fault		*vmf,
	unsigned int		order,
	bool			write_fault)
{
	vm_fault_t		ret;
	unsigned long		pfn;

	if (!IS_ENABLED(CONFIG_FS_DAX)) {
		ASSERT(0);
		return VM_FAULT_SIGBUS;
	}
	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
			(write_fault && !vmf->cow_page) ?
				&xfs_dax_write_iomap_ops :
				&xfs_read_iomap_ops);
	if (ret & VM_FAULT_NEEDDSYNC)
		ret = dax_finish_sync_fault(vmf, order, pfn);
	return ret;
}

static vm_fault_t
xfs_dax_read_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	vm_fault_t		ret;

	trace_xfs_read_fault(ip, order);

	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	ret = xfs_dax_fault_locked(vmf, order, false);
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);

	return ret;
}
/*
 * Locking for serialisation of IO during page faults.  This results in a lock
 * ordering of:
 *
 * mmap_lock (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
static vm_fault_t
__xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
	vm_fault_t		ret;

	trace_xfs_write_fault(ip, order);

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/*
	 * Normally we only need the shared mmaplock, but if a reflink remap is
	 * in progress we take the exclusive lock to wait for the remap to
	 * finish before taking a write fault.
	 */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
		lock_mode = XFS_MMAPLOCK_EXCL;
	}

	if (IS_DAX(inode))
		ret = xfs_dax_fault_locked(vmf, order, true);
	else
		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
				ac);
	xfs_iunlock(ip, lock_mode);

	sb_end_pagefault(inode->i_sb);
	return ret;
}

static vm_fault_t
xfs_write_fault_zoned(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	unsigned int		len = folio_size(page_folio(vmf->page));
	struct xfs_zone_alloc_ctx ac = { };
	int			error;
	vm_fault_t		ret;

	/*
	 * This could over-allocate as it doesn't check for truncation.
	 *
	 * But as the overallocation is limited to less than a folio and will
	 * be released instantly that's just fine.
	 */
	error = xfs_zoned_space_reserve(ip->i_mount,
			XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
	if (error < 0)
		return vmf_fs_error(error);
	ret = __xfs_write_fault(vmf, order, &ac);
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	return ret;
}

static vm_fault_t
xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
		return xfs_write_fault_zoned(vmf, order);
	return __xfs_write_fault(vmf, order, NULL);
}

static inline bool
xfs_is_write_fault(
	struct vm_fault		*vmf)
{
	return (vmf->flags & FAULT_FLAG_WRITE) &&
	       (vmf->vma->vm_flags & VM_SHARED);
}

static vm_fault_t
xfs_filemap_fault(
	struct vm_fault		*vmf)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);

	/* DAX can shortcut the normal fault path on write faults! */
	if (IS_DAX(inode)) {
		if (xfs_is_write_fault(vmf))
			return xfs_write_fault(vmf, 0);
		return xfs_dax_read_fault(vmf, 0);
	}

	trace_xfs_read_fault(XFS_I(inode), 0);
	return filemap_fault(vmf);
}

static vm_fault_t
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
		return VM_FAULT_FALLBACK;

	/* DAX can shortcut the normal fault path on write faults! */
	if (xfs_is_write_fault(vmf))
		return xfs_write_fault(vmf, order);
	return xfs_dax_read_fault(vmf, order);
}

static vm_fault_t
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}

/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults.  In reality, it needs to serialise against truncate and
 * prepare memory for writing so handle it as a standard write fault.
 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.huge_fault	= xfs_filemap_huge_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};

STATIC int
xfs_file_mmap_prepare(
	struct vm_area_desc	*desc)
{
	struct file		*file = desc->file;
	struct inode		*inode = file_inode(file);
	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));

	/*
	 * We don't support synchronous mappings for non-DAX files and
	 * for DAX files if the underlying dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
				      target->bt_daxdev))
		return -EOPNOTSUPP;

	file_accessed(file);
	desc->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(inode))
		desc->vm_flags |= VM_HUGEPAGE;
	return 0;
}

const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= xfs_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap_prepare	= xfs_file_mmap_prepare,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.fadvise	= xfs_file_fadvise,
	.remap_file_range = xfs_file_remap_range,
	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
			  FOP_DONTCACHE,
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
};