// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
#include "xfs_aops.h"
#include "xfs_zone_alloc.h"

#include <linux/dax.h>
#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>
#include <linux/mount.h>

static const struct vm_operations_struct xfs_file_vm_ops;

/*
 * Decide if the given file range is aligned to the size of the fundamental
 * allocation unit for the file.
 */
bool
xfs_is_falloc_aligned(
	struct xfs_inode	*ip,
	loff_t			pos,
	long long int		len)
{
	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);

	if (!is_power_of_2(alloc_unit))
		return isaligned_64(pos, alloc_unit) &&
		       isaligned_64(len, alloc_unit);

	return !((pos | len) & (alloc_unit - 1));
}

/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush, and thus also no need for explicit
 * cache flush operations, and there are no non-transaction metadata updates
 * on directories either.
 */
STATIC int
xfs_dir_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);

	trace_xfs_dir_fsync(ip);
	return xfs_log_force_inode(ip);
}

static xfs_csn_t
xfs_fsync_seq(
	struct xfs_inode	*ip,
	bool			datasync)
{
	if (!xfs_ipincount(ip))
		return 0;
	if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
		return 0;
	return ip->i_itemp->ili_commit_seq;
}

/*
 * All metadata updates are logged, which means that we just have to flush the
 * log up to the latest LSN that touched the inode.
 *
 * If we have concurrent fsync/fdatasync() calls, we need them to all block on
 * the log force before we clear the ili_fsync_fields field. This ensures that
 * we don't get a racing sync operation that does not wait for the metadata to
 * hit the journal before returning.  If we race with clearing ili_fsync_fields,
 * then all that will happen is the log force will do nothing as the lsn will
 * already be on disk.  We can't race with setting ili_fsync_fields because that
 * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
 * shared until after the ili_fsync_fields is cleared.
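 *
 * Note that only fdatasync() can skip the log force entirely: if the logged
 * changes are nothing but timestamp updates, xfs_fsync_seq() above returns 0
 * and no flush is issued, while a full fsync() always pushes the inode's
 * commit sequence to disk.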
 */
static int
xfs_fsync_flush_log(
	struct xfs_inode	*ip,
	bool			datasync,
	int			*log_flushed)
{
	int			error = 0;
	xfs_csn_t		seq;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	seq = xfs_fsync_seq(ip, datasync);
	if (seq) {
		error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
					  log_flushed);

		spin_lock(&ip->i_itemp->ili_lock);
		ip->i_itemp->ili_fsync_fields = 0;
		spin_unlock(&ip->i_itemp->ili_lock);
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return error;
}

STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	int			error, err2;
	int			log_flushed = 0;

	trace_xfs_file_fsync(ip);

	error = file_write_and_wait_range(file, start, end);
	if (error)
		return error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache of the device used for file data first.  This is to
	 * ensure newly written file data makes it to disk before logging the
	 * new inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);

	/*
	 * Any inode that has dirty modifications in the log is pinned.  The
	 * racy check here for a pinned inode will not catch modifications
	 * that happen concurrently to the fsync call, but fsync semantics
	 * only require us to sync previously completed I/O.
	 */
	if (xfs_ipincount(ip)) {
		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
		if (err2 && !error)
			error = err2;
	}

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp) {
		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
		if (err2 && !error)
			error = err2;
	}

	return error;
}

static int
xfs_ilock_iocb(
	struct kiocb		*iocb,
	unsigned int		lock_mode)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, lock_mode))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, lock_mode);
	}

	return 0;
}

static int
xfs_ilock_iocb_for_write(
	struct kiocb		*iocb,
	unsigned int		*lock_mode)
{
	ssize_t			ret;
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	ret = xfs_ilock_iocb(iocb, *lock_mode);
	if (ret)
		return ret;

	/*
	 * If a reflink remap is in progress we always need to take the iolock
	 * exclusively to wait for it to finish.
	 */
	if (*lock_mode == XFS_IOLOCK_SHARED &&
	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, *lock_mode);
		*lock_mode = XFS_IOLOCK_EXCL;
		return xfs_ilock_iocb(iocb, *lock_mode);
	}

	return 0;
}

STATIC ssize_t
xfs_file_dio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_direct_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	file_accessed(iocb->ki_filp);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

static noinline ssize_t
xfs_file_dax_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	ssize_t			ret = 0;

	trace_xfs_file_dax_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	file_accessed(iocb->ki_filp);
	return ret;
}

STATIC ssize_t
xfs_file_buffered_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_buffered_read(iocb, to);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = generic_file_read_iter(iocb, to);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_read(iocb, to);
	else if (iocb->ki_flags & IOCB_DIRECT)
		ret = xfs_file_dio_read(iocb, to);
	else
		ret = xfs_file_buffered_read(iocb, to);

	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

STATIC ssize_t
xfs_file_splice_read(
	struct file		*in,
	loff_t			*ppos,
	struct pipe_inode_info	*pipe,
	size_t			len,
	unsigned int		flags)
{
	struct inode		*inode = file_inode(in);
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	trace_xfs_file_splice_read(ip, *ppos, len);

	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	ret = filemap_splice_read(in, ppos, pipe, len, flags);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

/*
 * Take care of zeroing post-EOF blocks when they might exist.
 *
 * Returns 0 on success, a negative error for a failure, or 1 if this
 * function dropped the iolock and reacquired it exclusively and the caller
 * needs to restart the write sanity checks.
 */
static ssize_t
xfs_file_write_zero_eof(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock,
	size_t			count,
	bool			*drained_dio,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	loff_t			isize;
	int			error;

	/*
	 * We need to serialise against EOF updates that occur in IO completions
	 * here. We want to make sure that nobody is changing the size while
	 * we do this check until we have placed an IO barrier (i.e. hold
	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
	 * spinlock effectively forms a memory barrier once we have
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
	 * hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	isize = i_size_read(VFS_I(ip));
	if (iocb->ki_pos <= isize) {
		spin_unlock(&ip->i_flags_lock);
		return 0;
	}
	spin_unlock(&ip->i_flags_lock);

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EAGAIN;

	if (!*drained_dio) {
		/*
		 * If zeroing is needed and we are currently holding the iolock
		 * shared, we need to update it to exclusive, which implies
		 * having to redo all checks made so far.
		 */
		if (*iolock == XFS_IOLOCK_SHARED) {
			xfs_iunlock(ip, *iolock);
			*iolock = XFS_IOLOCK_EXCL;
			xfs_ilock(ip, *iolock);
			iov_iter_reexpand(from, count);
		}

		/*
		 * We now have an IO submission barrier in place, but AIO can do
		 * EOF updates during IO completion and hence we now need to
		 * wait for all of them to drain.  Non-AIO DIO will have drained
		 * before we are given the XFS_IOLOCK_EXCL, and so for most
		 * cases this wait is a no-op.
		 */
		inode_dio_wait(VFS_I(ip));
		*drained_dio = true;
		return 1;
	}

	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);

	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);

	return error;
}

/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;
	ssize_t			error;

restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		error = break_layout(inode, false);
		if (error == -EWOULDBLOCK)
			error = -EAGAIN;
	} else {
		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
	}

	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
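	 * In XFS the iolock is implemented on top of i_rwsem, so upgrading to
	 * XFS_IOLOCK_EXCL below provides exactly that exclusivity.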
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(XFS_I(inode), *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		error = xfs_ilock_iocb(iocb, *iolock);
		if (error) {
			*iolock = 0;
			return error;
		}
		goto restart;
	}

	/*
	 * If the offset is beyond the size of the file, we need to zero all
	 * blocks that fall between the existing EOF and the start of this
	 * write.
	 *
	 * We can do an unlocked check for i_size here safely as I/O completion
	 * can only extend EOF.  Truncate is locked out at this point, so the
	 * EOF can not move backwards, only forwards.  Hence we only need to
	 * take the slow path when we are at or beyond the current EOF.
	 */
	if (iocb->ki_pos > i_size_read(inode)) {
		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
				&drained_dio, ac);
		if (error == 1)
			goto restart;
		if (error)
			return error;
	}

	return kiocb_modified(iocb);
}

static ssize_t
xfs_zoned_write_space_reserve(
	struct xfs_mount	*mp,
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		flags,
	struct xfs_zone_alloc_ctx *ac)
{
	loff_t			count = iov_iter_count(from);
	int			error;

	if (iocb->ki_flags & IOCB_NOWAIT)
		flags |= XFS_ZR_NOWAIT;

	/*
	 * Check the rlimit and LFS boundary first so that we don't over-reserve
	 * by possibly a lot.
	 *
	 * The generic write path will redo this check later, and it might have
	 * changed by then.  If it got expanded we'll stick to our earlier
	 * smaller limit, and if it is decreased the new smaller limit will be
	 * used and our extra space reservation will be returned after finishing
	 * the write.
	 */
	error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
	if (error)
		return error;

	/*
	 * Sloppily round up count to file system blocks.
	 *
	 * This will often reserve an extra block, but that avoids having to look
	 * at the start offset, which isn't stable for O_APPEND until taking the
	 * iolock.  Also we need to reserve a block each for zeroing the old
	 * EOF block and the new start block if they are unaligned.
	 *
	 * Any remaining blocks will be returned after the write.
	 */
	return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2,
			flags, ac);
}

static int
xfs_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,
	int			error,
	unsigned		flags)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			offset = iocb->ki_pos;
	unsigned int		nofs_flag;

	ASSERT(!xfs_is_zoned_inode(ip) ||
	       !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (error)
		return error;
	if (!size)
		return 0;

	/*
	 * Capture amount written on completion as we can't reliably account
	 * for it on submission.
	 */
	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim.  To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	if (flags & IOMAP_DIO_COW) {
		if (iocb->ki_flags & IOCB_ATOMIC)
			error = xfs_reflink_end_atomic_cow(ip, offset, size);
		else
			error = xfs_reflink_end_cow(ip, offset, size);
		if (error)
			goto out;
	}

	/*
	 * Unwritten conversion updates the in-core isize after extent
	 * conversion but before updating the on-disk size. Updating isize any
	 * earlier allows a racing dio read to find unwritten extents before
	 * they are converted.
	 */
	if (flags & IOMAP_DIO_UNWRITTEN) {
		error = xfs_iomap_write_unwritten(ip, offset, size, true);
		goto out;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size. We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 *
	 * As IO completion only ever extends EOF, we can do an unlocked check
	 * here to avoid taking the spinlock. If we land within the current EOF,
	 * then we do not need to do an extending update at all, and we don't
	 * need to take the lock to check this. If we race with an update moving
	 * EOF, then we'll either still be beyond EOF and need to take the lock,
	 * or we'll be within EOF and we don't need to take it at all.
	 */
	if (offset + size <= i_size_read(inode))
		goto out;

	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		spin_unlock(&ip->i_flags_lock);
		error = xfs_setfilesize(ip, offset, size);
	} else {
		spin_unlock(&ip->i_flags_lock);
	}

out:
	memalloc_nofs_restore(nofs_flag);
	return error;
}

static const struct iomap_dio_ops xfs_dio_write_ops = {
	.end_io		= xfs_dio_write_end_io,
};

static void
xfs_dio_zoned_submit_io(
	const struct iomap_iter	*iter,
	struct bio		*bio,
	loff_t			file_offset)
{
	struct xfs_mount	*mp = XFS_I(iter->inode)->i_mount;
	struct xfs_zone_alloc_ctx *ac = iter->private;
	xfs_filblks_t		count_fsb;
	struct iomap_ioend	*ioend;

	count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
	if (count_fsb > ac->reserved_blocks) {
		xfs_err(mp,
			"allocation (%lld) larger than reservation (%lld).",
			count_fsb, ac->reserved_blocks);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		bio_io_error(bio);
		return;
	}
	ac->reserved_blocks -= count_fsb;

	bio->bi_end_io = xfs_end_bio;
	ioend = iomap_init_ioend(iter->inode, bio, file_offset,
			IOMAP_IOEND_DIRECT);
	xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
}

static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
	.bio_set	= &iomap_ioend_bioset,
	.submit_io	= xfs_dio_zoned_submit_io,
	.end_io		= xfs_dio_write_end_io,
};

/*
 * Handle block aligned direct I/O writes.
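 *
 * "Aligned" here means the caller (xfs_file_dio_write()) has already checked
 * that both the start offset and the length are multiples of the filesystem
 * block size, so no sub-block zeroing is required and the write can normally
 * proceed under the shared iolock.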
 */
static noinline ssize_t
xfs_file_dio_write_aligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from,
	const struct iomap_ops	*ops,
	const struct iomap_dio_ops *dops,
	struct xfs_zone_alloc_ctx *ac)
{
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	ssize_t			ret;

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock, ac);
	if (ret)
		goto out_unlock;

	/*
	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
	 * the iolock back to shared if we had to take the exclusive lock in
	 * xfs_file_write_checks() for other reasons.
	 */
	if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}
	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
out_unlock:
	xfs_iunlock(ip, iolock);
	return ret;
}

/*
 * Handle block aligned direct I/O writes to zoned devices.
 */
static noinline ssize_t
xfs_file_dio_write_zoned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_zone_alloc_ctx ac = { };
	ssize_t			ret;

	ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac);
	if (ret < 0)
		return ret;
	ret = xfs_file_dio_write_aligned(ip, iocb, from,
			&xfs_zoned_direct_write_iomap_ops,
			&xfs_dio_zoned_write_ops, &ac);
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	return ret;
}

/*
 * Handle block atomic writes
 *
 * Two methods of atomic writes are supported:
 * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
 *   disk
 * - COW-based, which uses a COW fork as a staging extent for data updates
 *   before atomically updating extent mappings for the range being written
 *
 */
static noinline ssize_t
xfs_file_dio_write_atomic(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	ssize_t			ret, ocount = iov_iter_count(from);
	const struct iomap_ops	*dops;

	/*
	 * HW offload should be faster, so try that first if it is already
	 * known that the write length is not too large.
	 */
	if (ocount > xfs_inode_buftarg(ip)->bt_awu_max)
		dops = &xfs_atomic_write_cow_iomap_ops;
	else
		dops = &xfs_direct_write_iomap_ops;

retry:
	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out_unlock;

	/* Demote similar to xfs_file_dio_write_aligned() */
	if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
			0, NULL, 0);

	/*
	 * The retry mechanism is based on the ->iomap_begin method returning
	 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
	 * possible.  The REQ_ATOMIC-based method would typically not be
	 * possible if the write spans multiple extents or the disk blocks are
	 * misaligned.
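	 * When that happens the write falls back to the COW-based method
	 * described above, which stages the new data in the COW fork and then
	 * switches the extent mappings over atomically.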
	 */
	if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
		xfs_iunlock(ip, iolock);
		dops = &xfs_atomic_write_cow_iomap_ops;
		goto retry;
	}

out_unlock:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}

/*
 * Handle block unaligned direct I/O writes
 *
 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
 * them to be done in parallel with reads and other direct I/O writes. However,
 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
 * to do sub-block zeroing and that requires serialisation against other direct
 * I/O to the same block.  In this case we need to serialise the submission of
 * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
 * In the case where sub-block zeroing is not required, we can do concurrent
 * sub-block dios to the same block successfully.
 *
 * Optimistically submit the I/O using the shared lock first, but use the
 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
 * if block allocation or partial block zeroing would be required.  In that case
 * we try again with the exclusive lock.
 */
static noinline ssize_t
xfs_file_dio_write_unaligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	size_t			isize = i_size_read(VFS_I(ip));
	size_t			count = iov_iter_count(from);
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
	ssize_t			ret;

	/*
	 * Extending writes need exclusivity because of the sub-block zeroing
	 * that the DIO code always does for partial tail blocks beyond EOF, so
	 * don't even bother trying the fast path in this case.
	 */
	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
retry_exclusive:
		iolock = XFS_IOLOCK_EXCL;
		flags = IOMAP_DIO_FORCE_WAIT;
	}

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;

	/*
	 * We can't properly handle unaligned direct I/O to reflink files yet,
	 * as we can't unshare a partial block.
	 */
	if (xfs_is_cow_inode(ip)) {
		trace_xfs_reflink_bounce_dio_write(iocb, from);
		ret = -ENOTBLK;
		goto out_unlock;
	}

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out_unlock;

	/*
	 * If we are doing exclusive unaligned I/O, this must be the only I/O
	 * in-flight.  Otherwise we risk data corruption due to unwritten extent
	 * conversions from the AIO end_io handler.  Wait for all other I/O to
	 * drain first.
	 */
	if (flags & IOMAP_DIO_FORCE_WAIT)
		inode_dio_wait(VFS_I(ip));

	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			&xfs_dio_write_ops, flags, NULL, 0);

	/*
	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
	 * layer rejected it for mapping or locking reasons.  If we are doing
	 * nonblocking user I/O, propagate the error.
	 */
	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
		xfs_iunlock(ip, iolock);
		goto retry_exclusive;
	}

out_unlock:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}

static ssize_t
xfs_file_dio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
	size_t			count = iov_iter_count(from);

	/* direct I/O must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
		return -EINVAL;

	/*
	 * For always COW inodes we also must check the alignment of each
	 * individual iovec segment, as they could end up with different
	 * I/Os due to the way bio_iov_iter_get_pages works, and we'd
	 * then overwrite an already written block.
	 */
	if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
	    (xfs_is_always_cow_inode(ip) &&
	     (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
		return xfs_file_dio_write_unaligned(ip, iocb, from);
	if (xfs_is_zoned_inode(ip))
		return xfs_file_dio_write_zoned(ip, iocb, from);
	if (iocb->ki_flags & IOCB_ATOMIC)
		return xfs_file_dio_write_atomic(ip, iocb, from);
	return xfs_file_dio_write_aligned(ip, iocb, from,
			&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}

static noinline ssize_t
xfs_file_dax_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	ssize_t			ret, error = 0;
	loff_t			pos;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

	pos = iocb->ki_pos;

	trace_xfs_file_dax_write(iocb, from);
	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
	}
out:
	if (iolock)
		xfs_iunlock(ip, iolock);
	if (error)
		return error;

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_buffered_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	bool			cleared_space = false;
	unsigned int		iolock;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
			NULL);

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error.  In the case of ENOSPC, first try to
	 * write back all dirty inodes to free up some of the excess reserved
	 * metadata space.  This reduces the chances that the eofblocks scan
	 * waits on dirty mappings.
	 * Since xfs_flush_inodes() is serialized, this also behaves as a filter
	 * to prevent too many eofblocks scans from running at the same time.
	 * Use a synchronous scan to increase the effectiveness of the scan.
	 */
	if (ret == -EDQUOT && !cleared_space) {
		xfs_iunlock(ip, iolock);
		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
		cleared_space = true;
		goto write_retry;
	} else if (ret == -ENOSPC && !cleared_space) {
		struct xfs_icwalk	icw = {0};

		cleared_space = true;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);
		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
		xfs_blockgc_free_space(ip->i_mount, &icw);
		goto write_retry;
	}

out:
	if (iolock)
		xfs_iunlock(ip, iolock);

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_buffered_write_zoned(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	bool			cleared_space = false;
	struct xfs_zone_alloc_ctx ac = { };
	ssize_t			ret;

	ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
	if (ret < 0)
		return ret;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		goto out_unreserve;

	ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
	if (ret)
		goto out_unlock;

	/*
	 * Truncate the iter to the length that we were actually able to
	 * allocate blocks for.  This needs to happen after
	 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
	 * writes.
	 */
	iov_iter_truncate(from,
			XFS_FSB_TO_B(mp, ac.reserved_blocks) -
			(iocb->ki_pos & mp->m_blockmask));
	if (!iov_iter_count(from))
		goto out_unlock;

retry:
	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
			&ac);
	if (ret == -ENOSPC && !cleared_space) {
		/*
		 * Kick off writeback to convert delalloc space and release the
		 * usually too pessimistic indirect block reservations.
		 */
		xfs_flush_inodes(mp);
		cleared_space = true;
		goto retry;
	}

out_unlock:
	xfs_iunlock(ip, iolock);
out_unreserve:
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	if (ret > 0) {
		XFS_STATS_ADD(mp, xs_write_bytes, ret);
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (iocb->ki_flags & IOCB_ATOMIC) {
		if (ocount < xfs_get_atomic_write_min(ip))
			return -EINVAL;

		if (ocount > xfs_get_atomic_write_max(ip))
			return -EINVAL;

		ret = generic_atomic_write_valid(iocb, from);
		if (ret)
			return ret;
	}

	if (IS_DAX(inode))
		return xfs_file_dax_write(iocb, from);

	if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW.  In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_write(iocb, from);
		if (ret != -ENOTBLK)
			return ret;
	}

	if (xfs_is_zoned_inode(ip))
		return xfs_file_buffered_write_zoned(iocb, from);
	return xfs_file_buffered_write(iocb, from);
}

/* Does this file, inode, or mount want synchronous writes? */
static inline bool xfs_file_sync_writes(struct file *filp)
{
	struct xfs_inode	*ip = XFS_I(file_inode(filp));

	if (xfs_has_wsync(ip->i_mount))
		return true;
	if (filp->f_flags & (__O_SYNC | O_DSYNC))
		return true;
	if (IS_SYNC(file_inode(filp)))
		return true;

	return false;
}

static int
xfs_falloc_newsize(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	loff_t			*new_size)
{
	struct inode		*inode = file_inode(file);

	if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
		return 0;
	*new_size = offset + len;
	return inode_newsize_ok(inode, *new_size);
}

static int
xfs_falloc_setsize(
	struct file		*file,
	loff_t			new_size)
{
	struct iattr		iattr = {
		.ia_valid	= ATTR_SIZE,
		.ia_size	= new_size,
	};

	if (!new_size)
		return 0;
	return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
			&iattr);
}

static int
xfs_falloc_collapse_range(
	struct file		*file,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = i_size_read(inode) - len;
	int			error;

	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
		return -EINVAL;

	/*
	 * The collapsed range must not reach or extend beyond EOF, as that
	 * would effectively be a truncate operation.
	 */
	if (offset + len >= i_size_read(inode))
		return -EINVAL;

	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

static int
xfs_falloc_insert_range(
	struct file		*file,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			isize = i_size_read(inode);
	int			error;

	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
		return -EINVAL;

	/*
	 * New inode size must not exceed ->s_maxbytes, accounting for
	 * possible signed overflow.
	 */
	if (inode->i_sb->s_maxbytes - isize < len)
		return -EFBIG;

	/* Offset should be less than i_size */
	if (offset >= isize)
		return -EINVAL;

	error = xfs_falloc_setsize(file, isize + len);
	if (error)
		return error;

	/*
	 * Perform hole insertion now that the file size has been updated so
	 * that if we crash during the operation we don't leave shifted extents
	 * past EOF and hence lose access to the data that is contained within
	 * them.
	 */
	return xfs_insert_file_space(XFS_I(inode), offset, len);
}

/*
 * Punch a hole and prealloc the range.  We use a hole punch rather than
 * unwritten extent conversion for two reasons:
 *
 * 1.) Hole punch handles partial block zeroing for us.
 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by
 *     virtue of the hole punch.
 */
static int
xfs_falloc_zero_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	unsigned int		blksize = i_blocksize(inode);
	loff_t			new_size = 0;
	int			error;

	trace_xfs_zero_file_space(XFS_I(inode));

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
	if (error)
		return error;

	len = round_up(offset + len, blksize) - round_down(offset, blksize);
	offset = round_down(offset, blksize);
	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

static int
xfs_falloc_unshare_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = 0;
	int			error;

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
	if (error)
		return error;

	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

static int
xfs_falloc_allocate_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = 0;
	int			error;

	/*
	 * If always_cow mode we can't use preallocations and thus should not
	 * create them.
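	 * Any preallocated (unwritten) extents would never be overwritten in
	 * place anyway, because every write to an always_cow inode allocates
	 * fresh blocks through the COW fork.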
	 */
	if (xfs_is_always_cow_inode(XFS_I(inode)))
		return -EOPNOTSUPP;

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

#define	XFS_FALLOC_FL_SUPPORTED						\
	(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |		\
	 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |		\
	 FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE |		\
	 FALLOC_FL_UNSHARE_RANGE)

STATIC long
__xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
	if (error)
		goto out_unlock;

	/*
	 * Must wait for all AIO to complete before we continue as AIO can
	 * change the file size on completion without holding any locks we
	 * currently hold.  We must do this first because AIO can update both
	 * the on disk and in memory inode sizes, and the operations that follow
	 * require the in-memory size to be fully up-to-date.
	 */
	inode_dio_wait(inode);

	error = file_modified(file);
	if (error)
		goto out_unlock;

	switch (mode & FALLOC_FL_MODE_MASK) {
	case FALLOC_FL_PUNCH_HOLE:
		error = xfs_free_file_space(ip, offset, len, ac);
		break;
	case FALLOC_FL_COLLAPSE_RANGE:
		error = xfs_falloc_collapse_range(file, offset, len, ac);
		break;
	case FALLOC_FL_INSERT_RANGE:
		error = xfs_falloc_insert_range(file, offset, len);
		break;
	case FALLOC_FL_ZERO_RANGE:
		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
		break;
	case FALLOC_FL_UNSHARE_RANGE:
		error = xfs_falloc_unshare_range(file, mode, offset, len);
		break;
	case FALLOC_FL_ALLOCATE_RANGE:
		error = xfs_falloc_allocate_range(file, mode, offset, len);
		break;
	default:
		error = -EOPNOTSUPP;
		break;
	}

	if (!error && xfs_file_sync_writes(file))
		error = xfs_log_force_inode(ip);

out_unlock:
	xfs_iunlock(ip, iolock);
	return error;
}

static long
xfs_file_zoned_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct xfs_zone_alloc_ctx ac = { };
	struct xfs_inode	*ip = XFS_I(file_inode(file));
	int			error;

	error = xfs_zoned_space_reserve(ip->i_mount, 2, XFS_ZR_RESERVED, &ac);
	if (error)
		return error;
	error = __xfs_file_fallocate(file, mode, offset, len, &ac);
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	return error;
}

static long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	/*
	 * For zoned file systems, zeroing the first and last block of a hole
	 * punch requires allocating a new block to rewrite the remaining data
	 * and new zeroes out of place.  Get a reservation for those before
	 * taking the iolock.
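	 * The two-block reservation made in xfs_file_zoned_fallocate() covers
	 * exactly that: one block for the partial block at each end of the
	 * affected range.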
	 * Dip into the reserved pool because we are expected to be able to
	 * punch a hole even on a completely full file system.
	 */
	if (xfs_is_zoned_inode(XFS_I(inode)) &&
	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
		     FALLOC_FL_COLLAPSE_RANGE)))
		return xfs_file_zoned_fallocate(file, mode, offset, len);
	return __xfs_file_fallocate(file, mode, offset, len, NULL);
}

STATIC int
xfs_file_fadvise(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			advice)
{
	struct xfs_inode	*ip = XFS_I(file_inode(file));
	int			ret;
	int			lockflags = 0;

	/*
	 * Operations creating pages in page cache need protection from hole
	 * punching and similar ops
	 */
	if (advice == POSIX_FADV_WILLNEED) {
		lockflags = XFS_IOLOCK_SHARED;
		xfs_ilock(ip, lockflags);
	}
	ret = generic_fadvise(file, start, end, advice);
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	return ret;
}

STATIC loff_t
xfs_file_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	loff_t			remapped = 0;
	xfs_extlen_t		cowextsize;
	int			ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (!xfs_has_reflink(mp))
		return -EOPNOTSUPP;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* Prepare and then clone file data. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, remap_flags);
	if (ret || len == 0)
		return ret;

	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
			&remapped);
	if (ret)
		goto out_unlock;

	/*
	 * Carry the cowextsize hint from src to dest if we're sharing the
	 * entire source file to the entire destination file, the source file
	 * has a cowextsize hint, and the destination file does not.
	 */
	cowextsize = 0;
	if (pos_in == 0 && len == i_size_read(inode_in) &&
	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
	    pos_out == 0 && len >= i_size_read(inode_out) &&
	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
		cowextsize = src->i_cowextsize;

	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
			remap_flags);
	if (ret)
		goto out_unlock;

	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
		xfs_log_force_inode(dest);
out_unlock:
	xfs_iunlock2_remapping(src, dest);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	/*
	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
	 * handle partial results -- either the whole remap succeeds, or we
	 * must say why it did not.  In this case, any error should be returned
	 * to the caller.
	 */
	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
		return ret;
	return remapped > 0 ? remapped : ret;
}

STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
		return -EIO;
	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
	if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
	return generic_file_open(inode, file);
}

STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	unsigned int	mode;
	int		error;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;
	error = generic_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_df.if_nextents > 0)
		error = xfs_dir3_data_readahead(ip, 0, 0);
	xfs_iunlock(ip, mode);
	return error;
}

/*
 * Don't bother propagating errors.  We're just doing cleanup, and the caller
 * ignores the return value anyway.
 */
STATIC int
xfs_file_release(
	struct inode		*inode,
	struct file		*file)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	/*
	 * If this is a read-only mount or the file system has been shut down,
	 * don't generate I/O.
	 */
	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
		return 0;

	/*
	 * If we previously truncated this file and removed old data in the
	 * process, we want to initiate "early" writeout on the last close.
	 * This is an attempt to combat the notorious NULL files problem which
	 * is particularly noticeable from a truncate down, buffered (re-)write
	 * (delalloc), followed by a crash.  What we are effectively doing here
	 * is significantly reducing the time window where we'd otherwise be
	 * exposed to that problem.
	 */
	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
		if (ip->i_delayed_blks > 0)
			filemap_flush(inode->i_mapping);
	}

	/*
	 * XFS aggressively preallocates post-EOF space to generate contiguous
	 * allocations for writers that append to the end of the file.
	 *
	 * To support workloads that close and reopen the file frequently, these
	 * preallocations usually persist after a close unless it is the first
	 * close for the inode.  This is a tradeoff to generate tightly packed
	 * data layouts for unpacking tarballs or similar archives that write
	 * one file after another without going back to it while keeping the
	 * preallocation for files that have recurring open/write/close cycles.
	 *
	 * This heuristic is skipped for inodes with the append-only flag as
	 * that flag is rather pointless for inodes written only once.
	 *
	 * There is no point in freeing blocks here for open but unlinked files
	 * as they will be taken care of by the inactivation path soon.
	 *
	 * When releasing a read-only context, don't flush data or trim post-EOF
	 * blocks.  This prevents open/read/close workloads from removing EOF
	 * blocks that other writers depend upon to reduce fragmentation.
	 *
	 * Inodes on the zoned RT device never have preallocations, so skip
	 * taking the locks below.
	 */
	if (!inode->i_nlink ||
	    !(file->f_mode & FMODE_WRITE) ||
	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
	    xfs_is_zoned_inode(ip))
		return 0;

	/*
	 * If we can't get the iolock just skip truncating the blocks past EOF
	 * because we could deadlock with the mmap_lock otherwise.  We'll get
	 * another chance to drop them once the last reference to the inode is
	 * dropped, so we'll never leak blocks permanently.
	 */
	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (xfs_can_free_eofblocks(ip) &&
		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
			xfs_free_eofblocks(ip);
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	}

	return 0;
}

STATIC int
xfs_file_readdir(
	struct file	*file,
	struct dir_context *ctx)
{
	struct inode	*inode = file_inode(file);
	xfs_inode_t	*ip = XFS_I(inode);
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass the total size of the buffer we read
	 * into down to the filesystem.  With the filldir concept it's not
	 * needed for correct information, but the XFS dir2 leaf code wants
	 * an estimate of the buffer size to calculate its readahead window
	 * and size the buffers used for mapping to physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size.  For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);

	return xfs_readdir(NULL, ip, ctx, bufsize);
}

STATIC loff_t
xfs_file_llseek(
	struct file	*file,
	loff_t		offset,
	int		whence)
{
	struct inode		*inode = file->f_mapping->host;

	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
		return -EIO;

	switch (whence) {
	default:
		return generic_file_llseek(file, offset, whence);
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
		break;
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

static inline vm_fault_t
xfs_dax_fault_locked(
	struct vm_fault		*vmf,
	unsigned int		order,
	bool			write_fault)
{
	vm_fault_t		ret;
	unsigned long		pfn;

	if (!IS_ENABLED(CONFIG_FS_DAX)) {
		ASSERT(0);
		return VM_FAULT_SIGBUS;
	}
	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
			(write_fault && !vmf->cow_page) ?
				&xfs_dax_write_iomap_ops :
				&xfs_read_iomap_ops);
	if (ret & VM_FAULT_NEEDDSYNC)
		ret = dax_finish_sync_fault(vmf, order, pfn);
	return ret;
}

static vm_fault_t
xfs_dax_read_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	vm_fault_t		ret;

	trace_xfs_read_fault(ip, order);

	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	ret = xfs_dax_fault_locked(vmf, order, false);
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);

	return ret;
}

/*
 * Locking for serialisation of IO during page faults.
 * This results in a lock ordering of:
 *
 * mmap_lock (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
static vm_fault_t
__xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
	vm_fault_t		ret;

	trace_xfs_write_fault(ip, order);

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/*
	 * Normally we only need the shared mmaplock, but if a reflink remap is
	 * in progress we take the exclusive lock to wait for the remap to
	 * finish before taking a write fault.
	 */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
		lock_mode = XFS_MMAPLOCK_EXCL;
	}

	if (IS_DAX(inode))
		ret = xfs_dax_fault_locked(vmf, order, true);
	else
		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
				ac);
	xfs_iunlock(ip, lock_mode);

	sb_end_pagefault(inode->i_sb);
	return ret;
}

static vm_fault_t
xfs_write_fault_zoned(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	unsigned int		len = folio_size(page_folio(vmf->page));
	struct xfs_zone_alloc_ctx ac = { };
	int			error;
	vm_fault_t		ret;

	/*
	 * This could over-allocate as it doesn't check for truncation.
	 *
	 * But as the overallocation is limited to less than a folio and will
	 * be released instantly, that's just fine.
	 */
	error = xfs_zoned_space_reserve(ip->i_mount,
			XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
	if (error < 0)
		return vmf_fs_error(error);
	ret = __xfs_write_fault(vmf, order, &ac);
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	return ret;
}

static vm_fault_t
xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
		return xfs_write_fault_zoned(vmf, order);
	return __xfs_write_fault(vmf, order, NULL);
}

static inline bool
xfs_is_write_fault(
	struct vm_fault		*vmf)
{
	return (vmf->flags & FAULT_FLAG_WRITE) &&
	       (vmf->vma->vm_flags & VM_SHARED);
}

static vm_fault_t
xfs_filemap_fault(
	struct vm_fault		*vmf)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);

	/* DAX can shortcut the normal fault path on write faults! */
	if (IS_DAX(inode)) {
		if (xfs_is_write_fault(vmf))
			return xfs_write_fault(vmf, 0);
		return xfs_dax_read_fault(vmf, 0);
	}

	trace_xfs_read_fault(XFS_I(inode), 0);
	return filemap_fault(vmf);
}

static vm_fault_t
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
		return VM_FAULT_FALLBACK;

	/* DAX can shortcut the normal fault path on write faults! */
	if (xfs_is_write_fault(vmf))
		return xfs_write_fault(vmf, order);
	return xfs_dax_read_fault(vmf, order);
}

static vm_fault_t
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}

/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults.  In reality, it needs to serialise against truncate and
 * prepare memory for writing, so handle it as a standard write fault.
 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.huge_fault	= xfs_filemap_huge_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};

STATIC int
xfs_file_mmap_prepare(
	struct vm_area_desc	*desc)
{
	struct file		*file = desc->file;
	struct inode		*inode = file_inode(file);
	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));

	/*
	 * We don't support synchronous mappings for non-DAX files and
	 * for DAX files if the underlying dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
			target->bt_daxdev))
		return -EOPNOTSUPP;

	file_accessed(file);
	desc->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(inode))
		desc->vm_flags |= VM_HUGEPAGE;
	return 0;
}

const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= xfs_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap_prepare	= xfs_file_mmap_prepare,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.fadvise	= xfs_file_fadvise,
	.remap_file_range = xfs_file_remap_range,
	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
			  FOP_DONTCACHE,
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
};