// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
#include "xfs_aops.h"
#include "xfs_zone_alloc.h"

#include <linux/dax.h>
#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>
#include <linux/mount.h>

static const struct vm_operations_struct xfs_file_vm_ops;

/*
 * Decide if the given file range is aligned to the size of the fundamental
 * allocation unit for the file.
 */
bool
xfs_is_falloc_aligned(
	struct xfs_inode	*ip,
	loff_t			pos,
	long long int		len)
{
	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);

	if (!is_power_of_2(alloc_unit))
		return isaligned_64(pos, alloc_unit) &&
		       isaligned_64(len, alloc_unit);

	return !((pos | len) & (alloc_unit - 1));
}
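
/*
 * For example, with a 4096-byte allocation unit (a power of two), pos = 8192
 * and len = 4096 pass the mask check because (8192 | 4096) & 4095 == 0, while
 * len = 2048 fails it.  Non-power-of-two allocation units (e.g. some realtime
 * extent sizes) fall back to the explicit 64-bit alignment helpers above.
 */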

/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush, and thus also no need for explicit
 * cache flush operations, and there are no non-transaction metadata updates
 * on directories either.
 */
STATIC int
xfs_dir_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);

	trace_xfs_dir_fsync(ip);
	return xfs_log_force_inode(ip);
}

static xfs_csn_t
xfs_fsync_seq(
	struct xfs_inode	*ip,
	bool			datasync)
{
	if (!xfs_ipincount(ip))
		return 0;
	if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
		return 0;
	return ip->i_itemp->ili_commit_seq;
}

/*
 * All metadata updates are logged, which means that we just have to flush the
 * log up to the latest LSN that touched the inode.
 *
 * If we have concurrent fsync/fdatasync() calls, we need them to all block on
 * the log force before we clear the ili_fsync_fields field.  This ensures that
 * we don't get a racing sync operation that does not wait for the metadata to
 * hit the journal before returning.  If we race with clearing ili_fsync_fields,
 * then all that will happen is the log force will do nothing as the lsn will
 * already be on disk.  We can't race with setting ili_fsync_fields because that
 * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
 * shared until after the ili_fsync_fields is cleared.
 */
static int
xfs_fsync_flush_log(
	struct xfs_inode	*ip,
	bool			datasync,
	int			*log_flushed)
{
	int			error = 0;
	xfs_csn_t		seq;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	seq = xfs_fsync_seq(ip, datasync);
	if (seq) {
		error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
					  log_flushed);

		spin_lock(&ip->i_itemp->ili_lock);
		ip->i_itemp->ili_fsync_fields = 0;
		spin_unlock(&ip->i_itemp->ili_lock);
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return error;
}

STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	int			error, err2;
	int			log_flushed = 0;

	trace_xfs_file_fsync(ip);

	error = file_write_and_wait_range(file, start, end);
	if (error)
		return error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache to the device used for file data first.  This is to
	 * ensure newly written file data make it to disk before logging the
	 * new inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);

	/*
	 * Any inode that has dirty modifications in the log is pinned.  The
	 * racy check here for a pinned inode will not catch modifications
	 * that happen concurrently to the fsync call, but fsync semantics
	 * only require to sync previously completed I/O.
	 */
	if (xfs_ipincount(ip)) {
		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
		if (err2 && !error)
			error = err2;
	}

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp) {
		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
		if (err2 && !error)
			error = err2;
	}

	return error;
}

static int
xfs_ilock_iocb(
	struct kiocb		*iocb,
	unsigned int		lock_mode)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, lock_mode))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, lock_mode);
	}

	return 0;
}
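
/*
 * xfs_ilock_iocb() above is the common helper that maps nonblocking callers
 * (IOCB_NOWAIT, i.e. RWF_NOWAIT and similar) onto a trylock of the requested
 * inode lock mode and returns -EAGAIN instead of sleeping when the lock is
 * contended.  Both the read and the write paths below rely on it so that
 * nonblocking I/O never waits for the iolock.
 */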

static int
xfs_ilock_iocb_for_write(
	struct kiocb		*iocb,
	unsigned int		*lock_mode)
{
	ssize_t			ret;
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	ret = xfs_ilock_iocb(iocb, *lock_mode);
	if (ret)
		return ret;

	/*
	 * If a reflink remap is in progress we always need to take the iolock
	 * exclusively to wait for it to finish.
	 */
	if (*lock_mode == XFS_IOLOCK_SHARED &&
	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, *lock_mode);
		*lock_mode = XFS_IOLOCK_EXCL;
		return xfs_ilock_iocb(iocb, *lock_mode);
	}

	return 0;
}

STATIC ssize_t
xfs_file_dio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_direct_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	file_accessed(iocb->ki_filp);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

static noinline ssize_t
xfs_file_dax_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	ssize_t			ret = 0;

	trace_xfs_file_dax_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	file_accessed(iocb->ki_filp);
	return ret;
}

STATIC ssize_t
xfs_file_buffered_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_buffered_read(iocb, to);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = generic_file_read_iter(iocb, to);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_read(iocb, to);
	else if (iocb->ki_flags & IOCB_DIRECT)
		ret = xfs_file_dio_read(iocb, to);
	else
		ret = xfs_file_buffered_read(iocb, to);

	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

STATIC ssize_t
xfs_file_splice_read(
	struct file		*in,
	loff_t			*ppos,
	struct pipe_inode_info	*pipe,
	size_t			len,
	unsigned int		flags)
{
	struct inode		*inode = file_inode(in);
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	trace_xfs_file_splice_read(ip, *ppos, len);

	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	ret = filemap_splice_read(in, ppos, pipe, len, flags);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}
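
/*
 * All of the read paths above (DAX, direct, buffered and splice) take
 * XFS_IOLOCK_SHARED around the actual I/O, so they can run concurrently with
 * each other while still serialising against anything that takes the iolock
 * exclusively, such as truncate, fallocate and reflink remapping.
 */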

/*
 * Take care of zeroing post-EOF blocks when they might exist.
 *
 * Returns 0 if successful, a negative error for a failure, or 1 if this
 * function dropped the iolock and reacquired it exclusively and the caller
 * needs to restart the write sanity checks.
 */
static ssize_t
xfs_file_write_zero_eof(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock,
	size_t			count,
	bool			*drained_dio,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	loff_t			isize;
	int			error;

	/*
	 * We need to serialise against EOF updates that occur in IO completions
	 * here.  We want to make sure that nobody is changing the size while
	 * we do this check until we have placed an IO barrier (i.e. hold
	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
	 * spinlock effectively forms a memory barrier once we have
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
	 * hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	isize = i_size_read(VFS_I(ip));
	if (iocb->ki_pos <= isize) {
		spin_unlock(&ip->i_flags_lock);
		return 0;
	}
	spin_unlock(&ip->i_flags_lock);

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EAGAIN;

	if (!*drained_dio) {
		/*
		 * If zeroing is needed and we are currently holding the iolock
		 * shared, we need to update it to exclusive which implies
		 * having to redo all checks before.
		 */
		if (*iolock == XFS_IOLOCK_SHARED) {
			xfs_iunlock(ip, *iolock);
			*iolock = XFS_IOLOCK_EXCL;
			xfs_ilock(ip, *iolock);
			iov_iter_reexpand(from, count);
		}

		/*
		 * We now have an IO submission barrier in place, but AIO can do
		 * EOF updates during IO completion and hence we now need to
		 * wait for all of them to drain.  Non-AIO DIO will have drained
		 * before we are given the XFS_IOLOCK_EXCL, and so for most
		 * cases this wait is a no-op.
		 */
		inode_dio_wait(VFS_I(ip));
		*drained_dio = true;
		return 1;
	}

	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);

	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);

	return error;
}

/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;
	ssize_t			error;

restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		error = break_layout(inode, false);
		if (error == -EWOULDBLOCK)
			error = -EAGAIN;
	} else {
		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
	}

	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(XFS_I(inode), *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		error = xfs_ilock_iocb(iocb, *iolock);
		if (error) {
			*iolock = 0;
			return error;
		}
		goto restart;
	}

	/*
	 * If the offset is beyond the size of the file, we need to zero all
	 * blocks that fall between the existing EOF and the start of this
	 * write.
	 *
	 * We can do an unlocked check for i_size here safely as I/O completion
	 * can only extend EOF.  Truncate is locked out at this point, so the
	 * EOF can not move backwards, only forwards.  Hence we only need to
	 * take the slow path when we are at or beyond the current EOF.
	 */
	if (iocb->ki_pos > i_size_read(inode)) {
		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
				&drained_dio, ac);
		if (error == 1)
			goto restart;
		if (error)
			return error;
	}

	return kiocb_modified(iocb);
}
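
/*
 * Note that xfs_file_write_checks() can change *iolock under the caller: the
 * iolock may be upgraded from shared to exclusive, and on failure it may be
 * cleared to 0 after the lock has already been dropped.  Several of the write
 * paths below therefore check the iolock value before unlocking.
 */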

static ssize_t
xfs_zoned_write_space_reserve(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		flags,
	struct xfs_zone_alloc_ctx *ac)
{
	loff_t			count = iov_iter_count(from);
	int			error;

	if (iocb->ki_flags & IOCB_NOWAIT)
		flags |= XFS_ZR_NOWAIT;

	/*
	 * Check the rlimit and LFS boundary first so that we don't over-reserve
	 * by possibly a lot.
	 *
	 * The generic write path will redo this check later, and it might have
	 * changed by then.  If it got expanded we'll stick to our earlier
	 * smaller limit, and if it is decreased the new smaller limit will be
	 * used and our extra space reservation will be returned after finishing
	 * the write.
	 */
	error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
	if (error)
		return error;

	/*
	 * Sloppily round up count to file system blocks.
	 *
	 * This will often reserve an extra block, but that avoids having to look
	 * at the start offset, which isn't stable for O_APPEND until taking the
	 * iolock.  Also we need to reserve a block each for zeroing the old
	 * EOF block and the new start block if they are unaligned.
	 *
	 * Any remaining block will be returned after the write.
	 */
	return xfs_zoned_space_reserve(ip,
			XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
}
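
/*
 * For example, a 17 KiB write on a file system with 4 KiB blocks rounds up to
 * 5 blocks and reserves 5 + 1 + 2 = 8 blocks: one extra for the sloppy
 * rounding and two for possible out-of-place zeroing of the old EOF block and
 * an unaligned new start block.
 */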

static int
xfs_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,
	int			error,
	unsigned		flags)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			offset = iocb->ki_pos;
	unsigned int		nofs_flag;

	ASSERT(!xfs_is_zoned_inode(ip) ||
	       !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (error)
		return error;
	if (!size)
		return 0;

	/*
	 * Capture amount written on completion as we can't reliably account
	 * for it on submission.
	 */
	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim.  To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	if (flags & IOMAP_DIO_COW) {
		if (iocb->ki_flags & IOCB_ATOMIC)
			error = xfs_reflink_end_atomic_cow(ip, offset, size);
		else
			error = xfs_reflink_end_cow(ip, offset, size);
		if (error)
			goto out;
	}

	/*
	 * Unwritten conversion updates the in-core isize after extent
	 * conversion but before updating the on-disk size.  Updating isize any
	 * earlier allows a racing dio read to find unwritten extents before
	 * they are converted.
	 */
	if (flags & IOMAP_DIO_UNWRITTEN) {
		error = xfs_iomap_write_unwritten(ip, offset, size, true);
		goto out;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size.  We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF.  Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 *
	 * As IO completion only ever extends EOF, we can do an unlocked check
	 * here to avoid taking the spinlock.  If we land within the current EOF,
	 * then we do not need to do an extending update at all, and we don't
	 * need to take the lock to check this.  If we race with an update moving
	 * EOF, then we'll either still be beyond EOF and need to take the lock,
	 * or we'll be within EOF and we don't need to take it at all.
	 */
	if (offset + size <= i_size_read(inode))
		goto out;

	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		spin_unlock(&ip->i_flags_lock);
		error = xfs_setfilesize(ip, offset, size);
	} else {
		spin_unlock(&ip->i_flags_lock);
	}

out:
	memalloc_nofs_restore(nofs_flag);
	return error;
}

static const struct iomap_dio_ops xfs_dio_write_ops = {
	.end_io		= xfs_dio_write_end_io,
};

static void
xfs_dio_zoned_submit_io(
	const struct iomap_iter	*iter,
	struct bio		*bio,
	loff_t			file_offset)
{
	struct xfs_mount	*mp = XFS_I(iter->inode)->i_mount;
	struct xfs_zone_alloc_ctx *ac = iter->private;
	xfs_filblks_t		count_fsb;
	struct iomap_ioend	*ioend;

	count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
	if (count_fsb > ac->reserved_blocks) {
		xfs_err(mp,
"allocation (%lld) larger than reservation (%lld).",
			count_fsb, ac->reserved_blocks);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		bio_io_error(bio);
		return;
	}
	ac->reserved_blocks -= count_fsb;

	bio->bi_end_io = xfs_end_bio;
	ioend = iomap_init_ioend(iter->inode, bio, file_offset,
			IOMAP_IOEND_DIRECT);
	xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
}

static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
	.bio_set	= &iomap_ioend_bioset,
	.submit_io	= xfs_dio_zoned_submit_io,
	.end_io		= xfs_dio_write_end_io,
};
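
/*
 * The zoned submit_io hook above charges every bio against the block
 * reservation taken in xfs_zoned_write_space_reserve() before handing the
 * ioend to the zone allocator; consuming more blocks than were reserved
 * indicates in-core accounting corruption and shuts the file system down.
 */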

/*
 * Handle block aligned direct I/O writes.
 */
static noinline ssize_t
xfs_file_dio_write_aligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from,
	const struct iomap_ops	*ops,
	const struct iomap_dio_ops *dops,
	struct xfs_zone_alloc_ctx *ac)
{
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	ssize_t			ret;

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock, ac);
	if (ret)
		goto out_unlock;

	/*
	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
	 * the iolock back to shared if we had to take the exclusive lock in
	 * xfs_file_write_checks() for other reasons.
	 */
	if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}
	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
out_unlock:
	xfs_iunlock(ip, iolock);
	return ret;
}

/*
 * Handle block aligned direct I/O writes to zoned devices.
 */
static noinline ssize_t
xfs_file_dio_write_zoned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_zone_alloc_ctx ac = { };
	ssize_t			ret;

	ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
	if (ret < 0)
		return ret;
	ret = xfs_file_dio_write_aligned(ip, iocb, from,
			&xfs_zoned_direct_write_iomap_ops,
			&xfs_dio_zoned_write_ops, &ac);
	xfs_zoned_space_unreserve(ip, &ac);
	return ret;
}

/*
 * Handle block atomic writes
 *
 * Two methods of atomic writes are supported:
 * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
 *   disk
 * - COW-based, which uses a COW fork as a staging extent for data updates
 *   before atomically updating extent mappings for the range being written
 */
static noinline ssize_t
xfs_file_dio_write_atomic(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	ssize_t			ret, ocount = iov_iter_count(from);
	const struct iomap_ops	*dops;

	/*
	 * HW offload should be faster, so try that first if it is already
	 * known that the write length is not too large.
	 */
	if (ocount > xfs_inode_buftarg(ip)->bt_bdev_awu_max)
		dops = &xfs_atomic_write_cow_iomap_ops;
	else
		dops = &xfs_direct_write_iomap_ops;

retry:
	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out_unlock;

	/* Demote similar to xfs_file_dio_write_aligned() */
	if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
			0, NULL, 0);

	/*
	 * The retry mechanism is based on the ->iomap_begin method returning
	 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
	 * possible.  The REQ_ATOMIC-based method will typically not be possible
	 * if the write spans multiple extents or the disk blocks are
	 * misaligned.
	 */
	if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
		xfs_iunlock(ip, iolock);
		dops = &xfs_atomic_write_cow_iomap_ops;
		goto retry;
	}

out_unlock:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}

/*
 * Handle block unaligned direct I/O writes
 *
 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
 * them to be done in parallel with reads and other direct I/O writes.  However,
 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
 * to do sub-block zeroing and that requires serialisation against other direct
 * I/O to the same block.  In this case we need to serialise the submission of
 * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
 * In the case where sub-block zeroing is not required, we can do concurrent
 * sub-block dios to the same block successfully.
 *
 * Optimistically submit the I/O using the shared lock first, but use the
 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
 * if block allocation or partial block zeroing would be required.  In that case
 * we try again with the exclusive lock.
 */
static noinline ssize_t
xfs_file_dio_write_unaligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	size_t			isize = i_size_read(VFS_I(ip));
	size_t			count = iov_iter_count(from);
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
	ssize_t			ret;

	/*
	 * Extending writes need exclusivity because of the sub-block zeroing
	 * that the DIO code always does for partial tail blocks beyond EOF, so
	 * don't even bother trying the fast path in this case.
	 */
	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
retry_exclusive:
		iolock = XFS_IOLOCK_EXCL;
		flags = IOMAP_DIO_FORCE_WAIT;
	}

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;

	/*
	 * We can't properly handle unaligned direct I/O to reflink files yet,
	 * as we can't unshare a partial block.
	 */
	if (xfs_is_cow_inode(ip)) {
		trace_xfs_reflink_bounce_dio_write(iocb, from);
		ret = -ENOTBLK;
		goto out_unlock;
	}

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out_unlock;

	/*
	 * If we are doing exclusive unaligned I/O, this must be the only I/O
	 * in-flight.  Otherwise we risk data corruption due to unwritten extent
	 * conversions from the AIO end_io handler.  Wait for all other I/O to
	 * drain first.
	 */
	if (flags & IOMAP_DIO_FORCE_WAIT)
		inode_dio_wait(VFS_I(ip));

	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			&xfs_dio_write_ops, flags, NULL, 0);

	/*
	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
	 * layer rejected it for mapping or locking reasons.  If we are doing
	 * nonblocking user I/O, propagate the error.
	 */
	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
		xfs_iunlock(ip, iolock);
		goto retry_exclusive;
	}

out_unlock:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}
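
/*
 * xfs_file_dio_write() below dispatches to one of the helpers above: writes
 * that are not block aligned (or iovec-misaligned writes to always-COW
 * inodes) take the serialised unaligned path, zoned inodes take the zoned
 * path with its up-front space reservation, IOCB_ATOMIC (RWF_ATOMIC) writes
 * take the atomic path, and everything else uses the plain block aligned
 * path.
 */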

static ssize_t
xfs_file_dio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
	size_t			count = iov_iter_count(from);

	/* direct I/O must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
		return -EINVAL;

	/*
	 * For always COW inodes we also must check the alignment of each
	 * individual iovec segment, as they could end up with different
	 * I/Os due to the way bio_iov_iter_get_pages works, and we'd
	 * then overwrite an already written block.
	 */
	if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
	    (xfs_is_always_cow_inode(ip) &&
	     (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
		return xfs_file_dio_write_unaligned(ip, iocb, from);
	if (xfs_is_zoned_inode(ip))
		return xfs_file_dio_write_zoned(ip, iocb, from);
	if (iocb->ki_flags & IOCB_ATOMIC)
		return xfs_file_dio_write_atomic(ip, iocb, from);
	return xfs_file_dio_write_aligned(ip, iocb, from,
			&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}

static noinline ssize_t
xfs_file_dax_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	ssize_t			ret, error = 0;
	loff_t			pos;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

	pos = iocb->ki_pos;

	trace_xfs_file_dax_write(iocb, from);
	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
	}
out:
	if (iolock)
		xfs_iunlock(ip, iolock);
	if (error)
		return error;

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_buffered_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	bool			cleared_space = false;
	unsigned int		iolock;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops, NULL);

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error.  In the case of ENOSPC, first try
	 * to write back all dirty inodes to free up some of the excess
	 * reserved metadata space.  This reduces the chances that the
	 * eofblocks scan waits on dirty mappings.  Since xfs_flush_inodes()
	 * is serialized, this also behaves as a filter to prevent too many
	 * eofblocks scans from running at the same time.  Use a synchronous
	 * scan to increase the effectiveness of the scan.
	 */
	if (ret == -EDQUOT && !cleared_space) {
		xfs_iunlock(ip, iolock);
		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
		cleared_space = true;
		goto write_retry;
	} else if (ret == -ENOSPC && !cleared_space) {
		struct xfs_icwalk	icw = {0};

		cleared_space = true;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);
		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
		xfs_blockgc_free_space(ip->i_mount, &icw);
		goto write_retry;
	}

out:
	if (iolock)
		xfs_iunlock(ip, iolock);

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_buffered_write_zoned(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	bool			cleared_space = false;
	struct xfs_zone_alloc_ctx ac = { };
	ssize_t			ret;

	ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
	if (ret < 0)
		return ret;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		goto out_unreserve;

	ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
	if (ret)
		goto out_unlock;

	/*
	 * Truncate the iter to the length that we were actually able to
	 * allocate blocks for.  This needs to happen after
	 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
	 * writes.
	 */
	iov_iter_truncate(from,
			XFS_FSB_TO_B(mp, ac.reserved_blocks) -
			(iocb->ki_pos & mp->m_blockmask));
	if (!iov_iter_count(from))
		goto out_unlock;

retry:
	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops, &ac);
1067 */ 1068 xfs_flush_inodes(mp); 1069 cleared_space = true; 1070 goto retry; 1071 } 1072 1073 out_unlock: 1074 xfs_iunlock(ip, iolock); 1075 out_unreserve: 1076 xfs_zoned_space_unreserve(ip, &ac); 1077 if (ret > 0) { 1078 XFS_STATS_ADD(mp, xs_write_bytes, ret); 1079 ret = generic_write_sync(iocb, ret); 1080 } 1081 return ret; 1082 } 1083 1084 STATIC ssize_t 1085 xfs_file_write_iter( 1086 struct kiocb *iocb, 1087 struct iov_iter *from) 1088 { 1089 struct inode *inode = iocb->ki_filp->f_mapping->host; 1090 struct xfs_inode *ip = XFS_I(inode); 1091 ssize_t ret; 1092 size_t ocount = iov_iter_count(from); 1093 1094 XFS_STATS_INC(ip->i_mount, xs_write_calls); 1095 1096 if (ocount == 0) 1097 return 0; 1098 1099 if (xfs_is_shutdown(ip->i_mount)) 1100 return -EIO; 1101 1102 if (IS_DAX(inode)) 1103 return xfs_file_dax_write(iocb, from); 1104 1105 if (iocb->ki_flags & IOCB_ATOMIC) { 1106 if (ocount < xfs_get_atomic_write_min(ip)) 1107 return -EINVAL; 1108 1109 if (ocount > xfs_get_atomic_write_max(ip)) 1110 return -EINVAL; 1111 1112 ret = generic_atomic_write_valid(iocb, from); 1113 if (ret) 1114 return ret; 1115 } 1116 1117 if (iocb->ki_flags & IOCB_DIRECT) { 1118 /* 1119 * Allow a directio write to fall back to a buffered 1120 * write *only* in the case that we're doing a reflink 1121 * CoW. In all other directio scenarios we do not 1122 * allow an operation to fall back to buffered mode. 1123 */ 1124 ret = xfs_file_dio_write(iocb, from); 1125 if (ret != -ENOTBLK) 1126 return ret; 1127 } 1128 1129 if (xfs_is_zoned_inode(ip)) 1130 return xfs_file_buffered_write_zoned(iocb, from); 1131 return xfs_file_buffered_write(iocb, from); 1132 } 1133 1134 /* Does this file, inode, or mount want synchronous writes? */ 1135 static inline bool xfs_file_sync_writes(struct file *filp) 1136 { 1137 struct xfs_inode *ip = XFS_I(file_inode(filp)); 1138 1139 if (xfs_has_wsync(ip->i_mount)) 1140 return true; 1141 if (filp->f_flags & (__O_SYNC | O_DSYNC)) 1142 return true; 1143 if (IS_SYNC(file_inode(filp))) 1144 return true; 1145 1146 return false; 1147 } 1148 1149 static int 1150 xfs_falloc_newsize( 1151 struct file *file, 1152 int mode, 1153 loff_t offset, 1154 loff_t len, 1155 loff_t *new_size) 1156 { 1157 struct inode *inode = file_inode(file); 1158 1159 if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode)) 1160 return 0; 1161 *new_size = offset + len; 1162 return inode_newsize_ok(inode, *new_size); 1163 } 1164 1165 static int 1166 xfs_falloc_setsize( 1167 struct file *file, 1168 loff_t new_size) 1169 { 1170 struct iattr iattr = { 1171 .ia_valid = ATTR_SIZE, 1172 .ia_size = new_size, 1173 }; 1174 1175 if (!new_size) 1176 return 0; 1177 return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file), 1178 &iattr); 1179 } 1180 1181 static int 1182 xfs_falloc_collapse_range( 1183 struct file *file, 1184 loff_t offset, 1185 loff_t len, 1186 struct xfs_zone_alloc_ctx *ac) 1187 { 1188 struct inode *inode = file_inode(file); 1189 loff_t new_size = i_size_read(inode) - len; 1190 int error; 1191 1192 if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) 1193 return -EINVAL; 1194 1195 /* 1196 * There is no need to overlap collapse range with EOF, in which case it 1197 * is effectively a truncate operation 1198 */ 1199 if (offset + len >= i_size_read(inode)) 1200 return -EINVAL; 1201 1202 error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac); 1203 if (error) 1204 return error; 1205 return xfs_falloc_setsize(file, new_size); 1206 } 1207 1208 static int 1209 xfs_falloc_insert_range( 1210 struct 

static int
xfs_falloc_insert_range(
	struct file		*file,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			isize = i_size_read(inode);
	int			error;

	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
		return -EINVAL;

	/*
	 * New inode size must not exceed ->s_maxbytes, accounting for
	 * possible signed overflow.
	 */
	if (inode->i_sb->s_maxbytes - isize < len)
		return -EFBIG;

	/* Offset should be less than i_size */
	if (offset >= isize)
		return -EINVAL;

	error = xfs_falloc_setsize(file, isize + len);
	if (error)
		return error;

	/*
	 * Perform hole insertion now that the file size has been updated so
	 * that if we crash during the operation we don't leave shifted extents
	 * past EOF and hence lose access to the data that is contained within
	 * them.
	 */
	return xfs_insert_file_space(XFS_I(inode), offset, len);
}

/*
 * Punch a hole and prealloc the range.  We use a hole punch rather than
 * unwritten extent conversion for two reasons:
 *
 * 1.) Hole punch handles partial block zeroing for us.
 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by
 *     virtue of the hole punch.
 */
static int
xfs_falloc_zero_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	unsigned int		blksize = i_blocksize(inode);
	loff_t			new_size = 0;
	int			error;

	trace_xfs_zero_file_space(XFS_I(inode));

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
	if (error)
		return error;

	len = round_up(offset + len, blksize) - round_down(offset, blksize);
	offset = round_down(offset, blksize);
	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

static int
xfs_falloc_unshare_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = 0;
	int			error;

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
	if (error)
		return error;

	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

static int
xfs_falloc_allocate_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = 0;
	int			error;
1323 */ 1324 if (xfs_is_always_cow_inode(XFS_I(inode))) 1325 return -EOPNOTSUPP; 1326 1327 error = xfs_falloc_newsize(file, mode, offset, len, &new_size); 1328 if (error) 1329 return error; 1330 1331 error = xfs_alloc_file_space(XFS_I(inode), offset, len); 1332 if (error) 1333 return error; 1334 return xfs_falloc_setsize(file, new_size); 1335 } 1336 1337 #define XFS_FALLOC_FL_SUPPORTED \ 1338 (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \ 1339 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \ 1340 FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \ 1341 FALLOC_FL_UNSHARE_RANGE) 1342 1343 STATIC long 1344 __xfs_file_fallocate( 1345 struct file *file, 1346 int mode, 1347 loff_t offset, 1348 loff_t len, 1349 struct xfs_zone_alloc_ctx *ac) 1350 { 1351 struct inode *inode = file_inode(file); 1352 struct xfs_inode *ip = XFS_I(inode); 1353 long error; 1354 uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; 1355 1356 xfs_ilock(ip, iolock); 1357 error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); 1358 if (error) 1359 goto out_unlock; 1360 1361 /* 1362 * Must wait for all AIO to complete before we continue as AIO can 1363 * change the file size on completion without holding any locks we 1364 * currently hold. We must do this first because AIO can update both 1365 * the on disk and in memory inode sizes, and the operations that follow 1366 * require the in-memory size to be fully up-to-date. 1367 */ 1368 inode_dio_wait(inode); 1369 1370 error = file_modified(file); 1371 if (error) 1372 goto out_unlock; 1373 1374 switch (mode & FALLOC_FL_MODE_MASK) { 1375 case FALLOC_FL_PUNCH_HOLE: 1376 error = xfs_free_file_space(ip, offset, len, ac); 1377 break; 1378 case FALLOC_FL_COLLAPSE_RANGE: 1379 error = xfs_falloc_collapse_range(file, offset, len, ac); 1380 break; 1381 case FALLOC_FL_INSERT_RANGE: 1382 error = xfs_falloc_insert_range(file, offset, len); 1383 break; 1384 case FALLOC_FL_ZERO_RANGE: 1385 error = xfs_falloc_zero_range(file, mode, offset, len, ac); 1386 break; 1387 case FALLOC_FL_UNSHARE_RANGE: 1388 error = xfs_falloc_unshare_range(file, mode, offset, len); 1389 break; 1390 case FALLOC_FL_ALLOCATE_RANGE: 1391 error = xfs_falloc_allocate_range(file, mode, offset, len); 1392 break; 1393 default: 1394 error = -EOPNOTSUPP; 1395 break; 1396 } 1397 1398 if (!error && xfs_file_sync_writes(file)) 1399 error = xfs_log_force_inode(ip); 1400 1401 out_unlock: 1402 xfs_iunlock(ip, iolock); 1403 return error; 1404 } 1405 1406 static long 1407 xfs_file_zoned_fallocate( 1408 struct file *file, 1409 int mode, 1410 loff_t offset, 1411 loff_t len) 1412 { 1413 struct xfs_zone_alloc_ctx ac = { }; 1414 struct xfs_inode *ip = XFS_I(file_inode(file)); 1415 int error; 1416 1417 error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac); 1418 if (error) 1419 return error; 1420 error = __xfs_file_fallocate(file, mode, offset, len, &ac); 1421 xfs_zoned_space_unreserve(ip, &ac); 1422 return error; 1423 } 1424 1425 static long 1426 xfs_file_fallocate( 1427 struct file *file, 1428 int mode, 1429 loff_t offset, 1430 loff_t len) 1431 { 1432 struct inode *inode = file_inode(file); 1433 1434 if (!S_ISREG(inode->i_mode)) 1435 return -EINVAL; 1436 if (mode & ~XFS_FALLOC_FL_SUPPORTED) 1437 return -EOPNOTSUPP; 1438 1439 /* 1440 * For zoned file systems, zeroing the first and last block of a hole 1441 * punch requires allocating a new block to rewrite the remaining data 1442 * and new zeroes out of place. Get a reservations for those before 1443 * taking the iolock. 

static long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	/*
	 * For zoned file systems, zeroing the first and last block of a hole
	 * punch requires allocating a new block to rewrite the remaining data
	 * and new zeroes out of place.  Get a reservation for those before
	 * taking the iolock.  Dip into the reserved pool because we are
	 * expected to be able to punch a hole even on a completely full
	 * file system.
	 */
	if (xfs_is_zoned_inode(XFS_I(inode)) &&
	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
		     FALLOC_FL_COLLAPSE_RANGE)))
		return xfs_file_zoned_fallocate(file, mode, offset, len);
	return __xfs_file_fallocate(file, mode, offset, len, NULL);
}

STATIC int
xfs_file_fadvise(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			advice)
{
	struct xfs_inode	*ip = XFS_I(file_inode(file));
	int			ret;
	int			lockflags = 0;

	/*
	 * Operations creating pages in page cache need protection from hole
	 * punching and similar ops
	 */
	if (advice == POSIX_FADV_WILLNEED) {
		lockflags = XFS_IOLOCK_SHARED;
		xfs_ilock(ip, lockflags);
	}
	ret = generic_fadvise(file, start, end, advice);
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	return ret;
}

STATIC loff_t
xfs_file_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	loff_t			remapped = 0;
	xfs_extlen_t		cowextsize;
	int			ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (!xfs_has_reflink(mp))
		return -EOPNOTSUPP;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* Prepare and then clone file data. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, remap_flags);
	if (ret || len == 0)
		return ret;

	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
			&remapped);
	if (ret)
		goto out_unlock;

	/*
	 * Carry the cowextsize hint from src to dest if we're sharing the
	 * entire source file to the entire destination file, the source file
	 * has a cowextsize hint, and the destination file does not.
	 */
	cowextsize = 0;
	if (pos_in == 0 && len == i_size_read(inode_in) &&
	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
	    pos_out == 0 && len >= i_size_read(inode_out) &&
	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
		cowextsize = src->i_cowextsize;

	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
			remap_flags);
	if (ret)
		goto out_unlock;

	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
		xfs_log_force_inode(dest);
out_unlock:
	xfs_iunlock2_remapping(src, dest);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	/*
	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
	 * handle partial results -- either the whole remap succeeds, or we
	 * must say why it did not.  In this case, any error should be returned
	 * to the caller.
	 */
	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
		return ret;
	return remapped > 0 ? remapped : ret;
}
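
/*
 * xfs_file_open() below advertises FMODE_NOWAIT and FMODE_CAN_ODIRECT so that
 * the VFS accepts RWF_NOWAIT and O_DIRECT on regular files, and it also sets
 * FMODE_CAN_ATOMIC_WRITE when the inode reports a non-zero minimum atomic
 * write size, which is what lets RWF_ATOMIC writes reach the atomic write
 * path above.
 */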

STATIC int
xfs_file_open(
	struct inode		*inode,
	struct file		*file)
{
	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
		return -EIO;
	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
	if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
	return generic_file_open(inode, file);
}

STATIC int
xfs_dir_open(
	struct inode		*inode,
	struct file		*file)
{
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		mode;
	int			error;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;
	error = generic_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_df.if_nextents > 0)
		error = xfs_dir3_data_readahead(ip, 0, 0);
	xfs_iunlock(ip, mode);
	return error;
}

/*
 * Don't bother propagating errors.  We're just doing cleanup, and the caller
 * ignores the return value anyway.
 */
STATIC int
xfs_file_release(
	struct inode		*inode,
	struct file		*file)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	/*
	 * If this is a read-only mount or the file system has been shut down,
	 * don't generate I/O.
	 */
	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
		return 0;

	/*
	 * If we previously truncated this file and removed old data in the
	 * process, we want to initiate "early" writeout on the last close.
	 * This is an attempt to combat the notorious NULL files problem which
	 * is particularly noticeable from a truncate down, buffered (re-)write
	 * (delalloc), followed by a crash.  What we are effectively doing here
	 * is significantly reducing the time window where we'd otherwise be
	 * exposed to that problem.
	 */
	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
		if (ip->i_delayed_blks > 0)
			filemap_flush(inode->i_mapping);
	}
1649 */ 1650 if (!inode->i_nlink || 1651 !(file->f_mode & FMODE_WRITE) || 1652 (ip->i_diflags & XFS_DIFLAG_APPEND) || 1653 xfs_is_zoned_inode(ip)) 1654 return 0; 1655 1656 /* 1657 * If we can't get the iolock just skip truncating the blocks past EOF 1658 * because we could deadlock with the mmap_lock otherwise. We'll get 1659 * another chance to drop them once the last reference to the inode is 1660 * dropped, so we'll never leak blocks permanently. 1661 */ 1662 if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && 1663 xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1664 if (xfs_can_free_eofblocks(ip) && 1665 !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED)) 1666 xfs_free_eofblocks(ip); 1667 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 1668 } 1669 1670 return 0; 1671 } 1672 1673 STATIC int 1674 xfs_file_readdir( 1675 struct file *file, 1676 struct dir_context *ctx) 1677 { 1678 struct inode *inode = file_inode(file); 1679 xfs_inode_t *ip = XFS_I(inode); 1680 size_t bufsize; 1681 1682 /* 1683 * The Linux API doesn't pass down the total size of the buffer 1684 * we read into down to the filesystem. With the filldir concept 1685 * it's not needed for correct information, but the XFS dir2 leaf 1686 * code wants an estimate of the buffer size to calculate it's 1687 * readahead window and size the buffers used for mapping to 1688 * physical blocks. 1689 * 1690 * Try to give it an estimate that's good enough, maybe at some 1691 * point we can change the ->readdir prototype to include the 1692 * buffer size. For now we use the current glibc buffer size. 1693 */ 1694 bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size); 1695 1696 return xfs_readdir(NULL, ip, ctx, bufsize); 1697 } 1698 1699 STATIC loff_t 1700 xfs_file_llseek( 1701 struct file *file, 1702 loff_t offset, 1703 int whence) 1704 { 1705 struct inode *inode = file->f_mapping->host; 1706 1707 if (xfs_is_shutdown(XFS_I(inode)->i_mount)) 1708 return -EIO; 1709 1710 switch (whence) { 1711 default: 1712 return generic_file_llseek(file, offset, whence); 1713 case SEEK_HOLE: 1714 offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops); 1715 break; 1716 case SEEK_DATA: 1717 offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops); 1718 break; 1719 } 1720 1721 if (offset < 0) 1722 return offset; 1723 return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 1724 } 1725 1726 static inline vm_fault_t 1727 xfs_dax_fault_locked( 1728 struct vm_fault *vmf, 1729 unsigned int order, 1730 bool write_fault) 1731 { 1732 vm_fault_t ret; 1733 pfn_t pfn; 1734 1735 if (!IS_ENABLED(CONFIG_FS_DAX)) { 1736 ASSERT(0); 1737 return VM_FAULT_SIGBUS; 1738 } 1739 ret = dax_iomap_fault(vmf, order, &pfn, NULL, 1740 (write_fault && !vmf->cow_page) ? 1741 &xfs_dax_write_iomap_ops : 1742 &xfs_read_iomap_ops); 1743 if (ret & VM_FAULT_NEEDDSYNC) 1744 ret = dax_finish_sync_fault(vmf, order, pfn); 1745 return ret; 1746 } 1747 1748 static vm_fault_t 1749 xfs_dax_read_fault( 1750 struct vm_fault *vmf, 1751 unsigned int order) 1752 { 1753 struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file)); 1754 vm_fault_t ret; 1755 1756 trace_xfs_read_fault(ip, order); 1757 1758 xfs_ilock(ip, XFS_MMAPLOCK_SHARED); 1759 ret = xfs_dax_fault_locked(vmf, order, false); 1760 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); 1761 1762 return ret; 1763 } 1764 1765 /* 1766 * Locking for serialisation of IO during page faults. 

/*
 * Locking for serialisation of IO during page faults.  This results in a lock
 * ordering of:
 *
 * mmap_lock (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
static vm_fault_t
__xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
	vm_fault_t		ret;

	trace_xfs_write_fault(ip, order);

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/*
	 * Normally we only need the shared mmaplock, but if a reflink remap is
	 * in progress we take the exclusive lock to wait for the remap to
	 * finish before taking a write fault.
	 */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
		lock_mode = XFS_MMAPLOCK_EXCL;
	}

	if (IS_DAX(inode))
		ret = xfs_dax_fault_locked(vmf, order, true);
	else
		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
				ac);
	xfs_iunlock(ip, lock_mode);

	sb_end_pagefault(inode->i_sb);
	return ret;
}

static vm_fault_t
xfs_write_fault_zoned(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	unsigned int		len = folio_size(page_folio(vmf->page));
	struct xfs_zone_alloc_ctx ac = { };
	int			error;
	vm_fault_t		ret;

	/*
	 * This could over-allocate as it doesn't check for truncation.
	 *
	 * But as the overallocation is limited to less than a folio and will
	 * be released instantly that's just fine.
	 */
	error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
			&ac);
	if (error < 0)
		return vmf_fs_error(error);
	ret = __xfs_write_fault(vmf, order, &ac);
	xfs_zoned_space_unreserve(ip, &ac);
	return ret;
}

static vm_fault_t
xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
		return xfs_write_fault_zoned(vmf, order);
	return __xfs_write_fault(vmf, order, NULL);
}

static inline bool
xfs_is_write_fault(
	struct vm_fault		*vmf)
{
	return (vmf->flags & FAULT_FLAG_WRITE) &&
	       (vmf->vma->vm_flags & VM_SHARED);
}

static vm_fault_t
xfs_filemap_fault(
	struct vm_fault		*vmf)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);

	/* DAX can shortcut the normal fault path on write faults! */
	if (IS_DAX(inode)) {
		if (xfs_is_write_fault(vmf))
			return xfs_write_fault(vmf, 0);
		return xfs_dax_read_fault(vmf, 0);
	}

	trace_xfs_read_fault(XFS_I(inode), 0);
	return filemap_fault(vmf);
}

static vm_fault_t
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
		return VM_FAULT_FALLBACK;

	/* DAX can shortcut the normal fault path on write faults! */
	if (xfs_is_write_fault(vmf))
		return xfs_write_fault(vmf, order);
	return xfs_dax_read_fault(vmf, order);
}

static vm_fault_t
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}

/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults.  In reality, it needs to serialise against truncate and
 * prepare memory for writing, so handle it as a standard write fault.
 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.huge_fault	= xfs_filemap_huge_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};

STATIC int
xfs_file_mmap(
	struct file		*file,
	struct vm_area_struct	*vma)
{
	struct inode		*inode = file_inode(file);
	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));

	/*
	 * We don't support synchronous mappings for non-DAX files and
	 * for DAX files if the underlying dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(vma, target->bt_daxdev))
		return -EOPNOTSUPP;

	file_accessed(file);
	vma->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(inode))
		vm_flags_set(vma, VM_HUGEPAGE);
	return 0;
}

const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= xfs_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap		= xfs_file_mmap,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.fadvise	= xfs_file_fadvise,
	.remap_file_range = xfs_file_remap_range,
	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
			  FOP_DONTCACHE,
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
};