// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
#include "xfs_aops.h"
#include "xfs_zone_alloc.h"

#include <linux/dax.h>
#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>
#include <linux/mount.h>

static const struct vm_operations_struct xfs_file_vm_ops;

/*
 * Decide if the given file range is aligned to the size of the fundamental
 * allocation unit for the file.
 */
bool
xfs_is_falloc_aligned(
	struct xfs_inode	*ip,
	loff_t			pos,
	long long int		len)
{
	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);

	if (!is_power_of_2(alloc_unit))
		return isaligned_64(pos, alloc_unit) &&
		       isaligned_64(len, alloc_unit);

	return !((pos | len) & (alloc_unit - 1));
}

/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush, and thus also no need for explicit
 * cache flush operations, and there are no non-transaction metadata updates
 * on directories either.
 */
STATIC int
xfs_dir_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);

	trace_xfs_dir_fsync(ip);
	return xfs_log_force_inode(ip);
}

static xfs_csn_t
xfs_fsync_seq(
	struct xfs_inode	*ip,
	bool			datasync)
{
	if (!xfs_ipincount(ip))
		return 0;
	if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
		return 0;
	return ip->i_itemp->ili_commit_seq;
}

/*
 * All metadata updates are logged, which means that we just have to flush the
 * log up to the latest LSN that touched the inode.
 *
 * If we have concurrent fsync/fdatasync() calls, we need them to all block on
 * the log force before we clear the ili_fsync_fields field. This ensures that
 * we don't get a racing sync operation that does not wait for the metadata to
 * hit the journal before returning. If we race with clearing ili_fsync_fields,
 * then all that will happen is the log force will do nothing as the lsn will
 * already be on disk. We can't race with setting ili_fsync_fields because that
 * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
 * shared until after the ili_fsync_fields is cleared.
 */
static int
xfs_fsync_flush_log(
	struct xfs_inode	*ip,
	bool			datasync,
	int			*log_flushed)
{
	int			error = 0;
	xfs_csn_t		seq;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	seq = xfs_fsync_seq(ip, datasync);
	if (seq) {
		error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
					  log_flushed);

		spin_lock(&ip->i_itemp->ili_lock);
		ip->i_itemp->ili_fsync_fields = 0;
		spin_unlock(&ip->i_itemp->ili_lock);
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return error;
}

STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	int			error, err2;
	int			log_flushed = 0;

	trace_xfs_file_fsync(ip);

	error = file_write_and_wait_range(file, start, end);
	if (error)
		return error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache of the device used for file data first. This is to
	 * ensure newly written file data makes it to disk before logging the
	 * new inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);

	/*
	 * Any inode that has dirty modifications in the log is pinned. The
	 * racy check here for a pinned inode will not catch modifications
	 * that happen concurrently to the fsync call, but fsync semantics
	 * only require syncing previously completed I/O.
	 */
	if (xfs_ipincount(ip)) {
		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
		if (err2 && !error)
			error = err2;
	}

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp) {
		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
		if (err2 && !error)
			error = err2;
	}

	return error;
}

static int
xfs_ilock_iocb(
	struct kiocb		*iocb,
	unsigned int		lock_mode)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, lock_mode))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, lock_mode);
	}

	return 0;
}

static int
xfs_ilock_iocb_for_write(
	struct kiocb		*iocb,
	unsigned int		*lock_mode)
{
	ssize_t			ret;
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	ret = xfs_ilock_iocb(iocb, *lock_mode);
	if (ret)
		return ret;

	/*
	 * If a reflink remap is in progress we always need to take the iolock
	 * exclusively to wait for it to finish.
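	 * Note that the relock honours IOCB_NOWAIT, so this can still return
	 * -EAGAIN for non-blocking I/O.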
	 */
	if (*lock_mode == XFS_IOLOCK_SHARED &&
	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, *lock_mode);
		*lock_mode = XFS_IOLOCK_EXCL;
		return xfs_ilock_iocb(iocb, *lock_mode);
	}

	return 0;
}

STATIC ssize_t
xfs_file_dio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_direct_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	file_accessed(iocb->ki_filp);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

static noinline ssize_t
xfs_file_dax_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	ssize_t			ret = 0;

	trace_xfs_file_dax_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	file_accessed(iocb->ki_filp);
	return ret;
}

STATIC ssize_t
xfs_file_buffered_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_buffered_read(iocb, to);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = generic_file_read_iter(iocb, to);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_read(iocb, to);
	else if (iocb->ki_flags & IOCB_DIRECT)
		ret = xfs_file_dio_read(iocb, to);
	else
		ret = xfs_file_buffered_read(iocb, to);

	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

STATIC ssize_t
xfs_file_splice_read(
	struct file		*in,
	loff_t			*ppos,
	struct pipe_inode_info	*pipe,
	size_t			len,
	unsigned int		flags)
{
	struct inode		*inode = file_inode(in);
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	trace_xfs_file_splice_read(ip, *ppos, len);

	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	ret = filemap_splice_read(in, ppos, pipe, len, flags);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

/*
 * Take care of zeroing post-EOF blocks when they might exist.
 *
 * Returns 0 on success, a negative error for a failure, or 1 if this
 * function dropped the iolock and reacquired it exclusively and the caller
 * needs to restart the write sanity checks.
 */
static ssize_t
xfs_file_write_zero_eof(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock,
	size_t			count,
	bool			*drained_dio,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	loff_t			isize;
	int			error;

	/*
	 * We need to serialise against EOF updates that occur in IO completions
	 * here. We want to make sure that nobody is changing the size while
	 * we do this check until we have placed an IO barrier (i.e. hold
	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
	 * spinlock effectively forms a memory barrier once we have
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
	 * hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	isize = i_size_read(VFS_I(ip));
	if (iocb->ki_pos <= isize) {
		spin_unlock(&ip->i_flags_lock);
		return 0;
	}
	spin_unlock(&ip->i_flags_lock);

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EAGAIN;

	if (!*drained_dio) {
		/*
		 * If zeroing is needed and we are currently holding the iolock
		 * shared, we need to update it to exclusive which implies
		 * having to redo all checks before.
		 */
		if (*iolock == XFS_IOLOCK_SHARED) {
			xfs_iunlock(ip, *iolock);
			*iolock = XFS_IOLOCK_EXCL;
			xfs_ilock(ip, *iolock);
			iov_iter_reexpand(from, count);
		}

		/*
		 * We now have an IO submission barrier in place, but AIO can do
		 * EOF updates during IO completion and hence we now need to
		 * wait for all of them to drain. Non-AIO DIO will have drained
		 * before we are given the XFS_IOLOCK_EXCL, and so for most
		 * cases this wait is a no-op.
		 */
		inode_dio_wait(VFS_I(ip));
		*drained_dio = true;
		return 1;
	}

	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);

	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);

	return error;
}

/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held. Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;
	ssize_t			error;

restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		error = break_layout(inode, false);
		if (error == -EWOULDBLOCK)
			error = -EAGAIN;
	} else {
		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
	}

	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
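	 * If the iolock has to be cycled to upgrade it, redo all of the checks
	 * from the top as the file may have changed underneath us.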
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(XFS_I(inode), *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		error = xfs_ilock_iocb(iocb, *iolock);
		if (error) {
			*iolock = 0;
			return error;
		}
		goto restart;
	}

	/*
	 * If the offset is beyond the size of the file, we need to zero all
	 * blocks that fall between the existing EOF and the start of this
	 * write.
	 *
	 * We can do an unlocked check for i_size here safely as I/O completion
	 * can only extend EOF. Truncate is locked out at this point, so the
	 * EOF cannot move backwards, only forwards. Hence we only need to take
	 * the slow path when we are at or beyond the current EOF.
	 */
	if (iocb->ki_pos > i_size_read(inode)) {
		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
				&drained_dio, ac);
		if (error == 1)
			goto restart;
		if (error)
			return error;
	}

	return kiocb_modified(iocb);
}

static ssize_t
xfs_zoned_write_space_reserve(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		flags,
	struct xfs_zone_alloc_ctx *ac)
{
	loff_t			count = iov_iter_count(from);
	int			error;

	if (iocb->ki_flags & IOCB_NOWAIT)
		flags |= XFS_ZR_NOWAIT;

	/*
	 * Check the rlimit and LFS boundary first so that we don't over-reserve
	 * by possibly a lot.
	 *
	 * The generic write path will redo this check later, and it might have
	 * changed by then. If it got expanded we'll stick to our earlier
	 * smaller limit, and if it is decreased the new smaller limit will be
	 * used and our extra space reservation will be returned after finishing
	 * the write.
	 */
	error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
	if (error)
		return error;

	/*
	 * Sloppily round up count to file system blocks.
	 *
	 * This will often reserve an extra block, but that avoids having to look
	 * at the start offset, which isn't stable for O_APPEND until taking the
	 * iolock. Also we need to reserve a block each for zeroing the old
	 * EOF block and the new start block if they are unaligned.
	 *
	 * Any remaining blocks will be returned after the write.
	 */
	return xfs_zoned_space_reserve(ip,
			XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
}

static int
xfs_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,
	int			error,
	unsigned		flags)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			offset = iocb->ki_pos;
	unsigned int		nofs_flag;

	ASSERT(!xfs_is_zoned_inode(ip) ||
	       !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (error)
		return error;
	if (!size)
		return 0;

	/*
	 * Capture amount written on completion as we can't reliably account
	 * for it on submission.
	 */
	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim. To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	if (flags & IOMAP_DIO_COW) {
		error = xfs_reflink_end_cow(ip, offset, size);
		if (error)
			goto out;
	}

	/*
	 * Unwritten conversion updates the in-core isize after extent
	 * conversion but before updating the on-disk size. Updating isize any
	 * earlier allows a racing dio read to find unwritten extents before
	 * they are converted.
	 */
	if (flags & IOMAP_DIO_UNWRITTEN) {
		error = xfs_iomap_write_unwritten(ip, offset, size, true);
		goto out;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size. We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 *
	 * As IO completion only ever extends EOF, we can do an unlocked check
	 * here to avoid taking the spinlock. If we land within the current EOF,
	 * then we do not need to do an extending update at all, and we don't
	 * need to take the lock to check this. If we race with an update moving
	 * EOF, then we'll either still be beyond EOF and need to take the lock,
	 * or we'll be within EOF and we don't need to take it at all.
	 */
	if (offset + size <= i_size_read(inode))
		goto out;

	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		spin_unlock(&ip->i_flags_lock);
		error = xfs_setfilesize(ip, offset, size);
	} else {
		spin_unlock(&ip->i_flags_lock);
	}

out:
	memalloc_nofs_restore(nofs_flag);
	return error;
}

static const struct iomap_dio_ops xfs_dio_write_ops = {
	.end_io		= xfs_dio_write_end_io,
};

static void
xfs_dio_zoned_submit_io(
	const struct iomap_iter	*iter,
	struct bio		*bio,
	loff_t			file_offset)
{
	struct xfs_mount	*mp = XFS_I(iter->inode)->i_mount;
	struct xfs_zone_alloc_ctx *ac = iter->private;
	xfs_filblks_t		count_fsb;
	struct iomap_ioend	*ioend;

	count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
	if (count_fsb > ac->reserved_blocks) {
		xfs_err(mp,
"allocation (%lld) larger than reservation (%lld).",
			count_fsb, ac->reserved_blocks);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		bio_io_error(bio);
		return;
	}
	ac->reserved_blocks -= count_fsb;

	bio->bi_end_io = xfs_end_bio;
	ioend = iomap_init_ioend(iter->inode, bio, file_offset,
			IOMAP_IOEND_DIRECT);
	xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
}

static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
	.bio_set	= &iomap_ioend_bioset,
	.submit_io	= xfs_dio_zoned_submit_io,
	.end_io		= xfs_dio_write_end_io,
};

/*
 * Handle block aligned direct I/O writes.
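 *
 * These can be issued under a shared iolock as the dio layer does not need to
 * do any sub-block zeroing for them.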
 */
static noinline ssize_t
xfs_file_dio_write_aligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from,
	const struct iomap_ops	*ops,
	const struct iomap_dio_ops *dops,
	struct xfs_zone_alloc_ctx *ac)
{
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	ssize_t			ret;

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock, ac);
	if (ret)
		goto out_unlock;

	/*
	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
	 * the iolock back to shared if we had to take the exclusive lock in
	 * xfs_file_write_checks() for other reasons.
	 */
	if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}
	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
out_unlock:
	xfs_iunlock(ip, iolock);
	return ret;
}

/*
 * Handle block aligned direct I/O writes to zoned devices.
 */
static noinline ssize_t
xfs_file_dio_write_zoned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_zone_alloc_ctx ac = { };
	ssize_t			ret;

	ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
	if (ret < 0)
		return ret;
	ret = xfs_file_dio_write_aligned(ip, iocb, from,
			&xfs_zoned_direct_write_iomap_ops,
			&xfs_dio_zoned_write_ops, &ac);
	xfs_zoned_space_unreserve(ip, &ac);
	return ret;
}

/*
 * Handle block unaligned direct I/O writes.
 *
 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
 * them to be done in parallel with reads and other direct I/O writes. However,
 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
 * to do sub-block zeroing and that requires serialisation against other direct
 * I/O to the same block. In this case we need to serialise the submission of
 * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
 * In the case where sub-block zeroing is not required, we can do concurrent
 * sub-block dios to the same block successfully.
 *
 * Optimistically submit the I/O using the shared lock first, but use the
 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
 * if block allocation or partial block zeroing would be required. In that case
 * we try again with the exclusive lock.
 */
static noinline ssize_t
xfs_file_dio_write_unaligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	size_t			isize = i_size_read(VFS_I(ip));
	size_t			count = iov_iter_count(from);
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
	ssize_t			ret;

	/*
	 * Extending writes need exclusivity because of the sub-block zeroing
	 * that the DIO code always does for partial tail blocks beyond EOF, so
	 * don't even bother trying the fast path in this case.
	 */
	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
retry_exclusive:
		iolock = XFS_IOLOCK_EXCL;
		flags = IOMAP_DIO_FORCE_WAIT;
	}

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;

	/*
	 * We can't properly handle unaligned direct I/O to reflink files yet,
	 * as we can't unshare a partial block.
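	 * Returning -ENOTBLK here makes the write fall back to buffered I/O.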
	 */
	if (xfs_is_cow_inode(ip)) {
		trace_xfs_reflink_bounce_dio_write(iocb, from);
		ret = -ENOTBLK;
		goto out_unlock;
	}

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out_unlock;

	/*
	 * If we are doing exclusive unaligned I/O, this must be the only I/O
	 * in-flight. Otherwise we risk data corruption due to unwritten extent
	 * conversions from the AIO end_io handler. Wait for all other I/O to
	 * drain first.
	 */
	if (flags & IOMAP_DIO_FORCE_WAIT)
		inode_dio_wait(VFS_I(ip));

	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			&xfs_dio_write_ops, flags, NULL, 0);

	/*
	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
	 * layer rejected it for mapping or locking reasons. If we are doing
	 * nonblocking user I/O, propagate the error.
	 */
	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
		xfs_iunlock(ip, iolock);
		goto retry_exclusive;
	}

out_unlock:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}

static ssize_t
xfs_file_dio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
	size_t			count = iov_iter_count(from);

	/* direct I/O must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
		return -EINVAL;

	/*
	 * For always COW inodes we also must check the alignment of each
	 * individual iovec segment, as they could end up with different
	 * I/Os due to the way bio_iov_iter_get_pages works, and we'd
	 * then overwrite an already written block.
	 */
	if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
	    (xfs_is_always_cow_inode(ip) &&
	     (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
		return xfs_file_dio_write_unaligned(ip, iocb, from);
	if (xfs_is_zoned_inode(ip))
		return xfs_file_dio_write_zoned(ip, iocb, from);
	return xfs_file_dio_write_aligned(ip, iocb, from,
			&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}

static noinline ssize_t
xfs_file_dax_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	ssize_t			ret, error = 0;
	loff_t			pos;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

	pos = iocb->ki_pos;

	trace_xfs_file_dax_write(iocb, from);
	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
	}
out:
	if (iolock)
		xfs_iunlock(ip, iolock);
	if (error)
		return error;

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_buffered_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	bool			cleared_space = false;
	unsigned int		iolock;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops, NULL);

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error. In the case of ENOSPC, first try to
	 * write back all dirty inodes to free up some of the excess reserved
	 * metadata space. This reduces the chances that the eofblocks scan
	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
	 * also behaves as a filter to prevent too many eofblocks scans from
	 * running at the same time. Use a synchronous scan to increase the
	 * effectiveness of the scan.
	 */
	if (ret == -EDQUOT && !cleared_space) {
		xfs_iunlock(ip, iolock);
		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
		cleared_space = true;
		goto write_retry;
	} else if (ret == -ENOSPC && !cleared_space) {
		struct xfs_icwalk	icw = {0};

		cleared_space = true;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);
		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
		xfs_blockgc_free_space(ip->i_mount, &icw);
		goto write_retry;
	}

out:
	if (iolock)
		xfs_iunlock(ip, iolock);

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_buffered_write_zoned(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	bool			cleared_space = false;
	struct xfs_zone_alloc_ctx ac = { };
	ssize_t			ret;

	ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
	if (ret < 0)
		return ret;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		goto out_unreserve;

	ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
	if (ret)
		goto out_unlock;

	/*
	 * Truncate the iter to the length that we were actually able to
	 * allocate blocks for. This needs to happen after
	 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
	 * writes.
	 */
	iov_iter_truncate(from,
			XFS_FSB_TO_B(mp, ac.reserved_blocks) -
			(iocb->ki_pos & mp->m_blockmask));
	if (!iov_iter_count(from))
		goto out_unlock;

retry:
	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops, &ac);
	if (ret == -ENOSPC && !cleared_space) {
		/*
		 * Kick off writeback to convert delalloc space and release the
		 * usually too pessimistic indirect block reservations.
		 */
		xfs_flush_inodes(mp);
		cleared_space = true;
		goto retry;
	}

out_unlock:
	xfs_iunlock(ip, iolock);
out_unreserve:
	xfs_zoned_space_unreserve(ip, &ac);
	if (ret > 0) {
		XFS_STATS_ADD(mp, xs_write_bytes, ret);
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (IS_DAX(inode))
		return xfs_file_dax_write(iocb, from);

	if (iocb->ki_flags & IOCB_ATOMIC) {
		/*
		 * Currently only atomic writing of a single FS block is
		 * supported. It would be possible to atomically write smaller
		 * than a FS block, but there is no requirement to support
		 * this. Note that iomap also does not support this yet.
		 */
		if (ocount != ip->i_mount->m_sb.sb_blocksize)
			return -EINVAL;
		ret = generic_atomic_write_valid(iocb, from);
		if (ret)
			return ret;
	}

	if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW. In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_write(iocb, from);
		if (ret != -ENOTBLK)
			return ret;
	}

	if (xfs_is_zoned_inode(ip))
		return xfs_file_buffered_write_zoned(iocb, from);
	return xfs_file_buffered_write(iocb, from);
}

/* Does this file, inode, or mount want synchronous writes? */
static inline bool xfs_file_sync_writes(struct file *filp)
{
	struct xfs_inode	*ip = XFS_I(file_inode(filp));

	if (xfs_has_wsync(ip->i_mount))
		return true;
	if (filp->f_flags & (__O_SYNC | O_DSYNC))
		return true;
	if (IS_SYNC(file_inode(filp)))
		return true;

	return false;
}

static int
xfs_falloc_newsize(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	loff_t			*new_size)
{
	struct inode		*inode = file_inode(file);

	if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
		return 0;
	*new_size = offset + len;
	return inode_newsize_ok(inode, *new_size);
}

static int
xfs_falloc_setsize(
	struct file		*file,
	loff_t			new_size)
{
	struct iattr		iattr = {
		.ia_valid	= ATTR_SIZE,
		.ia_size	= new_size,
	};

	if (!new_size)
		return 0;
	return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
			&iattr);
}

static int
xfs_falloc_collapse_range(
	struct file		*file,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = i_size_read(inode) - len;
	int			error;

	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
		return -EINVAL;

	/*
	 * There is no need to overlap collapse range with EOF, in which case
	 * it is effectively a truncate operation.
	 */
	if (offset + len >= i_size_read(inode))
		return -EINVAL;

	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

static int
xfs_falloc_insert_range(
	struct file		*file,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			isize = i_size_read(inode);
	int			error;

	if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
		return -EINVAL;

	/*
	 * New inode size must not exceed ->s_maxbytes, accounting for
	 * possible signed overflow.
	 */
	if (inode->i_sb->s_maxbytes - isize < len)
		return -EFBIG;

	/* Offset should be less than i_size */
	if (offset >= isize)
		return -EINVAL;

	error = xfs_falloc_setsize(file, isize + len);
	if (error)
		return error;

	/*
	 * Perform hole insertion now that the file size has been updated so
	 * that if we crash during the operation we don't leave shifted extents
	 * past EOF and hence lose access to the data that is contained within
	 * them.
	 */
	return xfs_insert_file_space(XFS_I(inode), offset, len);
}

/*
 * Punch a hole and prealloc the range. We use a hole punch rather than
 * unwritten extent conversion for two reasons:
 *
 * 1.) Hole punch handles partial block zeroing for us.
 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by
 *     virtue of the hole punch.
 */
static int
xfs_falloc_zero_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	unsigned int		blksize = i_blocksize(inode);
	loff_t			new_size = 0;
	int			error;

	trace_xfs_zero_file_space(XFS_I(inode));

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
	if (error)
		return error;

	len = round_up(offset + len, blksize) - round_down(offset, blksize);
	offset = round_down(offset, blksize);
	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

static int
xfs_falloc_unshare_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = 0;
	int			error;

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_reflink_unshare(XFS_I(inode), offset, len);
	if (error)
		return error;

	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

static int
xfs_falloc_allocate_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = 0;
	int			error;

	/*
	 * In always_cow mode we can't use preallocations and thus should not
	 * create them.
	 */
	if (xfs_is_always_cow_inode(XFS_I(inode)))
		return -EOPNOTSUPP;

	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
	if (error)
		return error;

	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
}

#define	XFS_FALLOC_FL_SUPPORTED						\
	(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |			\
	 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |		\
	 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)

STATIC long
__xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
	if (error)
		goto out_unlock;

	/*
	 * Must wait for all AIO to complete before we continue as AIO can
	 * change the file size on completion without holding any locks we
	 * currently hold. We must do this first because AIO can update both
	 * the on disk and in memory inode sizes, and the operations that
	 * follow require the in-memory size to be fully up-to-date.
	 */
	inode_dio_wait(inode);

	error = file_modified(file);
	if (error)
		goto out_unlock;

	switch (mode & FALLOC_FL_MODE_MASK) {
	case FALLOC_FL_PUNCH_HOLE:
		error = xfs_free_file_space(ip, offset, len, ac);
		break;
	case FALLOC_FL_COLLAPSE_RANGE:
		error = xfs_falloc_collapse_range(file, offset, len, ac);
		break;
	case FALLOC_FL_INSERT_RANGE:
		error = xfs_falloc_insert_range(file, offset, len);
		break;
	case FALLOC_FL_ZERO_RANGE:
		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
		break;
	case FALLOC_FL_UNSHARE_RANGE:
		error = xfs_falloc_unshare_range(file, mode, offset, len);
		break;
	case FALLOC_FL_ALLOCATE_RANGE:
		error = xfs_falloc_allocate_range(file, mode, offset, len);
		break;
	default:
		error = -EOPNOTSUPP;
		break;
	}

	if (!error && xfs_file_sync_writes(file))
		error = xfs_log_force_inode(ip);

out_unlock:
	xfs_iunlock(ip, iolock);
	return error;
}

static long
xfs_file_zoned_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct xfs_zone_alloc_ctx ac = { };
	struct xfs_inode	*ip = XFS_I(file_inode(file));
	int			error;

	error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
	if (error)
		return error;
	error = __xfs_file_fallocate(file, mode, offset, len, &ac);
	xfs_zoned_space_unreserve(ip, &ac);
	return error;
}

static long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	/*
	 * For zoned file systems, zeroing the first and last block of a hole
	 * punch requires allocating a new block to rewrite the remaining data
	 * and new zeroes out of place. Get a reservation for those before
	 * taking the iolock. Dip into the reserved pool because we are
	 * expected to be able to punch a hole even on a completely full
	 * file system.
	 */
	if (xfs_is_zoned_inode(XFS_I(inode)) &&
	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
		     FALLOC_FL_COLLAPSE_RANGE)))
		return xfs_file_zoned_fallocate(file, mode, offset, len);
	return __xfs_file_fallocate(file, mode, offset, len, NULL);
}

STATIC int
xfs_file_fadvise(
	struct file	*file,
	loff_t		start,
	loff_t		end,
	int		advice)
{
	struct xfs_inode *ip = XFS_I(file_inode(file));
	int ret;
	int lockflags = 0;

	/*
	 * Operations creating pages in page cache need protection from hole
	 * punching and similar ops.
	 */
	if (advice == POSIX_FADV_WILLNEED) {
		lockflags = XFS_IOLOCK_SHARED;
		xfs_ilock(ip, lockflags);
	}
	ret = generic_fadvise(file, start, end, advice);
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	return ret;
}

STATIC loff_t
xfs_file_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	loff_t			remapped = 0;
	xfs_extlen_t		cowextsize;
	int			ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (!xfs_has_reflink(mp))
		return -EOPNOTSUPP;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* Prepare and then clone file data. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, remap_flags);
	if (ret || len == 0)
		return ret;

	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
			&remapped);
	if (ret)
		goto out_unlock;

	/*
	 * Carry the cowextsize hint from src to dest if we're sharing the
	 * entire source file to the entire destination file, the source file
	 * has a cowextsize hint, and the destination file does not.
	 */
	cowextsize = 0;
	if (pos_in == 0 && len == i_size_read(inode_in) &&
	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
	    pos_out == 0 && len >= i_size_read(inode_out) &&
	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
		cowextsize = src->i_cowextsize;

	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
			remap_flags);
	if (ret)
		goto out_unlock;

	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
		xfs_log_force_inode(dest);
out_unlock:
	xfs_iunlock2_remapping(src, dest);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	/*
	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
	 * handle partial results -- either the whole remap succeeds, or we
	 * must say why it did not. In this case, any error should be returned
	 * to the caller.
	 */
	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
		return ret;
	return remapped > 0 ? remapped : ret;
}

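/*
 * Don't allow opens on a shut down filesystem, and advertise that we can
 * handle IOCB_NOWAIT and O_DIRECT based I/O, plus atomic writes where the
 * inode supports them.
 */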
STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
		return -EIO;
	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
	if (xfs_inode_can_atomicwrite(XFS_I(inode)))
		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
	return generic_file_open(inode, file);
}

STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	unsigned int	mode;
	int		error;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;
	error = generic_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_df.if_nextents > 0)
		error = xfs_dir3_data_readahead(ip, 0, 0);
	xfs_iunlock(ip, mode);
	return error;
}

/*
 * Don't bother propagating errors. We're just doing cleanup, and the caller
 * ignores the return value anyway.
 */
STATIC int
xfs_file_release(
	struct inode		*inode,
	struct file		*file)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	/*
	 * If this is a read-only mount or the file system has been shut down,
	 * don't generate I/O.
	 */
	if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
		return 0;

	/*
	 * If we previously truncated this file and removed old data in the
	 * process, we want to initiate "early" writeout on the last close.
	 * This is an attempt to combat the notorious NULL files problem which
	 * is particularly noticeable from a truncate down, buffered (re-)write
	 * (delalloc), followed by a crash. What we are effectively doing here
	 * is significantly reducing the time window where we'd otherwise be
	 * exposed to that problem.
	 */
	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
		xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
		if (ip->i_delayed_blks > 0)
			filemap_flush(inode->i_mapping);
	}

	/*
	 * XFS aggressively preallocates post-EOF space to generate contiguous
	 * allocations for writers that append to the end of the file.
	 *
	 * To support workloads that close and reopen the file frequently, these
	 * preallocations usually persist after a close unless it is the first
	 * close for the inode. This is a tradeoff to generate tightly packed
	 * data layouts for unpacking tarballs or similar archives that write
	 * one file after another without going back to it while keeping the
	 * preallocation for files that have recurring open/write/close cycles.
	 *
	 * This heuristic is skipped for inodes with the append-only flag as
	 * that flag is rather pointless for inodes written only once.
	 *
	 * There is no point in freeing blocks here for open but unlinked files
	 * as they will be taken care of by the inactivation path soon.
	 *
	 * When releasing a read-only context, don't flush data or trim post-EOF
	 * blocks. This prevents open/read/close workloads from removing EOF
	 * blocks that other writers depend upon to reduce fragmentation.
	 *
	 * Inodes on the zoned RT device never have preallocations, so skip
	 * taking the locks below.
	 */
	if (!inode->i_nlink ||
	    !(file->f_mode & FMODE_WRITE) ||
	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
	    xfs_is_zoned_inode(ip))
		return 0;

	/*
	 * If we can't get the iolock just skip truncating the blocks past EOF
	 * because we could deadlock with the mmap_lock otherwise. We'll get
	 * another chance to drop them once the last reference to the inode is
	 * dropped, so we'll never leak blocks permanently.
	 */
	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (xfs_can_free_eofblocks(ip) &&
		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
			xfs_free_eofblocks(ip);
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	}

	return 0;
}

STATIC int
xfs_file_readdir(
	struct file	*file,
	struct dir_context *ctx)
{
	struct inode	*inode = file_inode(file);
	xfs_inode_t	*ip = XFS_I(inode);
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass the total size of the buffer we read
	 * into down to the filesystem. With the filldir concept it's not
	 * needed for correct information, but the XFS dir2 leaf code wants
	 * an estimate of the buffer size to calculate its readahead window
	 * and size the buffers used for mapping to physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size. For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);

	return xfs_readdir(NULL, ip, ctx, bufsize);
}

STATIC loff_t
xfs_file_llseek(
	struct file	*file,
	loff_t		offset,
	int		whence)
{
	struct inode		*inode = file->f_mapping->host;

	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
		return -EIO;

	switch (whence) {
	default:
		return generic_file_llseek(file, offset, whence);
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
		break;
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

static inline vm_fault_t
xfs_dax_fault_locked(
	struct vm_fault		*vmf,
	unsigned int		order,
	bool			write_fault)
{
	vm_fault_t		ret;
	pfn_t			pfn;

	if (!IS_ENABLED(CONFIG_FS_DAX)) {
		ASSERT(0);
		return VM_FAULT_SIGBUS;
	}
	ret = dax_iomap_fault(vmf, order, &pfn, NULL,
			(write_fault && !vmf->cow_page) ?
				&xfs_dax_write_iomap_ops :
				&xfs_read_iomap_ops);
	if (ret & VM_FAULT_NEEDDSYNC)
		ret = dax_finish_sync_fault(vmf, order, pfn);
	return ret;
}

static vm_fault_t
xfs_dax_read_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	vm_fault_t		ret;

	trace_xfs_read_fault(ip, order);

	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	ret = xfs_dax_fault_locked(vmf, order, false);
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);

	return ret;
}

/*
 * Locking for serialisation of IO during page faults. This results in a lock
 * ordering of:
 *
 * mmap_lock (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
static vm_fault_t
__xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		lock_mode = XFS_MMAPLOCK_SHARED;
	vm_fault_t		ret;

	trace_xfs_write_fault(ip, order);

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/*
	 * Normally we only need the shared mmaplock, but if a reflink remap is
	 * in progress we take the exclusive lock to wait for the remap to
	 * finish before taking a write fault.
	 */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
		lock_mode = XFS_MMAPLOCK_EXCL;
	}

	if (IS_DAX(inode))
		ret = xfs_dax_fault_locked(vmf, order, true);
	else
		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
				ac);
	xfs_iunlock(ip, lock_mode);

	sb_end_pagefault(inode->i_sb);
	return ret;
}

static vm_fault_t
xfs_write_fault_zoned(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	unsigned int		len = folio_size(page_folio(vmf->page));
	struct xfs_zone_alloc_ctx ac = { };
	int			error;
	vm_fault_t		ret;

	/*
	 * This could over-allocate as it doesn't check for truncation.
	 *
	 * But as the overallocation is limited to less than a folio and will be
	 * released instantly that's just fine.
	 */
	error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
			&ac);
	if (error < 0)
		return vmf_fs_error(error);
	ret = __xfs_write_fault(vmf, order, &ac);
	xfs_zoned_space_unreserve(ip, &ac);
	return ret;
}

static vm_fault_t
xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
		return xfs_write_fault_zoned(vmf, order);
	return __xfs_write_fault(vmf, order, NULL);
}

static inline bool
xfs_is_write_fault(
	struct vm_fault		*vmf)
{
	return (vmf->flags & FAULT_FLAG_WRITE) &&
	       (vmf->vma->vm_flags & VM_SHARED);
}

static vm_fault_t
xfs_filemap_fault(
	struct vm_fault		*vmf)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);

	/* DAX can shortcut the normal fault path on write faults! */
	if (IS_DAX(inode)) {
		if (xfs_is_write_fault(vmf))
			return xfs_write_fault(vmf, 0);
		return xfs_dax_read_fault(vmf, 0);
	}

	trace_xfs_read_fault(XFS_I(inode), 0);
	return filemap_fault(vmf);
}

static vm_fault_t
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
		return VM_FAULT_FALLBACK;

	/* DAX can shortcut the normal fault path on write faults! */
	if (xfs_is_write_fault(vmf))
		return xfs_write_fault(vmf, order);
	return xfs_dax_read_fault(vmf, order);
}

static vm_fault_t
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}

/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults. In reality, it needs to serialise against truncate and
 * prepare memory for writing, so handle it as a standard write fault.
 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.huge_fault	= xfs_filemap_huge_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};

STATIC int
xfs_file_mmap(
	struct file	*file,
	struct vm_area_struct *vma)
{
	struct inode		*inode = file_inode(file);
	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));

	/*
	 * We don't support synchronous mappings for non-DAX files and
	 * for DAX files if the underlying dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(vma, target->bt_daxdev))
		return -EOPNOTSUPP;

	file_accessed(file);
	vma->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(inode))
		vm_flags_set(vma, VM_HUGEPAGE);
	return 0;
}

const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= xfs_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap		= xfs_file_mmap,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.fadvise	= xfs_file_fadvise,
	.remap_file_range = xfs_file_remap_range,
	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
			  FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
			  FOP_DONTCACHE,
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
};