// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs_platform.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
#include "xfs_aops.h"
#include "xfs_zone_alloc.h"
#include "xfs_error.h"
#include "xfs_errortag.h"

#include <linux/dax.h>
#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>
#include <linux/mount.h>
#include <linux/filelock.h>

static const struct vm_operations_struct xfs_file_vm_ops;

/*
 * Decide if the given file range is aligned to the size of the fundamental
 * allocation unit for the file.
 */
bool
xfs_is_falloc_aligned(
	struct xfs_inode	*ip,
	loff_t			pos,
	long long int		len)
{
	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);

	if (!is_power_of_2(alloc_unit))
		return isaligned_64(pos, alloc_unit) &&
		       isaligned_64(len, alloc_unit);

	return !((pos | len) & (alloc_unit - 1));
}

/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush, and thus also no need for explicit
 * cache flush operations, and there are no non-transaction metadata updates
 * on directories either.
 */
STATIC int
xfs_dir_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);

	trace_xfs_dir_fsync(ip);
	return xfs_log_force_inode(ip);
}

/*
 * All metadata updates are logged, which means that we just have to push the
 * journal to the required sequence number that holds the updates. We track
 * datasync commits separately from full sync commits, and hence only need to
 * select the correct sequence number for the log force here.
 *
 * We don't have to serialise against concurrent modifications, as we do not
 * have to wait for modifications that have not yet completed. We define a
 * transaction commit as completing when the commit sequence number is updated,
 * hence if the sequence number has not updated, the sync operation has been
 * run before the commit completed and we don't have to wait for it.
 *
 * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
 * set on the log item until - at least - the journal flush completes. In
 * reality, they are only cleared when the inode is fully unpinned (i.e.
 * persistent in the journal and not dirty in the CIL), and so we rely on
 * xfs_log_force_seq() either skipping sequences that have been persisted or
 * waiting on sequences that are still in flight to correctly order concurrent
 * sync operations.
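 *
 * In outline, the sequence selection boils down to something like the sketch
 * below.  This is only a summary of xfs_fsync_flush_log() that follows, not
 * additional behaviour:
 *
 *	spin_lock(&iip->ili_lock);
 *	seq = datasync ? iip->ili_datasync_seq : iip->ili_commit_seq;
 *	spin_unlock(&iip->ili_lock);
 *	if (seq)
 *		error = xfs_log_force_seq(mp, seq, XFS_LOG_SYNC, log_flushed);
 *
 * i.e. a zero sequence number means nothing relevant has been committed since
 * the last flush and the sync can return without touching the log at all.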
 */
static int
xfs_fsync_flush_log(
	struct xfs_inode	*ip,
	bool			datasync,
	int			*log_flushed)
{
	struct xfs_inode_log_item *iip = ip->i_itemp;
	xfs_csn_t		seq = 0;

	spin_lock(&iip->ili_lock);
	if (datasync)
		seq = iip->ili_datasync_seq;
	else
		seq = iip->ili_commit_seq;
	spin_unlock(&iip->ili_lock);

	if (!seq)
		return 0;

	return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
			log_flushed);
}

STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	int			error, err2;
	int			log_flushed = 0;

	trace_xfs_file_fsync(ip);

	error = file_write_and_wait_range(file, start, end);
	if (error)
		return error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache of the device used for file data first.  This is to
	 * ensure newly written file data make it to disk before logging the new
	 * inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);

	/*
	 * If the inode has an inode log item attached, it may need the journal
	 * flushed to persist any changes the log item might be tracking.
	 */
	if (ip->i_itemp) {
		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
		if (err2 && !error)
			error = err2;
	}

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp) {
		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
		if (err2 && !error)
			error = err2;
	}

	return error;
}

static int
xfs_ilock_iocb(
	struct kiocb		*iocb,
	unsigned int		lock_mode)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, lock_mode))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, lock_mode);
	}

	return 0;
}

static int
xfs_ilock_iocb_for_write(
	struct kiocb		*iocb,
	unsigned int		*lock_mode)
{
	ssize_t			ret;
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	ret = xfs_ilock_iocb(iocb, *lock_mode);
	if (ret)
		return ret;

	/*
	 * If a reflink remap is in progress we always need to take the iolock
	 * exclusively to wait for it to finish.
	 */
	if (*lock_mode == XFS_IOLOCK_SHARED &&
	    xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, *lock_mode);
		*lock_mode = XFS_IOLOCK_EXCL;
		return xfs_ilock_iocb(iocb, *lock_mode);
	}

	return 0;
}

STATIC ssize_t
xfs_file_dio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_direct_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	file_accessed(iocb->ki_filp);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

static noinline ssize_t
xfs_file_dax_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	ssize_t			ret = 0;

	trace_xfs_file_dax_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	file_accessed(iocb->ki_filp);
	return ret;
}

STATIC ssize_t
xfs_file_buffered_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_buffered_read(iocb, to);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = generic_file_read_iter(iocb, to);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_read(iocb, to);
	else if (iocb->ki_flags & IOCB_DIRECT)
		ret = xfs_file_dio_read(iocb, to);
	else
		ret = xfs_file_buffered_read(iocb, to);

	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

STATIC ssize_t
xfs_file_splice_read(
	struct file		*in,
	loff_t			*ppos,
	struct pipe_inode_info	*pipe,
	size_t			len,
	unsigned int		flags)
{
	struct inode		*inode = file_inode(in);
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	trace_xfs_file_splice_read(ip, *ppos, len);

	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	ret = filemap_splice_read(in, ppos, pipe, len, flags);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

/*
 * Take care of zeroing post-EOF blocks when they might exist.
 *
 * Returns 0 if successful, a negative error for a failure, or 1 if this
 * function dropped the iolock and reacquired it exclusively and the caller
 * needs to restart the write sanity checks.
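 *
 * A caller is expected to treat the return value roughly like the sketch
 * below; this mirrors xfs_file_write_checks() further down and is not an
 * additional requirement:
 *
 *	error = xfs_file_write_zero_eof(iocb, from, iolock, count,
 *			&drained_dio, ac);
 *	if (error == 1)
 *		goto restart;	/* lock state changed, redo the checks */
 *	if (error)
 *		return error;	/* negative errno */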
 */
static ssize_t
xfs_file_write_zero_eof(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock,
	size_t			count,
	bool			*drained_dio,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	loff_t			isize;
	int			error;

	/*
	 * We need to serialise against EOF updates that occur in IO completions
	 * here. We want to make sure that nobody is changing the size while
	 * we do this check until we have placed an IO barrier (i.e. hold
	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
	 * spinlock effectively forms a memory barrier once we have
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
	 * hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	isize = i_size_read(VFS_I(ip));
	if (iocb->ki_pos <= isize) {
		spin_unlock(&ip->i_flags_lock);
		return 0;
	}
	spin_unlock(&ip->i_flags_lock);

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EAGAIN;

	if (!*drained_dio) {
		/*
		 * If zeroing is needed and we are currently holding the iolock
		 * shared, we need to update it to exclusive which implies
		 * having to redo all checks before.
		 */
		if (*iolock == XFS_IOLOCK_SHARED) {
			xfs_iunlock(ip, *iolock);
			*iolock = XFS_IOLOCK_EXCL;
			xfs_ilock(ip, *iolock);
			iov_iter_reexpand(from, count);
		}

		/*
		 * We now have an IO submission barrier in place, but AIO can do
		 * EOF updates during IO completion and hence we now need to
		 * wait for all of them to drain.  Non-AIO DIO will have drained
		 * before we are given the XFS_IOLOCK_EXCL, and so for most
		 * cases this wait is a no-op.
		 */
		inode_dio_wait(VFS_I(ip));
		*drained_dio = true;
		return 1;
	}

	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);

	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);

	return error;
}

/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;
	ssize_t			error;

restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		error = break_layout(inode, false);
		if (error == -EWOULDBLOCK)
			error = -EAGAIN;
	} else {
		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
	}

	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
460 */ 461 if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { 462 xfs_iunlock(XFS_I(inode), *iolock); 463 *iolock = XFS_IOLOCK_EXCL; 464 error = xfs_ilock_iocb(iocb, *iolock); 465 if (error) { 466 *iolock = 0; 467 return error; 468 } 469 goto restart; 470 } 471 472 /* 473 * If the offset is beyond the size of the file, we need to zero all 474 * blocks that fall between the existing EOF and the start of this 475 * write. 476 * 477 * We can do an unlocked check for i_size here safely as I/O completion 478 * can only extend EOF. Truncate is locked out at this point, so the 479 * EOF can not move backwards, only forwards. Hence we only need to take 480 * the slow path when we are at or beyond the current EOF. 481 */ 482 if (iocb->ki_pos > i_size_read(inode)) { 483 error = xfs_file_write_zero_eof(iocb, from, iolock, count, 484 &drained_dio, ac); 485 if (error == 1) 486 goto restart; 487 if (error) 488 return error; 489 } 490 491 return kiocb_modified(iocb); 492 } 493 494 static ssize_t 495 xfs_zoned_write_space_reserve( 496 struct xfs_mount *mp, 497 struct kiocb *iocb, 498 struct iov_iter *from, 499 unsigned int flags, 500 struct xfs_zone_alloc_ctx *ac) 501 { 502 loff_t count = iov_iter_count(from); 503 int error; 504 505 if (iocb->ki_flags & IOCB_NOWAIT) 506 flags |= XFS_ZR_NOWAIT; 507 508 /* 509 * Check the rlimit and LFS boundary first so that we don't over-reserve 510 * by possibly a lot. 511 * 512 * The generic write path will redo this check later, and it might have 513 * changed by then. If it got expanded we'll stick to our earlier 514 * smaller limit, and if it is decreased the new smaller limit will be 515 * used and our extra space reservation will be returned after finishing 516 * the write. 517 */ 518 error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count); 519 if (error) 520 return error; 521 522 /* 523 * Sloppily round up count to file system blocks. 524 * 525 * This will often reserve an extra block, but that avoids having to look 526 * at the start offset, which isn't stable for O_APPEND until taking the 527 * iolock. Also we need to reserve a block each for zeroing the old 528 * EOF block and the new start block if they are unaligned. 529 * 530 * Any remaining block will be returned after the write. 531 */ 532 return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2, 533 flags, ac); 534 } 535 536 static int 537 xfs_dio_write_end_io( 538 struct kiocb *iocb, 539 ssize_t size, 540 int error, 541 unsigned flags) 542 { 543 struct inode *inode = file_inode(iocb->ki_filp); 544 struct xfs_inode *ip = XFS_I(inode); 545 loff_t offset = iocb->ki_pos; 546 unsigned int nofs_flag; 547 548 ASSERT(!xfs_is_zoned_inode(ip) || 549 !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); 550 551 trace_xfs_end_io_direct_write(ip, offset, size); 552 553 if (xfs_is_shutdown(ip->i_mount)) 554 return -EIO; 555 556 if (error) 557 return error; 558 if (!size) 559 return 0; 560 561 /* 562 * Capture amount written on completion as we can't reliably account 563 * for it on submission. 564 */ 565 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size); 566 567 /* 568 * We can allocate memory here while doing writeback on behalf of 569 * memory reclaim. To avoid memory allocation deadlocks set the 570 * task-wide nofs context for the following operations. 
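	 *
	 * The scoped NOFS pattern used below is the usual one, sketched here
	 * for reference; it matches the save/restore pair in this function:
	 *
	 *	nofs_flag = memalloc_nofs_save();
	 *	... allocations in this section behave as if GFP_NOFS ...
	 *	memalloc_nofs_restore(nofs_flag);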
571 */ 572 nofs_flag = memalloc_nofs_save(); 573 574 if (flags & IOMAP_DIO_COW) { 575 if (iocb->ki_flags & IOCB_ATOMIC) 576 error = xfs_reflink_end_atomic_cow(ip, offset, size); 577 else 578 error = xfs_reflink_end_cow(ip, offset, size); 579 if (error) 580 goto out; 581 } 582 583 /* 584 * Unwritten conversion updates the in-core isize after extent 585 * conversion but before updating the on-disk size. Updating isize any 586 * earlier allows a racing dio read to find unwritten extents before 587 * they are converted. 588 */ 589 if (flags & IOMAP_DIO_UNWRITTEN) { 590 error = xfs_iomap_write_unwritten(ip, offset, size, true); 591 goto out; 592 } 593 594 /* 595 * We need to update the in-core inode size here so that we don't end up 596 * with the on-disk inode size being outside the in-core inode size. We 597 * have no other method of updating EOF for AIO, so always do it here 598 * if necessary. 599 * 600 * We need to lock the test/set EOF update as we can be racing with 601 * other IO completions here to update the EOF. Failing to serialise 602 * here can result in EOF moving backwards and Bad Things Happen when 603 * that occurs. 604 * 605 * As IO completion only ever extends EOF, we can do an unlocked check 606 * here to avoid taking the spinlock. If we land within the current EOF, 607 * then we do not need to do an extending update at all, and we don't 608 * need to take the lock to check this. If we race with an update moving 609 * EOF, then we'll either still be beyond EOF and need to take the lock, 610 * or we'll be within EOF and we don't need to take it at all. 611 */ 612 if (offset + size <= i_size_read(inode)) 613 goto out; 614 615 spin_lock(&ip->i_flags_lock); 616 if (offset + size > i_size_read(inode)) { 617 i_size_write(inode, offset + size); 618 spin_unlock(&ip->i_flags_lock); 619 error = xfs_setfilesize(ip, offset, size); 620 } else { 621 spin_unlock(&ip->i_flags_lock); 622 } 623 624 out: 625 memalloc_nofs_restore(nofs_flag); 626 return error; 627 } 628 629 static const struct iomap_dio_ops xfs_dio_write_ops = { 630 .end_io = xfs_dio_write_end_io, 631 }; 632 633 static void 634 xfs_dio_zoned_submit_io( 635 const struct iomap_iter *iter, 636 struct bio *bio, 637 loff_t file_offset) 638 { 639 struct xfs_mount *mp = XFS_I(iter->inode)->i_mount; 640 struct xfs_zone_alloc_ctx *ac = iter->private; 641 xfs_filblks_t count_fsb; 642 struct iomap_ioend *ioend; 643 644 count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size); 645 if (count_fsb > ac->reserved_blocks) { 646 xfs_err(mp, 647 "allocation (%lld) larger than reservation (%lld).", 648 count_fsb, ac->reserved_blocks); 649 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 650 bio_io_error(bio); 651 return; 652 } 653 ac->reserved_blocks -= count_fsb; 654 655 bio->bi_end_io = xfs_end_bio; 656 ioend = iomap_init_ioend(iter->inode, bio, file_offset, 657 IOMAP_IOEND_DIRECT); 658 xfs_zone_alloc_and_submit(ioend, &ac->open_zone); 659 } 660 661 static const struct iomap_dio_ops xfs_dio_zoned_write_ops = { 662 .bio_set = &iomap_ioend_bioset, 663 .submit_io = xfs_dio_zoned_submit_io, 664 .end_io = xfs_dio_write_end_io, 665 }; 666 667 /* 668 * Handle block aligned direct I/O writes. 
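 *
 * "Block aligned" here means aligned to the file system block size, i.e. the
 * caller has already verified something along the lines of (see
 * xfs_file_dio_write() below):
 *
 *	if (!((iocb->ki_pos | iov_iter_count(from)) & mp->m_blockmask))
 *		... this aligned path is used ...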
669 */ 670 static noinline ssize_t 671 xfs_file_dio_write_aligned( 672 struct xfs_inode *ip, 673 struct kiocb *iocb, 674 struct iov_iter *from, 675 const struct iomap_ops *ops, 676 const struct iomap_dio_ops *dops, 677 struct xfs_zone_alloc_ctx *ac) 678 { 679 unsigned int iolock = XFS_IOLOCK_SHARED; 680 unsigned int dio_flags = 0; 681 ssize_t ret; 682 683 /* 684 * For always COW inodes, each bio must be aligned to the file system 685 * block size and not just the device sector size because we need to 686 * allocate a block-aligned amount of space for each write. 687 */ 688 if (xfs_is_always_cow_inode(ip)) 689 dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED; 690 691 ret = xfs_ilock_iocb_for_write(iocb, &iolock); 692 if (ret) 693 return ret; 694 ret = xfs_file_write_checks(iocb, from, &iolock, ac); 695 if (ret) 696 goto out_unlock; 697 698 /* 699 * We don't need to hold the IOLOCK exclusively across the IO, so demote 700 * the iolock back to shared if we had to take the exclusive lock in 701 * xfs_file_write_checks() for other reasons. 702 */ 703 if (iolock == XFS_IOLOCK_EXCL) { 704 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 705 iolock = XFS_IOLOCK_SHARED; 706 } 707 trace_xfs_file_direct_write(iocb, from); 708 ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0); 709 out_unlock: 710 xfs_iunlock(ip, iolock); 711 return ret; 712 } 713 714 /* 715 * Handle block aligned direct I/O writes to zoned devices. 716 */ 717 static noinline ssize_t 718 xfs_file_dio_write_zoned( 719 struct xfs_inode *ip, 720 struct kiocb *iocb, 721 struct iov_iter *from) 722 { 723 struct xfs_zone_alloc_ctx ac = { }; 724 ssize_t ret; 725 726 ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac); 727 if (ret < 0) 728 return ret; 729 ret = xfs_file_dio_write_aligned(ip, iocb, from, 730 &xfs_zoned_direct_write_iomap_ops, 731 &xfs_dio_zoned_write_ops, &ac); 732 xfs_zoned_space_unreserve(ip->i_mount, &ac); 733 return ret; 734 } 735 736 /* 737 * Handle block atomic writes 738 * 739 * Two methods of atomic writes are supported: 740 * - REQ_ATOMIC-based, which would typically use some form of HW offload in the 741 * disk 742 * - COW-based, which uses a COW fork as a staging extent for data updates 743 * before atomically updating extent mappings for the range being written 744 * 745 */ 746 static noinline ssize_t 747 xfs_file_dio_write_atomic( 748 struct xfs_inode *ip, 749 struct kiocb *iocb, 750 struct iov_iter *from) 751 { 752 unsigned int iolock = XFS_IOLOCK_SHARED; 753 ssize_t ret, ocount = iov_iter_count(from); 754 const struct iomap_ops *dops; 755 756 /* 757 * HW offload should be faster, so try that first if it is already 758 * known that the write length is not too large. 759 */ 760 if (ocount > xfs_inode_buftarg(ip)->bt_awu_max) 761 dops = &xfs_atomic_write_cow_iomap_ops; 762 else 763 dops = &xfs_direct_write_iomap_ops; 764 765 retry: 766 ret = xfs_ilock_iocb_for_write(iocb, &iolock); 767 if (ret) 768 return ret; 769 770 ret = xfs_file_write_checks(iocb, from, &iolock, NULL); 771 if (ret) 772 goto out_unlock; 773 774 /* Demote similar to xfs_file_dio_write_aligned() */ 775 if (iolock == XFS_IOLOCK_EXCL) { 776 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 777 iolock = XFS_IOLOCK_SHARED; 778 } 779 780 trace_xfs_file_direct_write(iocb, from); 781 ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, 782 0, NULL, 0); 783 784 /* 785 * The retry mechanism is based on the ->iomap_begin method returning 786 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not 787 * possible. 
	 * The REQ_ATOMIC-based method will typically not be possible if
	 * the write spans multiple extents or the disk blocks are misaligned.
	 */
	if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
		xfs_iunlock(ip, iolock);
		dops = &xfs_atomic_write_cow_iomap_ops;
		goto retry;
	}

out_unlock:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}

/*
 * Handle block unaligned direct I/O writes
 *
 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
 * them to be done in parallel with reads and other direct I/O writes. However,
 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
 * to do sub-block zeroing and that requires serialisation against other direct
 * I/O to the same block.  In this case we need to serialise the submission of
 * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
 * In the case where sub-block zeroing is not required, we can do concurrent
 * sub-block dios to the same block successfully.
 *
 * Optimistically submit the I/O using the shared lock first, but use the
 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
 * if block allocation or partial block zeroing would be required.  In that case
 * we try again with the exclusive lock.
 */
static noinline ssize_t
xfs_file_dio_write_unaligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	size_t			isize = i_size_read(VFS_I(ip));
	size_t			count = iov_iter_count(from);
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
	ssize_t			ret;

	/*
	 * Extending writes need exclusivity because of the sub-block zeroing
	 * that the DIO code always does for partial tail blocks beyond EOF, so
	 * don't even bother trying the fast path in this case.
	 */
	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
retry_exclusive:
		iolock = XFS_IOLOCK_EXCL;
		flags = IOMAP_DIO_FORCE_WAIT;
	}

	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;

	/*
	 * We can't properly handle unaligned direct I/O to reflink files yet,
	 * as we can't unshare a partial block.
	 */
	if (xfs_is_cow_inode(ip)) {
		trace_xfs_reflink_bounce_dio_write(iocb, from);
		ret = -ENOTBLK;
		goto out_unlock;
	}

	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out_unlock;

	/*
	 * If we are doing exclusive unaligned I/O, this must be the only I/O
	 * in-flight.  Otherwise we risk data corruption due to unwritten extent
	 * conversions from the AIO end_io handler.  Wait for all other I/O to
	 * drain first.
	 */
	if (flags & IOMAP_DIO_FORCE_WAIT)
		inode_dio_wait(VFS_I(ip));

	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			&xfs_dio_write_ops, flags, NULL, 0);

	/*
	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
	 * layer rejected it for mapping or locking reasons.  If we are doing
	 * nonblocking user I/O, propagate the error.
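	 *
	 * In other words, the overall protocol is roughly:
	 *
	 *	shared iolock + IOMAP_DIO_OVERWRITE_ONLY
	 *		-> -EAGAIN from the DIO layer
	 *	exclusive iolock + IOMAP_DIO_FORCE_WAIT + inode_dio_wait()
	 *		-> submit again
	 *
	 * (an illustrative summary of the retry_exclusive path above, not a
	 * separate mechanism).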
879 */ 880 if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) { 881 ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY); 882 xfs_iunlock(ip, iolock); 883 goto retry_exclusive; 884 } 885 886 out_unlock: 887 if (iolock) 888 xfs_iunlock(ip, iolock); 889 return ret; 890 } 891 892 static ssize_t 893 xfs_file_dio_write( 894 struct kiocb *iocb, 895 struct iov_iter *from) 896 { 897 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); 898 struct xfs_buftarg *target = xfs_inode_buftarg(ip); 899 size_t count = iov_iter_count(from); 900 901 /* direct I/O must be aligned to device logical sector size */ 902 if ((iocb->ki_pos | count) & target->bt_logical_sectormask) 903 return -EINVAL; 904 905 if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) 906 return xfs_file_dio_write_unaligned(ip, iocb, from); 907 if (xfs_is_zoned_inode(ip)) 908 return xfs_file_dio_write_zoned(ip, iocb, from); 909 if (iocb->ki_flags & IOCB_ATOMIC) 910 return xfs_file_dio_write_atomic(ip, iocb, from); 911 return xfs_file_dio_write_aligned(ip, iocb, from, 912 &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL); 913 } 914 915 static noinline ssize_t 916 xfs_file_dax_write( 917 struct kiocb *iocb, 918 struct iov_iter *from) 919 { 920 struct inode *inode = iocb->ki_filp->f_mapping->host; 921 struct xfs_inode *ip = XFS_I(inode); 922 unsigned int iolock = XFS_IOLOCK_EXCL; 923 ssize_t ret, error = 0; 924 loff_t pos; 925 926 ret = xfs_ilock_iocb(iocb, iolock); 927 if (ret) 928 return ret; 929 ret = xfs_file_write_checks(iocb, from, &iolock, NULL); 930 if (ret) 931 goto out; 932 933 pos = iocb->ki_pos; 934 935 trace_xfs_file_dax_write(iocb, from); 936 ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops); 937 if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { 938 i_size_write(inode, iocb->ki_pos); 939 error = xfs_setfilesize(ip, pos, ret); 940 } 941 out: 942 if (iolock) 943 xfs_iunlock(ip, iolock); 944 if (error) 945 return error; 946 947 if (ret > 0) { 948 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); 949 950 /* Handle various SYNC-type writes */ 951 ret = generic_write_sync(iocb, ret); 952 } 953 return ret; 954 } 955 956 STATIC ssize_t 957 xfs_file_buffered_write( 958 struct kiocb *iocb, 959 struct iov_iter *from) 960 { 961 struct inode *inode = iocb->ki_filp->f_mapping->host; 962 struct xfs_inode *ip = XFS_I(inode); 963 ssize_t ret; 964 bool cleared_space = false; 965 unsigned int iolock; 966 967 write_retry: 968 iolock = XFS_IOLOCK_EXCL; 969 ret = xfs_ilock_iocb(iocb, iolock); 970 if (ret) 971 return ret; 972 973 ret = xfs_file_write_checks(iocb, from, &iolock, NULL); 974 if (ret) 975 goto out; 976 977 trace_xfs_file_buffered_write(iocb, from); 978 ret = iomap_file_buffered_write(iocb, from, 979 &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, 980 NULL); 981 982 /* 983 * If we hit a space limit, try to free up some lingering preallocated 984 * space before returning an error. In the case of ENOSPC, first try to 985 * write back all dirty inodes to free up some of the excess reserved 986 * metadata space. This reduces the chances that the eofblocks scan 987 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this 988 * also behaves as a filter to prevent too many eofblocks scans from 989 * running at the same time. Use a synchronous scan to increase the 990 * effectiveness of the scan. 
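	 *
	 * Schematically, the retry below is a single-shot fallback, roughly:
	 *
	 *	-EDQUOT: xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC)
	 *	-ENOSPC: xfs_flush_inodes() + xfs_blockgc_free_space()
	 *
	 * then the write is retried once; the cleared_space flag prevents a
	 * second retry.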
991 */ 992 if (ret == -EDQUOT && !cleared_space) { 993 xfs_iunlock(ip, iolock); 994 xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC); 995 cleared_space = true; 996 goto write_retry; 997 } else if (ret == -ENOSPC && !cleared_space) { 998 struct xfs_icwalk icw = {0}; 999 1000 cleared_space = true; 1001 xfs_flush_inodes(ip->i_mount); 1002 1003 xfs_iunlock(ip, iolock); 1004 icw.icw_flags = XFS_ICWALK_FLAG_SYNC; 1005 xfs_blockgc_free_space(ip->i_mount, &icw); 1006 goto write_retry; 1007 } 1008 1009 out: 1010 if (iolock) 1011 xfs_iunlock(ip, iolock); 1012 1013 if (ret > 0) { 1014 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); 1015 /* Handle various SYNC-type writes */ 1016 ret = generic_write_sync(iocb, ret); 1017 } 1018 return ret; 1019 } 1020 1021 STATIC ssize_t 1022 xfs_file_buffered_write_zoned( 1023 struct kiocb *iocb, 1024 struct iov_iter *from) 1025 { 1026 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); 1027 struct xfs_mount *mp = ip->i_mount; 1028 unsigned int iolock = XFS_IOLOCK_EXCL; 1029 bool cleared_space = false; 1030 struct xfs_zone_alloc_ctx ac = { }; 1031 ssize_t ret; 1032 1033 ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac); 1034 if (ret < 0) 1035 return ret; 1036 1037 ret = xfs_ilock_iocb(iocb, iolock); 1038 if (ret) 1039 goto out_unreserve; 1040 1041 ret = xfs_file_write_checks(iocb, from, &iolock, &ac); 1042 if (ret) 1043 goto out_unlock; 1044 1045 /* 1046 * Truncate the iter to the length that we were actually able to 1047 * allocate blocks for. This needs to happen after 1048 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND 1049 * writes. 1050 */ 1051 iov_iter_truncate(from, 1052 XFS_FSB_TO_B(mp, ac.reserved_blocks) - 1053 (iocb->ki_pos & mp->m_blockmask)); 1054 if (!iov_iter_count(from)) 1055 goto out_unlock; 1056 1057 retry: 1058 trace_xfs_file_buffered_write(iocb, from); 1059 ret = iomap_file_buffered_write(iocb, from, 1060 &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, 1061 &ac); 1062 if (ret == -ENOSPC && !cleared_space) { 1063 /* 1064 * Kick off writeback to convert delalloc space and release the 1065 * usually too pessimistic indirect block reservations. 1066 */ 1067 xfs_flush_inodes(mp); 1068 cleared_space = true; 1069 goto retry; 1070 } 1071 1072 out_unlock: 1073 xfs_iunlock(ip, iolock); 1074 out_unreserve: 1075 xfs_zoned_space_unreserve(ip->i_mount, &ac); 1076 if (ret > 0) { 1077 XFS_STATS_ADD(mp, xs_write_bytes, ret); 1078 ret = generic_write_sync(iocb, ret); 1079 } 1080 return ret; 1081 } 1082 1083 STATIC ssize_t 1084 xfs_file_write_iter( 1085 struct kiocb *iocb, 1086 struct iov_iter *from) 1087 { 1088 struct inode *inode = iocb->ki_filp->f_mapping->host; 1089 struct xfs_inode *ip = XFS_I(inode); 1090 ssize_t ret; 1091 size_t ocount = iov_iter_count(from); 1092 1093 XFS_STATS_INC(ip->i_mount, xs_write_calls); 1094 1095 if (ocount == 0) 1096 return 0; 1097 1098 if (xfs_is_shutdown(ip->i_mount)) 1099 return -EIO; 1100 1101 if (iocb->ki_flags & IOCB_ATOMIC) { 1102 if (ocount < xfs_get_atomic_write_min(ip)) 1103 return -EINVAL; 1104 1105 if (ocount > xfs_get_atomic_write_max(ip)) 1106 return -EINVAL; 1107 1108 ret = generic_atomic_write_valid(iocb, from); 1109 if (ret) 1110 return ret; 1111 } 1112 1113 if (IS_DAX(inode)) 1114 return xfs_file_dax_write(iocb, from); 1115 1116 if (iocb->ki_flags & IOCB_DIRECT) { 1117 /* 1118 * Allow a directio write to fall back to a buffered 1119 * write *only* in the case that we're doing a reflink 1120 * CoW. 
In all other directio scenarios we do not 1121 * allow an operation to fall back to buffered mode. 1122 */ 1123 ret = xfs_file_dio_write(iocb, from); 1124 if (ret != -ENOTBLK) 1125 return ret; 1126 } 1127 1128 if (xfs_is_zoned_inode(ip)) 1129 return xfs_file_buffered_write_zoned(iocb, from); 1130 return xfs_file_buffered_write(iocb, from); 1131 } 1132 1133 /* Does this file, inode, or mount want synchronous writes? */ 1134 static inline bool xfs_file_sync_writes(struct file *filp) 1135 { 1136 struct xfs_inode *ip = XFS_I(file_inode(filp)); 1137 1138 if (xfs_has_wsync(ip->i_mount)) 1139 return true; 1140 if (filp->f_flags & (__O_SYNC | O_DSYNC)) 1141 return true; 1142 if (IS_SYNC(file_inode(filp))) 1143 return true; 1144 1145 return false; 1146 } 1147 1148 static int 1149 xfs_falloc_newsize( 1150 struct file *file, 1151 int mode, 1152 loff_t offset, 1153 loff_t len, 1154 loff_t *new_size) 1155 { 1156 struct inode *inode = file_inode(file); 1157 1158 if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode)) 1159 return 0; 1160 *new_size = offset + len; 1161 return inode_newsize_ok(inode, *new_size); 1162 } 1163 1164 static int 1165 xfs_falloc_setsize( 1166 struct file *file, 1167 loff_t new_size) 1168 { 1169 struct iattr iattr = { 1170 .ia_valid = ATTR_SIZE, 1171 .ia_size = new_size, 1172 }; 1173 1174 if (!new_size) 1175 return 0; 1176 return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file), 1177 &iattr); 1178 } 1179 1180 static int 1181 xfs_falloc_collapse_range( 1182 struct file *file, 1183 loff_t offset, 1184 loff_t len, 1185 struct xfs_zone_alloc_ctx *ac) 1186 { 1187 struct inode *inode = file_inode(file); 1188 loff_t new_size = i_size_read(inode) - len; 1189 int error; 1190 1191 if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) 1192 return -EINVAL; 1193 1194 /* 1195 * There is no need to overlap collapse range with EOF, in which case it 1196 * is effectively a truncate operation 1197 */ 1198 if (offset + len >= i_size_read(inode)) 1199 return -EINVAL; 1200 1201 error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac); 1202 if (error) 1203 return error; 1204 return xfs_falloc_setsize(file, new_size); 1205 } 1206 1207 static int 1208 xfs_falloc_insert_range( 1209 struct file *file, 1210 loff_t offset, 1211 loff_t len) 1212 { 1213 struct inode *inode = file_inode(file); 1214 loff_t isize = i_size_read(inode); 1215 int error; 1216 1217 if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) 1218 return -EINVAL; 1219 1220 /* 1221 * New inode size must not exceed ->s_maxbytes, accounting for 1222 * possible signed overflow. 1223 */ 1224 if (inode->i_sb->s_maxbytes - isize < len) 1225 return -EFBIG; 1226 1227 /* Offset should be less than i_size */ 1228 if (offset >= isize) 1229 return -EINVAL; 1230 1231 error = xfs_falloc_setsize(file, isize + len); 1232 if (error) 1233 return error; 1234 1235 /* 1236 * Perform hole insertion now that the file size has been updated so 1237 * that if we crash during the operation we don't leave shifted extents 1238 * past EOF and hence losing access to the data that is contained within 1239 * them. 1240 */ 1241 return xfs_insert_file_space(XFS_I(inode), offset, len); 1242 } 1243 1244 /* 1245 * For various operations we need to zero up to one block at each end of 1246 * the affected range. For zoned file systems this will require a space 1247 * allocation, for which we need a reservation ahead of time. 
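 *
 * As a worked example (illustrative numbers only): punching bytes 1000..9000
 * out of a file on a 4096 byte block file system leaves a partial block at
 * each end of the range.  On a zoned device each of those partial blocks has
 * to be rewritten out of place with the remaining data plus new zeroes, so up
 * to two blocks of space may be consumed - hence the reservation of 2 below.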
1248 */ 1249 #define XFS_ZONED_ZERO_EDGE_SPACE_RES 2 1250 1251 /* 1252 * Zero range implements a full zeroing mechanism but is only used in limited 1253 * situations. It is more efficient to allocate unwritten extents than to 1254 * perform zeroing here, so use an errortag to randomly force zeroing on DEBUG 1255 * kernels for added test coverage. 1256 * 1257 * On zoned file systems, the error is already injected by 1258 * xfs_file_zoned_fallocate, which then reserves the additional space needed. 1259 * We only check for this extra space reservation here. 1260 */ 1261 static inline bool 1262 xfs_falloc_force_zero( 1263 struct xfs_inode *ip, 1264 struct xfs_zone_alloc_ctx *ac) 1265 { 1266 if (xfs_is_zoned_inode(ip)) { 1267 if (ac->reserved_blocks > XFS_ZONED_ZERO_EDGE_SPACE_RES) { 1268 ASSERT(IS_ENABLED(CONFIG_XFS_DEBUG)); 1269 return true; 1270 } 1271 return false; 1272 } 1273 return XFS_TEST_ERROR(ip->i_mount, XFS_ERRTAG_FORCE_ZERO_RANGE); 1274 } 1275 1276 /* 1277 * Punch a hole and prealloc the range. We use a hole punch rather than 1278 * unwritten extent conversion for two reasons: 1279 * 1280 * 1.) Hole punch handles partial block zeroing for us. 1281 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by 1282 * virtue of the hole punch. 1283 */ 1284 static int 1285 xfs_falloc_zero_range( 1286 struct file *file, 1287 int mode, 1288 loff_t offset, 1289 loff_t len, 1290 struct xfs_zone_alloc_ctx *ac) 1291 { 1292 struct inode *inode = file_inode(file); 1293 struct xfs_inode *ip = XFS_I(inode); 1294 unsigned int blksize = i_blocksize(inode); 1295 loff_t new_size = 0; 1296 int error; 1297 1298 trace_xfs_zero_file_space(ip); 1299 1300 error = xfs_falloc_newsize(file, mode, offset, len, &new_size); 1301 if (error) 1302 return error; 1303 1304 if (xfs_falloc_force_zero(ip, ac)) { 1305 error = xfs_zero_range(ip, offset, len, ac, NULL); 1306 } else { 1307 error = xfs_free_file_space(ip, offset, len, ac); 1308 if (error) 1309 return error; 1310 1311 len = round_up(offset + len, blksize) - 1312 round_down(offset, blksize); 1313 offset = round_down(offset, blksize); 1314 error = xfs_alloc_file_space(ip, offset, len); 1315 } 1316 if (error) 1317 return error; 1318 return xfs_falloc_setsize(file, new_size); 1319 } 1320 1321 static int 1322 xfs_falloc_unshare_range( 1323 struct file *file, 1324 int mode, 1325 loff_t offset, 1326 loff_t len) 1327 { 1328 struct inode *inode = file_inode(file); 1329 loff_t new_size = 0; 1330 int error; 1331 1332 error = xfs_falloc_newsize(file, mode, offset, len, &new_size); 1333 if (error) 1334 return error; 1335 1336 error = xfs_reflink_unshare(XFS_I(inode), offset, len); 1337 if (error) 1338 return error; 1339 1340 error = xfs_alloc_file_space(XFS_I(inode), offset, len); 1341 if (error) 1342 return error; 1343 return xfs_falloc_setsize(file, new_size); 1344 } 1345 1346 static int 1347 xfs_falloc_allocate_range( 1348 struct file *file, 1349 int mode, 1350 loff_t offset, 1351 loff_t len) 1352 { 1353 struct inode *inode = file_inode(file); 1354 loff_t new_size = 0; 1355 int error; 1356 1357 /* 1358 * If always_cow mode we can't use preallocations and thus should not 1359 * create them. 
1360 */ 1361 if (xfs_is_always_cow_inode(XFS_I(inode))) 1362 return -EOPNOTSUPP; 1363 1364 error = xfs_falloc_newsize(file, mode, offset, len, &new_size); 1365 if (error) 1366 return error; 1367 1368 error = xfs_alloc_file_space(XFS_I(inode), offset, len); 1369 if (error) 1370 return error; 1371 return xfs_falloc_setsize(file, new_size); 1372 } 1373 1374 #define XFS_FALLOC_FL_SUPPORTED \ 1375 (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \ 1376 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \ 1377 FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \ 1378 FALLOC_FL_UNSHARE_RANGE) 1379 1380 STATIC long 1381 __xfs_file_fallocate( 1382 struct file *file, 1383 int mode, 1384 loff_t offset, 1385 loff_t len, 1386 struct xfs_zone_alloc_ctx *ac) 1387 { 1388 struct inode *inode = file_inode(file); 1389 struct xfs_inode *ip = XFS_I(inode); 1390 long error; 1391 uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; 1392 1393 xfs_ilock(ip, iolock); 1394 error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); 1395 if (error) 1396 goto out_unlock; 1397 1398 /* 1399 * Must wait for all AIO to complete before we continue as AIO can 1400 * change the file size on completion without holding any locks we 1401 * currently hold. We must do this first because AIO can update both 1402 * the on disk and in memory inode sizes, and the operations that follow 1403 * require the in-memory size to be fully up-to-date. 1404 */ 1405 inode_dio_wait(inode); 1406 1407 error = file_modified(file); 1408 if (error) 1409 goto out_unlock; 1410 1411 switch (mode & FALLOC_FL_MODE_MASK) { 1412 case FALLOC_FL_PUNCH_HOLE: 1413 error = xfs_free_file_space(ip, offset, len, ac); 1414 break; 1415 case FALLOC_FL_COLLAPSE_RANGE: 1416 error = xfs_falloc_collapse_range(file, offset, len, ac); 1417 break; 1418 case FALLOC_FL_INSERT_RANGE: 1419 error = xfs_falloc_insert_range(file, offset, len); 1420 break; 1421 case FALLOC_FL_ZERO_RANGE: 1422 error = xfs_falloc_zero_range(file, mode, offset, len, ac); 1423 break; 1424 case FALLOC_FL_UNSHARE_RANGE: 1425 error = xfs_falloc_unshare_range(file, mode, offset, len); 1426 break; 1427 case FALLOC_FL_ALLOCATE_RANGE: 1428 error = xfs_falloc_allocate_range(file, mode, offset, len); 1429 break; 1430 default: 1431 error = -EOPNOTSUPP; 1432 break; 1433 } 1434 1435 if (!error && xfs_file_sync_writes(file)) 1436 error = xfs_log_force_inode(ip); 1437 1438 out_unlock: 1439 xfs_iunlock(ip, iolock); 1440 return error; 1441 } 1442 1443 static long 1444 xfs_file_zoned_fallocate( 1445 struct file *file, 1446 int mode, 1447 loff_t offset, 1448 loff_t len) 1449 { 1450 struct xfs_zone_alloc_ctx ac = { }; 1451 struct xfs_inode *ip = XFS_I(file_inode(file)); 1452 struct xfs_mount *mp = ip->i_mount; 1453 xfs_filblks_t count_fsb; 1454 int error; 1455 1456 /* 1457 * If full zeroing is forced by the error injection knob, we need a 1458 * space reservation that covers the entire range. See the comment in 1459 * xfs_zoned_write_space_reserve for the rationale for the calculation. 1460 * Otherwise just reserve space for the two boundary blocks. 
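	 *
	 * As an illustration (example numbers, not taken from the code): on a
	 * 4096 byte block file system, a forced-zeroing FALLOC_FL_ZERO_RANGE
	 * of a 1 MiB range reserves 2 + 256 + 1 = 259 blocks, while the normal
	 * case only reserves the 2 edge blocks.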
	 */
	count_fsb = XFS_ZONED_ZERO_EDGE_SPACE_RES;
	if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ZERO_RANGE &&
	    XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_ZERO_RANGE))
		count_fsb += XFS_B_TO_FSB(mp, len) + 1;

	error = xfs_zoned_space_reserve(mp, count_fsb, XFS_ZR_RESERVED, &ac);
	if (error)
		return error;
	error = __xfs_file_fallocate(file, mode, offset, len, &ac);
	xfs_zoned_space_unreserve(mp, &ac);
	return error;
}

static long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	/*
	 * For zoned file systems, zeroing the first and last block of a hole
	 * punch requires allocating a new block to rewrite the remaining data
	 * and new zeroes out of place.  Get a reservation for those before
	 * taking the iolock.  Dip into the reserved pool because we are
	 * expected to be able to punch a hole even on a completely full
	 * file system.
	 */
	if (xfs_is_zoned_inode(XFS_I(inode)) &&
	    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
		     FALLOC_FL_COLLAPSE_RANGE)))
		return xfs_file_zoned_fallocate(file, mode, offset, len);
	return __xfs_file_fallocate(file, mode, offset, len, NULL);
}

STATIC int
xfs_file_fadvise(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			advice)
{
	struct xfs_inode	*ip = XFS_I(file_inode(file));
	int			ret;
	int			lockflags = 0;

	/*
	 * Operations creating pages in page cache need protection from hole
	 * punching and similar ops
	 */
	if (advice == POSIX_FADV_WILLNEED) {
		lockflags = XFS_IOLOCK_SHARED;
		xfs_ilock(ip, lockflags);
	}
	ret = generic_fadvise(file, start, end, advice);
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	return ret;
}

STATIC loff_t
xfs_file_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	loff_t			remapped = 0;
	xfs_extlen_t		cowextsize;
	int			ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (!xfs_has_reflink(mp))
		return -EOPNOTSUPP;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* Prepare and then clone file data. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, remap_flags);
	if (ret || len == 0)
		return ret;

	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
			&remapped);
	if (ret)
		goto out_unlock;

	/*
	 * Carry the cowextsize hint from src to dest if we're sharing the
	 * entire source file to the entire destination file, the source file
	 * has a cowextsize hint, and the destination file does not.
1573 */ 1574 cowextsize = 0; 1575 if (pos_in == 0 && len == i_size_read(inode_in) && 1576 (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && 1577 pos_out == 0 && len >= i_size_read(inode_out) && 1578 !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)) 1579 cowextsize = src->i_cowextsize; 1580 1581 ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, 1582 remap_flags); 1583 if (ret) 1584 goto out_unlock; 1585 1586 if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) 1587 xfs_log_force_inode(dest); 1588 out_unlock: 1589 xfs_iunlock2_remapping(src, dest); 1590 if (ret) 1591 trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); 1592 /* 1593 * If the caller did not set CAN_SHORTEN, then it is not prepared to 1594 * handle partial results -- either the whole remap succeeds, or we 1595 * must say why it did not. In this case, any error should be returned 1596 * to the caller. 1597 */ 1598 if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) 1599 return ret; 1600 return remapped > 0 ? remapped : ret; 1601 } 1602 1603 STATIC int 1604 xfs_file_open( 1605 struct inode *inode, 1606 struct file *file) 1607 { 1608 if (xfs_is_shutdown(XFS_M(inode->i_sb))) 1609 return -EIO; 1610 file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; 1611 if (xfs_get_atomic_write_min(XFS_I(inode)) > 0) 1612 file->f_mode |= FMODE_CAN_ATOMIC_WRITE; 1613 return generic_file_open(inode, file); 1614 } 1615 1616 STATIC int 1617 xfs_dir_open( 1618 struct inode *inode, 1619 struct file *file) 1620 { 1621 struct xfs_inode *ip = XFS_I(inode); 1622 unsigned int mode; 1623 int error; 1624 1625 if (xfs_is_shutdown(ip->i_mount)) 1626 return -EIO; 1627 error = generic_file_open(inode, file); 1628 if (error) 1629 return error; 1630 1631 /* 1632 * If there are any blocks, read-ahead block 0 as we're almost 1633 * certain to have the next operation be a read there. 1634 */ 1635 mode = xfs_ilock_data_map_shared(ip); 1636 if (ip->i_df.if_nextents > 0) 1637 error = xfs_dir3_data_readahead(ip, 0, 0); 1638 xfs_iunlock(ip, mode); 1639 return error; 1640 } 1641 1642 /* 1643 * Don't bother propagating errors. We're just doing cleanup, and the caller 1644 * ignores the return value anyway. 1645 */ 1646 STATIC int 1647 xfs_file_release( 1648 struct inode *inode, 1649 struct file *file) 1650 { 1651 struct xfs_inode *ip = XFS_I(inode); 1652 struct xfs_mount *mp = ip->i_mount; 1653 1654 /* 1655 * If this is a read-only mount or the file system has been shut down, 1656 * don't generate I/O. 1657 */ 1658 if (xfs_is_readonly(mp) || xfs_is_shutdown(mp)) 1659 return 0; 1660 1661 /* 1662 * If we previously truncated this file and removed old data in the 1663 * process, we want to initiate "early" writeout on the last close. 1664 * This is an attempt to combat the notorious NULL files problem which 1665 * is particularly noticeable from a truncate down, buffered (re-)write 1666 * (delalloc), followed by a crash. What we are effectively doing here 1667 * is significantly reducing the time window where we'd otherwise be 1668 * exposed to that problem. 1669 */ 1670 if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) { 1671 xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED); 1672 if (ip->i_delayed_blks > 0) 1673 filemap_flush(inode->i_mapping); 1674 } 1675 1676 /* 1677 * XFS aggressively preallocates post-EOF space to generate contiguous 1678 * allocations for writers that append to the end of the file. 
	 *
	 * To support workloads that close and reopen the file frequently, these
	 * preallocations usually persist after a close unless it is the first
	 * close for the inode.  This is a tradeoff to generate tightly packed
	 * data layouts for unpacking tarballs or similar archives that write
	 * one file after another without going back to it while keeping the
	 * preallocation for files that have recurring open/write/close cycles.
	 *
	 * This heuristic is skipped for inodes with the append-only flag as
	 * that flag is rather pointless for inodes written only once.
	 *
	 * There is no point in freeing blocks here for open but unlinked files
	 * as they will be taken care of by the inactivation path soon.
	 *
	 * When releasing a read-only context, don't flush data or trim post-EOF
	 * blocks.  This avoids open/read/close workloads from removing EOF
	 * blocks that other writers depend upon to reduce fragmentation.
	 *
	 * Inodes on the zoned RT device never have preallocations, so skip
	 * taking the locks below.
	 */
	if (!inode->i_nlink ||
	    !(file->f_mode & FMODE_WRITE) ||
	    (ip->i_diflags & XFS_DIFLAG_APPEND) ||
	    xfs_is_zoned_inode(ip))
		return 0;

	/*
	 * If we can't get the iolock just skip truncating the blocks past EOF
	 * because we could deadlock with the mmap_lock otherwise.  We'll get
	 * another chance to drop them once the last reference to the inode is
	 * dropped, so we'll never leak blocks permanently.
	 */
	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
	    xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (xfs_can_free_eofblocks(ip) &&
		    !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
			xfs_free_eofblocks(ip);
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	}

	return 0;
}

STATIC int
xfs_file_readdir(
	struct file		*file,
	struct dir_context	*ctx)
{
	struct inode		*inode = file_inode(file);
	xfs_inode_t		*ip = XFS_I(inode);
	size_t			bufsize;

	/*
	 * The Linux API doesn't pass the total size of the buffer
	 * we read into down to the filesystem.  With the filldir concept
	 * it's not needed for correct information, but the XFS dir2 leaf
	 * code wants an estimate of the buffer size to calculate its
	 * readahead window and size the buffers used for mapping to
	 * physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size.  For now we use the current glibc buffer size.
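	 *
	 * In effect the line below computes
	 *
	 *	bufsize = min(XFS_READDIR_BUFSIZE, ip->i_disk_size);
	 *
	 * so a single-block directory gets a one block buffer while large
	 * directories are capped at XFS_READDIR_BUFSIZE.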
1743 */ 1744 bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size); 1745 1746 return xfs_readdir(NULL, ip, ctx, bufsize); 1747 } 1748 1749 STATIC loff_t 1750 xfs_file_llseek( 1751 struct file *file, 1752 loff_t offset, 1753 int whence) 1754 { 1755 struct inode *inode = file->f_mapping->host; 1756 1757 if (xfs_is_shutdown(XFS_I(inode)->i_mount)) 1758 return -EIO; 1759 1760 switch (whence) { 1761 default: 1762 return generic_file_llseek(file, offset, whence); 1763 case SEEK_HOLE: 1764 offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops); 1765 break; 1766 case SEEK_DATA: 1767 offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops); 1768 break; 1769 } 1770 1771 if (offset < 0) 1772 return offset; 1773 return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 1774 } 1775 1776 static inline vm_fault_t 1777 xfs_dax_fault_locked( 1778 struct vm_fault *vmf, 1779 unsigned int order, 1780 bool write_fault) 1781 { 1782 vm_fault_t ret; 1783 unsigned long pfn; 1784 1785 if (!IS_ENABLED(CONFIG_FS_DAX)) { 1786 ASSERT(0); 1787 return VM_FAULT_SIGBUS; 1788 } 1789 ret = dax_iomap_fault(vmf, order, &pfn, NULL, 1790 (write_fault && !vmf->cow_page) ? 1791 &xfs_dax_write_iomap_ops : 1792 &xfs_read_iomap_ops); 1793 if (ret & VM_FAULT_NEEDDSYNC) 1794 ret = dax_finish_sync_fault(vmf, order, pfn); 1795 return ret; 1796 } 1797 1798 static vm_fault_t 1799 xfs_dax_read_fault( 1800 struct vm_fault *vmf, 1801 unsigned int order) 1802 { 1803 struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file)); 1804 vm_fault_t ret; 1805 1806 trace_xfs_read_fault(ip, order); 1807 1808 xfs_ilock(ip, XFS_MMAPLOCK_SHARED); 1809 ret = xfs_dax_fault_locked(vmf, order, false); 1810 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); 1811 1812 return ret; 1813 } 1814 1815 /* 1816 * Locking for serialisation of IO during page faults. This results in a lock 1817 * ordering of: 1818 * 1819 * mmap_lock (MM) 1820 * sb_start_pagefault(vfs, freeze) 1821 * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) 1822 * page_lock (MM) 1823 * i_lock (XFS - extent map serialisation) 1824 */ 1825 static vm_fault_t 1826 __xfs_write_fault( 1827 struct vm_fault *vmf, 1828 unsigned int order, 1829 struct xfs_zone_alloc_ctx *ac) 1830 { 1831 struct inode *inode = file_inode(vmf->vma->vm_file); 1832 struct xfs_inode *ip = XFS_I(inode); 1833 unsigned int lock_mode = XFS_MMAPLOCK_SHARED; 1834 vm_fault_t ret; 1835 1836 trace_xfs_write_fault(ip, order); 1837 1838 sb_start_pagefault(inode->i_sb); 1839 file_update_time(vmf->vma->vm_file); 1840 1841 /* 1842 * Normally we only need the shared mmaplock, but if a reflink remap is 1843 * in progress we take the exclusive lock to wait for the remap to 1844 * finish before taking a write fault. 
	 */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
		lock_mode = XFS_MMAPLOCK_EXCL;
	}

	if (IS_DAX(inode))
		ret = xfs_dax_fault_locked(vmf, order, true);
	else
		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
				ac);
	xfs_iunlock(ip, lock_mode);

	sb_end_pagefault(inode->i_sb);
	return ret;
}

static vm_fault_t
xfs_write_fault_zoned(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
	unsigned int		len = folio_size(page_folio(vmf->page));
	struct xfs_zone_alloc_ctx ac = { };
	int			error;
	vm_fault_t		ret;

	/*
	 * This could over-allocate as it doesn't check for truncation.
	 *
	 * But as the overallocation is limited to less than a folio and will be
	 * released instantly, that's just fine.
	 */
	error = xfs_zoned_space_reserve(ip->i_mount,
			XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
	if (error < 0)
		return vmf_fs_error(error);
	ret = __xfs_write_fault(vmf, order, &ac);
	xfs_zoned_space_unreserve(ip->i_mount, &ac);
	return ret;
}

static vm_fault_t
xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
		return xfs_write_fault_zoned(vmf, order);
	return __xfs_write_fault(vmf, order, NULL);
}

static inline bool
xfs_is_write_fault(
	struct vm_fault		*vmf)
{
	return (vmf->flags & FAULT_FLAG_WRITE) &&
	       (vmf->vma->vm_flags & VM_SHARED);
}

static vm_fault_t
xfs_filemap_fault(
	struct vm_fault		*vmf)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);

	/* DAX can shortcut the normal fault path on write faults! */
	if (IS_DAX(inode)) {
		if (xfs_is_write_fault(vmf))
			return xfs_write_fault(vmf, 0);
		return xfs_dax_read_fault(vmf, 0);
	}

	trace_xfs_read_fault(XFS_I(inode), 0);
	return filemap_fault(vmf);
}

static vm_fault_t
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
		return VM_FAULT_FALLBACK;

	/* DAX can shortcut the normal fault path on write faults! */
	if (xfs_is_write_fault(vmf))
		return xfs_write_fault(vmf, order);
	return xfs_dax_read_fault(vmf, order);
}

static vm_fault_t
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return xfs_write_fault(vmf, 0);
}

/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults.  In reality, it needs to serialise against truncate and
 * prepare memory for writing so handle it as a standard write fault.
1950 */ 1951 static vm_fault_t 1952 xfs_filemap_pfn_mkwrite( 1953 struct vm_fault *vmf) 1954 { 1955 return xfs_write_fault(vmf, 0); 1956 } 1957 1958 static const struct vm_operations_struct xfs_file_vm_ops = { 1959 .fault = xfs_filemap_fault, 1960 .huge_fault = xfs_filemap_huge_fault, 1961 .map_pages = filemap_map_pages, 1962 .page_mkwrite = xfs_filemap_page_mkwrite, 1963 .pfn_mkwrite = xfs_filemap_pfn_mkwrite, 1964 }; 1965 1966 STATIC int 1967 xfs_file_mmap_prepare( 1968 struct vm_area_desc *desc) 1969 { 1970 struct file *file = desc->file; 1971 struct inode *inode = file_inode(file); 1972 struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode)); 1973 1974 /* 1975 * We don't support synchronous mappings for non-DAX files and 1976 * for DAX files if underneath dax_device is not synchronous. 1977 */ 1978 if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), 1979 target->bt_daxdev)) 1980 return -EOPNOTSUPP; 1981 1982 file_accessed(file); 1983 desc->vm_ops = &xfs_file_vm_ops; 1984 if (IS_DAX(inode)) 1985 desc->vm_flags |= VM_HUGEPAGE; 1986 return 0; 1987 } 1988 1989 const struct file_operations xfs_file_operations = { 1990 .llseek = xfs_file_llseek, 1991 .read_iter = xfs_file_read_iter, 1992 .write_iter = xfs_file_write_iter, 1993 .splice_read = xfs_file_splice_read, 1994 .splice_write = iter_file_splice_write, 1995 .iopoll = iocb_bio_iopoll, 1996 .unlocked_ioctl = xfs_file_ioctl, 1997 #ifdef CONFIG_COMPAT 1998 .compat_ioctl = xfs_file_compat_ioctl, 1999 #endif 2000 .mmap_prepare = xfs_file_mmap_prepare, 2001 .open = xfs_file_open, 2002 .release = xfs_file_release, 2003 .fsync = xfs_file_fsync, 2004 .get_unmapped_area = thp_get_unmapped_area, 2005 .fallocate = xfs_file_fallocate, 2006 .fadvise = xfs_file_fadvise, 2007 .remap_file_range = xfs_file_remap_range, 2008 .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | 2009 FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE | 2010 FOP_DONTCACHE, 2011 .setlease = generic_setlease, 2012 }; 2013 2014 const struct file_operations xfs_dir_file_operations = { 2015 .open = xfs_dir_open, 2016 .read = generic_read_dir, 2017 .iterate_shared = xfs_file_readdir, 2018 .llseek = generic_file_llseek, 2019 .unlocked_ioctl = xfs_file_ioctl, 2020 #ifdef CONFIG_COMPAT 2021 .compat_ioctl = xfs_file_compat_ioctl, 2022 #endif 2023 .fsync = xfs_dir_fsync, 2024 .setlease = generic_setlease, 2025 }; 2026