1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_inode.h" 14 #include "xfs_trans.h" 15 #include "xfs_inode_item.h" 16 #include "xfs_bmap.h" 17 #include "xfs_bmap_util.h" 18 #include "xfs_dir2.h" 19 #include "xfs_dir2_priv.h" 20 #include "xfs_ioctl.h" 21 #include "xfs_trace.h" 22 #include "xfs_log.h" 23 #include "xfs_icache.h" 24 #include "xfs_pnfs.h" 25 #include "xfs_iomap.h" 26 #include "xfs_reflink.h" 27 #include "xfs_file.h" 28 #include "xfs_aops.h" 29 #include "xfs_zone_alloc.h" 30 #include "xfs_error.h" 31 #include "xfs_errortag.h" 32 33 #include <linux/dax.h> 34 #include <linux/falloc.h> 35 #include <linux/backing-dev.h> 36 #include <linux/mman.h> 37 #include <linux/fadvise.h> 38 #include <linux/mount.h> 39 40 static const struct vm_operations_struct xfs_file_vm_ops; 41 42 /* 43 * Decide if the given file range is aligned to the size of the fundamental 44 * allocation unit for the file. 45 */ 46 bool 47 xfs_is_falloc_aligned( 48 struct xfs_inode *ip, 49 loff_t pos, 50 long long int len) 51 { 52 unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip); 53 54 if (!is_power_of_2(alloc_unit)) 55 return isaligned_64(pos, alloc_unit) && 56 isaligned_64(len, alloc_unit); 57 58 return !((pos | len) & (alloc_unit - 1)); 59 } 60 61 /* 62 * Fsync operations on directories are much simpler than on regular files, 63 * as there is no file data to flush, and thus also no need for explicit 64 * cache flush operations, and there are no non-transaction metadata updates 65 * on directories either. 66 */ 67 STATIC int 68 xfs_dir_fsync( 69 struct file *file, 70 loff_t start, 71 loff_t end, 72 int datasync) 73 { 74 struct xfs_inode *ip = XFS_I(file->f_mapping->host); 75 76 trace_xfs_dir_fsync(ip); 77 return xfs_log_force_inode(ip); 78 } 79 80 /* 81 * All metadata updates are logged, which means that we just have to push the 82 * journal to the required sequence number than holds the updates. We track 83 * datasync commits separately to full sync commits, and hence only need to 84 * select the correct sequence number for the log force here. 85 * 86 * We don't have to serialise against concurrent modifications, as we do not 87 * have to wait for modifications that have not yet completed. We define a 88 * transaction commit as completing when the commit sequence number is updated, 89 * hence if the sequence number has not updated, the sync operation has been 90 * run before the commit completed and we don't have to wait for it. 91 * 92 * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain 93 * set on the log item until - at least - the journal flush completes. In 94 * reality, they are only cleared when the inode is fully unpinned (i.e. 95 * persistent in the journal and not dirty in the CIL), and so we rely on 96 * xfs_log_force_seq() either skipping sequences that have been persisted or 97 * waiting on sequences that are still in flight to correctly order concurrent 98 * sync operations. 99 */ 100 static int 101 xfs_fsync_flush_log( 102 struct xfs_inode *ip, 103 bool datasync, 104 int *log_flushed) 105 { 106 struct xfs_inode_log_item *iip = ip->i_itemp; 107 xfs_csn_t seq = 0; 108 109 spin_lock(&iip->ili_lock); 110 if (datasync) 111 seq = iip->ili_datasync_seq; 112 else 113 seq = iip->ili_commit_seq; 114 spin_unlock(&iip->ili_lock); 115 116 if (!seq) 117 return 0; 118 119 return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, 120 log_flushed); 121 } 122 123 STATIC int 124 xfs_file_fsync( 125 struct file *file, 126 loff_t start, 127 loff_t end, 128 int datasync) 129 { 130 struct xfs_inode *ip = XFS_I(file->f_mapping->host); 131 struct xfs_mount *mp = ip->i_mount; 132 int error, err2; 133 int log_flushed = 0; 134 135 trace_xfs_file_fsync(ip); 136 137 error = file_write_and_wait_range(file, start, end); 138 if (error) 139 return error; 140 141 if (xfs_is_shutdown(mp)) 142 return -EIO; 143 144 xfs_iflags_clear(ip, XFS_ITRUNCATED); 145 146 /* 147 * If we have an RT and/or log subvolume we need to make sure to flush 148 * the write cache the device used for file data first. This is to 149 * ensure newly written file data make it to disk before logging the new 150 * inode size in case of an extending write. 151 */ 152 if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp) 153 error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); 154 else if (mp->m_logdev_targp != mp->m_ddev_targp) 155 error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); 156 157 /* 158 * If the inode has a inode log item attached, it may need the journal 159 * flushed to persist any changes the log item might be tracking. 160 */ 161 if (ip->i_itemp) { 162 err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed); 163 if (err2 && !error) 164 error = err2; 165 } 166 167 /* 168 * If we only have a single device, and the log force about was 169 * a no-op we might have to flush the data device cache here. 170 * This can only happen for fdatasync/O_DSYNC if we were overwriting 171 * an already allocated file and thus do not have any metadata to 172 * commit. 173 */ 174 if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && 175 mp->m_logdev_targp == mp->m_ddev_targp) { 176 err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); 177 if (err2 && !error) 178 error = err2; 179 } 180 181 return error; 182 } 183 184 static int 185 xfs_ilock_iocb( 186 struct kiocb *iocb, 187 unsigned int lock_mode) 188 { 189 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); 190 191 if (iocb->ki_flags & IOCB_NOWAIT) { 192 if (!xfs_ilock_nowait(ip, lock_mode)) 193 return -EAGAIN; 194 } else { 195 xfs_ilock(ip, lock_mode); 196 } 197 198 return 0; 199 } 200 201 static int 202 xfs_ilock_iocb_for_write( 203 struct kiocb *iocb, 204 unsigned int *lock_mode) 205 { 206 ssize_t ret; 207 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); 208 209 ret = xfs_ilock_iocb(iocb, *lock_mode); 210 if (ret) 211 return ret; 212 213 /* 214 * If a reflink remap is in progress we always need to take the iolock 215 * exclusively to wait for it to finish. 216 */ 217 if (*lock_mode == XFS_IOLOCK_SHARED && 218 xfs_iflags_test(ip, XFS_IREMAPPING)) { 219 xfs_iunlock(ip, *lock_mode); 220 *lock_mode = XFS_IOLOCK_EXCL; 221 return xfs_ilock_iocb(iocb, *lock_mode); 222 } 223 224 return 0; 225 } 226 227 STATIC ssize_t 228 xfs_file_dio_read( 229 struct kiocb *iocb, 230 struct iov_iter *to) 231 { 232 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); 233 ssize_t ret; 234 235 trace_xfs_file_direct_read(iocb, to); 236 237 if (!iov_iter_count(to)) 238 return 0; /* skip atime */ 239 240 file_accessed(iocb->ki_filp); 241 242 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); 243 if (ret) 244 return ret; 245 ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0); 246 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 247 248 return ret; 249 } 250 251 static noinline ssize_t 252 xfs_file_dax_read( 253 struct kiocb *iocb, 254 struct iov_iter *to) 255 { 256 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); 257 ssize_t ret = 0; 258 259 trace_xfs_file_dax_read(iocb, to); 260 261 if (!iov_iter_count(to)) 262 return 0; /* skip atime */ 263 264 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); 265 if (ret) 266 return ret; 267 ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops); 268 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 269 270 file_accessed(iocb->ki_filp); 271 return ret; 272 } 273 274 STATIC ssize_t 275 xfs_file_buffered_read( 276 struct kiocb *iocb, 277 struct iov_iter *to) 278 { 279 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); 280 ssize_t ret; 281 282 trace_xfs_file_buffered_read(iocb, to); 283 284 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); 285 if (ret) 286 return ret; 287 ret = generic_file_read_iter(iocb, to); 288 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 289 290 return ret; 291 } 292 293 STATIC ssize_t 294 xfs_file_read_iter( 295 struct kiocb *iocb, 296 struct iov_iter *to) 297 { 298 struct inode *inode = file_inode(iocb->ki_filp); 299 struct xfs_mount *mp = XFS_I(inode)->i_mount; 300 ssize_t ret = 0; 301 302 XFS_STATS_INC(mp, xs_read_calls); 303 304 if (xfs_is_shutdown(mp)) 305 return -EIO; 306 307 if (IS_DAX(inode)) 308 ret = xfs_file_dax_read(iocb, to); 309 else if (iocb->ki_flags & IOCB_DIRECT) 310 ret = xfs_file_dio_read(iocb, to); 311 else 312 ret = xfs_file_buffered_read(iocb, to); 313 314 if (ret > 0) 315 XFS_STATS_ADD(mp, xs_read_bytes, ret); 316 return ret; 317 } 318 319 STATIC ssize_t 320 xfs_file_splice_read( 321 struct file *in, 322 loff_t *ppos, 323 struct pipe_inode_info *pipe, 324 size_t len, 325 unsigned int flags) 326 { 327 struct inode *inode = file_inode(in); 328 struct xfs_inode *ip = XFS_I(inode); 329 struct xfs_mount *mp = ip->i_mount; 330 ssize_t ret = 0; 331 332 XFS_STATS_INC(mp, xs_read_calls); 333 334 if (xfs_is_shutdown(mp)) 335 return -EIO; 336 337 trace_xfs_file_splice_read(ip, *ppos, len); 338 339 xfs_ilock(ip, XFS_IOLOCK_SHARED); 340 ret = filemap_splice_read(in, ppos, pipe, len, flags); 341 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 342 if (ret > 0) 343 XFS_STATS_ADD(mp, xs_read_bytes, ret); 344 return ret; 345 } 346 347 /* 348 * Take care of zeroing post-EOF blocks when they might exist. 349 * 350 * Returns 0 if successfully, a negative error for a failure, or 1 if this 351 * function dropped the iolock and reacquired it exclusively and the caller 352 * needs to restart the write sanity checks. 353 */ 354 static ssize_t 355 xfs_file_write_zero_eof( 356 struct kiocb *iocb, 357 struct iov_iter *from, 358 unsigned int *iolock, 359 size_t count, 360 bool *drained_dio, 361 struct xfs_zone_alloc_ctx *ac) 362 { 363 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); 364 loff_t isize; 365 int error; 366 367 /* 368 * We need to serialise against EOF updates that occur in IO completions 369 * here. We want to make sure that nobody is changing the size while 370 * we do this check until we have placed an IO barrier (i.e. hold 371 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The 372 * spinlock effectively forms a memory barrier once we have 373 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and 374 * hence be able to correctly determine if we need to run zeroing. 375 */ 376 spin_lock(&ip->i_flags_lock); 377 isize = i_size_read(VFS_I(ip)); 378 if (iocb->ki_pos <= isize) { 379 spin_unlock(&ip->i_flags_lock); 380 return 0; 381 } 382 spin_unlock(&ip->i_flags_lock); 383 384 if (iocb->ki_flags & IOCB_NOWAIT) 385 return -EAGAIN; 386 387 if (!*drained_dio) { 388 /* 389 * If zeroing is needed and we are currently holding the iolock 390 * shared, we need to update it to exclusive which implies 391 * having to redo all checks before. 392 */ 393 if (*iolock == XFS_IOLOCK_SHARED) { 394 xfs_iunlock(ip, *iolock); 395 *iolock = XFS_IOLOCK_EXCL; 396 xfs_ilock(ip, *iolock); 397 iov_iter_reexpand(from, count); 398 } 399 400 /* 401 * We now have an IO submission barrier in place, but AIO can do 402 * EOF updates during IO completion and hence we now need to 403 * wait for all of them to drain. Non-AIO DIO will have drained 404 * before we are given the XFS_IOLOCK_EXCL, and so for most 405 * cases this wait is a no-op. 406 */ 407 inode_dio_wait(VFS_I(ip)); 408 *drained_dio = true; 409 return 1; 410 } 411 412 trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); 413 414 xfs_ilock(ip, XFS_MMAPLOCK_EXCL); 415 error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL); 416 xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); 417 418 return error; 419 } 420 421 /* 422 * Common pre-write limit and setup checks. 423 * 424 * Called with the iolock held either shared and exclusive according to 425 * @iolock, and returns with it held. Might upgrade the iolock to exclusive 426 * if called for a direct write beyond i_size. 427 */ 428 STATIC ssize_t 429 xfs_file_write_checks( 430 struct kiocb *iocb, 431 struct iov_iter *from, 432 unsigned int *iolock, 433 struct xfs_zone_alloc_ctx *ac) 434 { 435 struct inode *inode = iocb->ki_filp->f_mapping->host; 436 size_t count = iov_iter_count(from); 437 bool drained_dio = false; 438 ssize_t error; 439 440 restart: 441 error = generic_write_checks(iocb, from); 442 if (error <= 0) 443 return error; 444 445 if (iocb->ki_flags & IOCB_NOWAIT) { 446 error = break_layout(inode, false); 447 if (error == -EWOULDBLOCK) 448 error = -EAGAIN; 449 } else { 450 error = xfs_break_layouts(inode, iolock, BREAK_WRITE); 451 } 452 453 if (error) 454 return error; 455 456 /* 457 * For changing security info in file_remove_privs() we need i_rwsem 458 * exclusively. 459 */ 460 if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { 461 xfs_iunlock(XFS_I(inode), *iolock); 462 *iolock = XFS_IOLOCK_EXCL; 463 error = xfs_ilock_iocb(iocb, *iolock); 464 if (error) { 465 *iolock = 0; 466 return error; 467 } 468 goto restart; 469 } 470 471 /* 472 * If the offset is beyond the size of the file, we need to zero all 473 * blocks that fall between the existing EOF and the start of this 474 * write. 475 * 476 * We can do an unlocked check for i_size here safely as I/O completion 477 * can only extend EOF. Truncate is locked out at this point, so the 478 * EOF can not move backwards, only forwards. Hence we only need to take 479 * the slow path when we are at or beyond the current EOF. 480 */ 481 if (iocb->ki_pos > i_size_read(inode)) { 482 error = xfs_file_write_zero_eof(iocb, from, iolock, count, 483 &drained_dio, ac); 484 if (error == 1) 485 goto restart; 486 if (error) 487 return error; 488 } 489 490 return kiocb_modified(iocb); 491 } 492 493 static ssize_t 494 xfs_zoned_write_space_reserve( 495 struct xfs_mount *mp, 496 struct kiocb *iocb, 497 struct iov_iter *from, 498 unsigned int flags, 499 struct xfs_zone_alloc_ctx *ac) 500 { 501 loff_t count = iov_iter_count(from); 502 int error; 503 504 if (iocb->ki_flags & IOCB_NOWAIT) 505 flags |= XFS_ZR_NOWAIT; 506 507 /* 508 * Check the rlimit and LFS boundary first so that we don't over-reserve 509 * by possibly a lot. 510 * 511 * The generic write path will redo this check later, and it might have 512 * changed by then. If it got expanded we'll stick to our earlier 513 * smaller limit, and if it is decreased the new smaller limit will be 514 * used and our extra space reservation will be returned after finishing 515 * the write. 516 */ 517 error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count); 518 if (error) 519 return error; 520 521 /* 522 * Sloppily round up count to file system blocks. 523 * 524 * This will often reserve an extra block, but that avoids having to look 525 * at the start offset, which isn't stable for O_APPEND until taking the 526 * iolock. Also we need to reserve a block each for zeroing the old 527 * EOF block and the new start block if they are unaligned. 528 * 529 * Any remaining block will be returned after the write. 530 */ 531 return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2, 532 flags, ac); 533 } 534 535 static int 536 xfs_dio_write_end_io( 537 struct kiocb *iocb, 538 ssize_t size, 539 int error, 540 unsigned flags) 541 { 542 struct inode *inode = file_inode(iocb->ki_filp); 543 struct xfs_inode *ip = XFS_I(inode); 544 loff_t offset = iocb->ki_pos; 545 unsigned int nofs_flag; 546 547 ASSERT(!xfs_is_zoned_inode(ip) || 548 !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); 549 550 trace_xfs_end_io_direct_write(ip, offset, size); 551 552 if (xfs_is_shutdown(ip->i_mount)) 553 return -EIO; 554 555 if (error) 556 return error; 557 if (!size) 558 return 0; 559 560 /* 561 * Capture amount written on completion as we can't reliably account 562 * for it on submission. 563 */ 564 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size); 565 566 /* 567 * We can allocate memory here while doing writeback on behalf of 568 * memory reclaim. To avoid memory allocation deadlocks set the 569 * task-wide nofs context for the following operations. 570 */ 571 nofs_flag = memalloc_nofs_save(); 572 573 if (flags & IOMAP_DIO_COW) { 574 if (iocb->ki_flags & IOCB_ATOMIC) 575 error = xfs_reflink_end_atomic_cow(ip, offset, size); 576 else 577 error = xfs_reflink_end_cow(ip, offset, size); 578 if (error) 579 goto out; 580 } 581 582 /* 583 * Unwritten conversion updates the in-core isize after extent 584 * conversion but before updating the on-disk size. Updating isize any 585 * earlier allows a racing dio read to find unwritten extents before 586 * they are converted. 587 */ 588 if (flags & IOMAP_DIO_UNWRITTEN) { 589 error = xfs_iomap_write_unwritten(ip, offset, size, true); 590 goto out; 591 } 592 593 /* 594 * We need to update the in-core inode size here so that we don't end up 595 * with the on-disk inode size being outside the in-core inode size. We 596 * have no other method of updating EOF for AIO, so always do it here 597 * if necessary. 598 * 599 * We need to lock the test/set EOF update as we can be racing with 600 * other IO completions here to update the EOF. Failing to serialise 601 * here can result in EOF moving backwards and Bad Things Happen when 602 * that occurs. 603 * 604 * As IO completion only ever extends EOF, we can do an unlocked check 605 * here to avoid taking the spinlock. If we land within the current EOF, 606 * then we do not need to do an extending update at all, and we don't 607 * need to take the lock to check this. If we race with an update moving 608 * EOF, then we'll either still be beyond EOF and need to take the lock, 609 * or we'll be within EOF and we don't need to take it at all. 610 */ 611 if (offset + size <= i_size_read(inode)) 612 goto out; 613 614 spin_lock(&ip->i_flags_lock); 615 if (offset + size > i_size_read(inode)) { 616 i_size_write(inode, offset + size); 617 spin_unlock(&ip->i_flags_lock); 618 error = xfs_setfilesize(ip, offset, size); 619 } else { 620 spin_unlock(&ip->i_flags_lock); 621 } 622 623 out: 624 memalloc_nofs_restore(nofs_flag); 625 return error; 626 } 627 628 static const struct iomap_dio_ops xfs_dio_write_ops = { 629 .end_io = xfs_dio_write_end_io, 630 }; 631 632 static void 633 xfs_dio_zoned_submit_io( 634 const struct iomap_iter *iter, 635 struct bio *bio, 636 loff_t file_offset) 637 { 638 struct xfs_mount *mp = XFS_I(iter->inode)->i_mount; 639 struct xfs_zone_alloc_ctx *ac = iter->private; 640 xfs_filblks_t count_fsb; 641 struct iomap_ioend *ioend; 642 643 count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size); 644 if (count_fsb > ac->reserved_blocks) { 645 xfs_err(mp, 646 "allocation (%lld) larger than reservation (%lld).", 647 count_fsb, ac->reserved_blocks); 648 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 649 bio_io_error(bio); 650 return; 651 } 652 ac->reserved_blocks -= count_fsb; 653 654 bio->bi_end_io = xfs_end_bio; 655 ioend = iomap_init_ioend(iter->inode, bio, file_offset, 656 IOMAP_IOEND_DIRECT); 657 xfs_zone_alloc_and_submit(ioend, &ac->open_zone); 658 } 659 660 static const struct iomap_dio_ops xfs_dio_zoned_write_ops = { 661 .bio_set = &iomap_ioend_bioset, 662 .submit_io = xfs_dio_zoned_submit_io, 663 .end_io = xfs_dio_write_end_io, 664 }; 665 666 /* 667 * Handle block aligned direct I/O writes. 668 */ 669 static noinline ssize_t 670 xfs_file_dio_write_aligned( 671 struct xfs_inode *ip, 672 struct kiocb *iocb, 673 struct iov_iter *from, 674 const struct iomap_ops *ops, 675 const struct iomap_dio_ops *dops, 676 struct xfs_zone_alloc_ctx *ac) 677 { 678 unsigned int iolock = XFS_IOLOCK_SHARED; 679 unsigned int dio_flags = 0; 680 ssize_t ret; 681 682 /* 683 * For always COW inodes, each bio must be aligned to the file system 684 * block size and not just the device sector size because we need to 685 * allocate a block-aligned amount of space for each write. 686 */ 687 if (xfs_is_always_cow_inode(ip)) 688 dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED; 689 690 ret = xfs_ilock_iocb_for_write(iocb, &iolock); 691 if (ret) 692 return ret; 693 ret = xfs_file_write_checks(iocb, from, &iolock, ac); 694 if (ret) 695 goto out_unlock; 696 697 /* 698 * We don't need to hold the IOLOCK exclusively across the IO, so demote 699 * the iolock back to shared if we had to take the exclusive lock in 700 * xfs_file_write_checks() for other reasons. 701 */ 702 if (iolock == XFS_IOLOCK_EXCL) { 703 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 704 iolock = XFS_IOLOCK_SHARED; 705 } 706 trace_xfs_file_direct_write(iocb, from); 707 ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0); 708 out_unlock: 709 xfs_iunlock(ip, iolock); 710 return ret; 711 } 712 713 /* 714 * Handle block aligned direct I/O writes to zoned devices. 715 */ 716 static noinline ssize_t 717 xfs_file_dio_write_zoned( 718 struct xfs_inode *ip, 719 struct kiocb *iocb, 720 struct iov_iter *from) 721 { 722 struct xfs_zone_alloc_ctx ac = { }; 723 ssize_t ret; 724 725 ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac); 726 if (ret < 0) 727 return ret; 728 ret = xfs_file_dio_write_aligned(ip, iocb, from, 729 &xfs_zoned_direct_write_iomap_ops, 730 &xfs_dio_zoned_write_ops, &ac); 731 xfs_zoned_space_unreserve(ip->i_mount, &ac); 732 return ret; 733 } 734 735 /* 736 * Handle block atomic writes 737 * 738 * Two methods of atomic writes are supported: 739 * - REQ_ATOMIC-based, which would typically use some form of HW offload in the 740 * disk 741 * - COW-based, which uses a COW fork as a staging extent for data updates 742 * before atomically updating extent mappings for the range being written 743 * 744 */ 745 static noinline ssize_t 746 xfs_file_dio_write_atomic( 747 struct xfs_inode *ip, 748 struct kiocb *iocb, 749 struct iov_iter *from) 750 { 751 unsigned int iolock = XFS_IOLOCK_SHARED; 752 ssize_t ret, ocount = iov_iter_count(from); 753 const struct iomap_ops *dops; 754 755 /* 756 * HW offload should be faster, so try that first if it is already 757 * known that the write length is not too large. 758 */ 759 if (ocount > xfs_inode_buftarg(ip)->bt_awu_max) 760 dops = &xfs_atomic_write_cow_iomap_ops; 761 else 762 dops = &xfs_direct_write_iomap_ops; 763 764 retry: 765 ret = xfs_ilock_iocb_for_write(iocb, &iolock); 766 if (ret) 767 return ret; 768 769 ret = xfs_file_write_checks(iocb, from, &iolock, NULL); 770 if (ret) 771 goto out_unlock; 772 773 /* Demote similar to xfs_file_dio_write_aligned() */ 774 if (iolock == XFS_IOLOCK_EXCL) { 775 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 776 iolock = XFS_IOLOCK_SHARED; 777 } 778 779 trace_xfs_file_direct_write(iocb, from); 780 ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, 781 0, NULL, 0); 782 783 /* 784 * The retry mechanism is based on the ->iomap_begin method returning 785 * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not 786 * possible. The REQ_ATOMIC-based method typically not be possible if 787 * the write spans multiple extents or the disk blocks are misaligned. 788 */ 789 if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) { 790 xfs_iunlock(ip, iolock); 791 dops = &xfs_atomic_write_cow_iomap_ops; 792 goto retry; 793 } 794 795 out_unlock: 796 if (iolock) 797 xfs_iunlock(ip, iolock); 798 return ret; 799 } 800 801 /* 802 * Handle block unaligned direct I/O writes 803 * 804 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing 805 * them to be done in parallel with reads and other direct I/O writes. However, 806 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need 807 * to do sub-block zeroing and that requires serialisation against other direct 808 * I/O to the same block. In this case we need to serialise the submission of 809 * the unaligned I/O so that we don't get racing block zeroing in the dio layer. 810 * In the case where sub-block zeroing is not required, we can do concurrent 811 * sub-block dios to the same block successfully. 812 * 813 * Optimistically submit the I/O using the shared lock first, but use the 814 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN 815 * if block allocation or partial block zeroing would be required. In that case 816 * we try again with the exclusive lock. 817 */ 818 static noinline ssize_t 819 xfs_file_dio_write_unaligned( 820 struct xfs_inode *ip, 821 struct kiocb *iocb, 822 struct iov_iter *from) 823 { 824 size_t isize = i_size_read(VFS_I(ip)); 825 size_t count = iov_iter_count(from); 826 unsigned int iolock = XFS_IOLOCK_SHARED; 827 unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY; 828 ssize_t ret; 829 830 /* 831 * Extending writes need exclusivity because of the sub-block zeroing 832 * that the DIO code always does for partial tail blocks beyond EOF, so 833 * don't even bother trying the fast path in this case. 834 */ 835 if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) { 836 if (iocb->ki_flags & IOCB_NOWAIT) 837 return -EAGAIN; 838 retry_exclusive: 839 iolock = XFS_IOLOCK_EXCL; 840 flags = IOMAP_DIO_FORCE_WAIT; 841 } 842 843 ret = xfs_ilock_iocb_for_write(iocb, &iolock); 844 if (ret) 845 return ret; 846 847 /* 848 * We can't properly handle unaligned direct I/O to reflink files yet, 849 * as we can't unshare a partial block. 850 */ 851 if (xfs_is_cow_inode(ip)) { 852 trace_xfs_reflink_bounce_dio_write(iocb, from); 853 ret = -ENOTBLK; 854 goto out_unlock; 855 } 856 857 ret = xfs_file_write_checks(iocb, from, &iolock, NULL); 858 if (ret) 859 goto out_unlock; 860 861 /* 862 * If we are doing exclusive unaligned I/O, this must be the only I/O 863 * in-flight. Otherwise we risk data corruption due to unwritten extent 864 * conversions from the AIO end_io handler. Wait for all other I/O to 865 * drain first. 866 */ 867 if (flags & IOMAP_DIO_FORCE_WAIT) 868 inode_dio_wait(VFS_I(ip)); 869 870 trace_xfs_file_direct_write(iocb, from); 871 ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, 872 &xfs_dio_write_ops, flags, NULL, 0); 873 874 /* 875 * Retry unaligned I/O with exclusive blocking semantics if the DIO 876 * layer rejected it for mapping or locking reasons. If we are doing 877 * nonblocking user I/O, propagate the error. 878 */ 879 if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) { 880 ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY); 881 xfs_iunlock(ip, iolock); 882 goto retry_exclusive; 883 } 884 885 out_unlock: 886 if (iolock) 887 xfs_iunlock(ip, iolock); 888 return ret; 889 } 890 891 static ssize_t 892 xfs_file_dio_write( 893 struct kiocb *iocb, 894 struct iov_iter *from) 895 { 896 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); 897 struct xfs_buftarg *target = xfs_inode_buftarg(ip); 898 size_t count = iov_iter_count(from); 899 900 /* direct I/O must be aligned to device logical sector size */ 901 if ((iocb->ki_pos | count) & target->bt_logical_sectormask) 902 return -EINVAL; 903 904 if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) 905 return xfs_file_dio_write_unaligned(ip, iocb, from); 906 if (xfs_is_zoned_inode(ip)) 907 return xfs_file_dio_write_zoned(ip, iocb, from); 908 if (iocb->ki_flags & IOCB_ATOMIC) 909 return xfs_file_dio_write_atomic(ip, iocb, from); 910 return xfs_file_dio_write_aligned(ip, iocb, from, 911 &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL); 912 } 913 914 static noinline ssize_t 915 xfs_file_dax_write( 916 struct kiocb *iocb, 917 struct iov_iter *from) 918 { 919 struct inode *inode = iocb->ki_filp->f_mapping->host; 920 struct xfs_inode *ip = XFS_I(inode); 921 unsigned int iolock = XFS_IOLOCK_EXCL; 922 ssize_t ret, error = 0; 923 loff_t pos; 924 925 ret = xfs_ilock_iocb(iocb, iolock); 926 if (ret) 927 return ret; 928 ret = xfs_file_write_checks(iocb, from, &iolock, NULL); 929 if (ret) 930 goto out; 931 932 pos = iocb->ki_pos; 933 934 trace_xfs_file_dax_write(iocb, from); 935 ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops); 936 if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { 937 i_size_write(inode, iocb->ki_pos); 938 error = xfs_setfilesize(ip, pos, ret); 939 } 940 out: 941 if (iolock) 942 xfs_iunlock(ip, iolock); 943 if (error) 944 return error; 945 946 if (ret > 0) { 947 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); 948 949 /* Handle various SYNC-type writes */ 950 ret = generic_write_sync(iocb, ret); 951 } 952 return ret; 953 } 954 955 STATIC ssize_t 956 xfs_file_buffered_write( 957 struct kiocb *iocb, 958 struct iov_iter *from) 959 { 960 struct inode *inode = iocb->ki_filp->f_mapping->host; 961 struct xfs_inode *ip = XFS_I(inode); 962 ssize_t ret; 963 bool cleared_space = false; 964 unsigned int iolock; 965 966 write_retry: 967 iolock = XFS_IOLOCK_EXCL; 968 ret = xfs_ilock_iocb(iocb, iolock); 969 if (ret) 970 return ret; 971 972 ret = xfs_file_write_checks(iocb, from, &iolock, NULL); 973 if (ret) 974 goto out; 975 976 trace_xfs_file_buffered_write(iocb, from); 977 ret = iomap_file_buffered_write(iocb, from, 978 &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, 979 NULL); 980 981 /* 982 * If we hit a space limit, try to free up some lingering preallocated 983 * space before returning an error. In the case of ENOSPC, first try to 984 * write back all dirty inodes to free up some of the excess reserved 985 * metadata space. This reduces the chances that the eofblocks scan 986 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this 987 * also behaves as a filter to prevent too many eofblocks scans from 988 * running at the same time. Use a synchronous scan to increase the 989 * effectiveness of the scan. 990 */ 991 if (ret == -EDQUOT && !cleared_space) { 992 xfs_iunlock(ip, iolock); 993 xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC); 994 cleared_space = true; 995 goto write_retry; 996 } else if (ret == -ENOSPC && !cleared_space) { 997 struct xfs_icwalk icw = {0}; 998 999 cleared_space = true; 1000 xfs_flush_inodes(ip->i_mount); 1001 1002 xfs_iunlock(ip, iolock); 1003 icw.icw_flags = XFS_ICWALK_FLAG_SYNC; 1004 xfs_blockgc_free_space(ip->i_mount, &icw); 1005 goto write_retry; 1006 } 1007 1008 out: 1009 if (iolock) 1010 xfs_iunlock(ip, iolock); 1011 1012 if (ret > 0) { 1013 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); 1014 /* Handle various SYNC-type writes */ 1015 ret = generic_write_sync(iocb, ret); 1016 } 1017 return ret; 1018 } 1019 1020 STATIC ssize_t 1021 xfs_file_buffered_write_zoned( 1022 struct kiocb *iocb, 1023 struct iov_iter *from) 1024 { 1025 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); 1026 struct xfs_mount *mp = ip->i_mount; 1027 unsigned int iolock = XFS_IOLOCK_EXCL; 1028 bool cleared_space = false; 1029 struct xfs_zone_alloc_ctx ac = { }; 1030 ssize_t ret; 1031 1032 ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac); 1033 if (ret < 0) 1034 return ret; 1035 1036 ret = xfs_ilock_iocb(iocb, iolock); 1037 if (ret) 1038 goto out_unreserve; 1039 1040 ret = xfs_file_write_checks(iocb, from, &iolock, &ac); 1041 if (ret) 1042 goto out_unlock; 1043 1044 /* 1045 * Truncate the iter to the length that we were actually able to 1046 * allocate blocks for. This needs to happen after 1047 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND 1048 * writes. 1049 */ 1050 iov_iter_truncate(from, 1051 XFS_FSB_TO_B(mp, ac.reserved_blocks) - 1052 (iocb->ki_pos & mp->m_blockmask)); 1053 if (!iov_iter_count(from)) 1054 goto out_unlock; 1055 1056 retry: 1057 trace_xfs_file_buffered_write(iocb, from); 1058 ret = iomap_file_buffered_write(iocb, from, 1059 &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, 1060 &ac); 1061 if (ret == -ENOSPC && !cleared_space) { 1062 /* 1063 * Kick off writeback to convert delalloc space and release the 1064 * usually too pessimistic indirect block reservations. 1065 */ 1066 xfs_flush_inodes(mp); 1067 cleared_space = true; 1068 goto retry; 1069 } 1070 1071 out_unlock: 1072 xfs_iunlock(ip, iolock); 1073 out_unreserve: 1074 xfs_zoned_space_unreserve(ip->i_mount, &ac); 1075 if (ret > 0) { 1076 XFS_STATS_ADD(mp, xs_write_bytes, ret); 1077 ret = generic_write_sync(iocb, ret); 1078 } 1079 return ret; 1080 } 1081 1082 STATIC ssize_t 1083 xfs_file_write_iter( 1084 struct kiocb *iocb, 1085 struct iov_iter *from) 1086 { 1087 struct inode *inode = iocb->ki_filp->f_mapping->host; 1088 struct xfs_inode *ip = XFS_I(inode); 1089 ssize_t ret; 1090 size_t ocount = iov_iter_count(from); 1091 1092 XFS_STATS_INC(ip->i_mount, xs_write_calls); 1093 1094 if (ocount == 0) 1095 return 0; 1096 1097 if (xfs_is_shutdown(ip->i_mount)) 1098 return -EIO; 1099 1100 if (iocb->ki_flags & IOCB_ATOMIC) { 1101 if (ocount < xfs_get_atomic_write_min(ip)) 1102 return -EINVAL; 1103 1104 if (ocount > xfs_get_atomic_write_max(ip)) 1105 return -EINVAL; 1106 1107 ret = generic_atomic_write_valid(iocb, from); 1108 if (ret) 1109 return ret; 1110 } 1111 1112 if (IS_DAX(inode)) 1113 return xfs_file_dax_write(iocb, from); 1114 1115 if (iocb->ki_flags & IOCB_DIRECT) { 1116 /* 1117 * Allow a directio write to fall back to a buffered 1118 * write *only* in the case that we're doing a reflink 1119 * CoW. In all other directio scenarios we do not 1120 * allow an operation to fall back to buffered mode. 1121 */ 1122 ret = xfs_file_dio_write(iocb, from); 1123 if (ret != -ENOTBLK) 1124 return ret; 1125 } 1126 1127 if (xfs_is_zoned_inode(ip)) 1128 return xfs_file_buffered_write_zoned(iocb, from); 1129 return xfs_file_buffered_write(iocb, from); 1130 } 1131 1132 /* Does this file, inode, or mount want synchronous writes? */ 1133 static inline bool xfs_file_sync_writes(struct file *filp) 1134 { 1135 struct xfs_inode *ip = XFS_I(file_inode(filp)); 1136 1137 if (xfs_has_wsync(ip->i_mount)) 1138 return true; 1139 if (filp->f_flags & (__O_SYNC | O_DSYNC)) 1140 return true; 1141 if (IS_SYNC(file_inode(filp))) 1142 return true; 1143 1144 return false; 1145 } 1146 1147 static int 1148 xfs_falloc_newsize( 1149 struct file *file, 1150 int mode, 1151 loff_t offset, 1152 loff_t len, 1153 loff_t *new_size) 1154 { 1155 struct inode *inode = file_inode(file); 1156 1157 if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode)) 1158 return 0; 1159 *new_size = offset + len; 1160 return inode_newsize_ok(inode, *new_size); 1161 } 1162 1163 static int 1164 xfs_falloc_setsize( 1165 struct file *file, 1166 loff_t new_size) 1167 { 1168 struct iattr iattr = { 1169 .ia_valid = ATTR_SIZE, 1170 .ia_size = new_size, 1171 }; 1172 1173 if (!new_size) 1174 return 0; 1175 return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file), 1176 &iattr); 1177 } 1178 1179 static int 1180 xfs_falloc_collapse_range( 1181 struct file *file, 1182 loff_t offset, 1183 loff_t len, 1184 struct xfs_zone_alloc_ctx *ac) 1185 { 1186 struct inode *inode = file_inode(file); 1187 loff_t new_size = i_size_read(inode) - len; 1188 int error; 1189 1190 if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) 1191 return -EINVAL; 1192 1193 /* 1194 * There is no need to overlap collapse range with EOF, in which case it 1195 * is effectively a truncate operation 1196 */ 1197 if (offset + len >= i_size_read(inode)) 1198 return -EINVAL; 1199 1200 error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac); 1201 if (error) 1202 return error; 1203 return xfs_falloc_setsize(file, new_size); 1204 } 1205 1206 static int 1207 xfs_falloc_insert_range( 1208 struct file *file, 1209 loff_t offset, 1210 loff_t len) 1211 { 1212 struct inode *inode = file_inode(file); 1213 loff_t isize = i_size_read(inode); 1214 int error; 1215 1216 if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) 1217 return -EINVAL; 1218 1219 /* 1220 * New inode size must not exceed ->s_maxbytes, accounting for 1221 * possible signed overflow. 1222 */ 1223 if (inode->i_sb->s_maxbytes - isize < len) 1224 return -EFBIG; 1225 1226 /* Offset should be less than i_size */ 1227 if (offset >= isize) 1228 return -EINVAL; 1229 1230 error = xfs_falloc_setsize(file, isize + len); 1231 if (error) 1232 return error; 1233 1234 /* 1235 * Perform hole insertion now that the file size has been updated so 1236 * that if we crash during the operation we don't leave shifted extents 1237 * past EOF and hence losing access to the data that is contained within 1238 * them. 1239 */ 1240 return xfs_insert_file_space(XFS_I(inode), offset, len); 1241 } 1242 1243 /* 1244 * For various operations we need to zero up to one block at each end of 1245 * the affected range. For zoned file systems this will require a space 1246 * allocation, for which we need a reservation ahead of time. 1247 */ 1248 #define XFS_ZONED_ZERO_EDGE_SPACE_RES 2 1249 1250 /* 1251 * Zero range implements a full zeroing mechanism but is only used in limited 1252 * situations. It is more efficient to allocate unwritten extents than to 1253 * perform zeroing here, so use an errortag to randomly force zeroing on DEBUG 1254 * kernels for added test coverage. 1255 * 1256 * On zoned file systems, the error is already injected by 1257 * xfs_file_zoned_fallocate, which then reserves the additional space needed. 1258 * We only check for this extra space reservation here. 1259 */ 1260 static inline bool 1261 xfs_falloc_force_zero( 1262 struct xfs_inode *ip, 1263 struct xfs_zone_alloc_ctx *ac) 1264 { 1265 if (xfs_is_zoned_inode(ip)) { 1266 if (ac->reserved_blocks > XFS_ZONED_ZERO_EDGE_SPACE_RES) { 1267 ASSERT(IS_ENABLED(CONFIG_XFS_DEBUG)); 1268 return true; 1269 } 1270 return false; 1271 } 1272 return XFS_TEST_ERROR(ip->i_mount, XFS_ERRTAG_FORCE_ZERO_RANGE); 1273 } 1274 1275 /* 1276 * Punch a hole and prealloc the range. We use a hole punch rather than 1277 * unwritten extent conversion for two reasons: 1278 * 1279 * 1.) Hole punch handles partial block zeroing for us. 1280 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by 1281 * virtue of the hole punch. 1282 */ 1283 static int 1284 xfs_falloc_zero_range( 1285 struct file *file, 1286 int mode, 1287 loff_t offset, 1288 loff_t len, 1289 struct xfs_zone_alloc_ctx *ac) 1290 { 1291 struct inode *inode = file_inode(file); 1292 struct xfs_inode *ip = XFS_I(inode); 1293 unsigned int blksize = i_blocksize(inode); 1294 loff_t new_size = 0; 1295 int error; 1296 1297 trace_xfs_zero_file_space(ip); 1298 1299 error = xfs_falloc_newsize(file, mode, offset, len, &new_size); 1300 if (error) 1301 return error; 1302 1303 if (xfs_falloc_force_zero(ip, ac)) { 1304 error = xfs_zero_range(ip, offset, len, ac, NULL); 1305 } else { 1306 error = xfs_free_file_space(ip, offset, len, ac); 1307 if (error) 1308 return error; 1309 1310 len = round_up(offset + len, blksize) - 1311 round_down(offset, blksize); 1312 offset = round_down(offset, blksize); 1313 error = xfs_alloc_file_space(ip, offset, len); 1314 } 1315 if (error) 1316 return error; 1317 return xfs_falloc_setsize(file, new_size); 1318 } 1319 1320 static int 1321 xfs_falloc_unshare_range( 1322 struct file *file, 1323 int mode, 1324 loff_t offset, 1325 loff_t len) 1326 { 1327 struct inode *inode = file_inode(file); 1328 loff_t new_size = 0; 1329 int error; 1330 1331 error = xfs_falloc_newsize(file, mode, offset, len, &new_size); 1332 if (error) 1333 return error; 1334 1335 error = xfs_reflink_unshare(XFS_I(inode), offset, len); 1336 if (error) 1337 return error; 1338 1339 error = xfs_alloc_file_space(XFS_I(inode), offset, len); 1340 if (error) 1341 return error; 1342 return xfs_falloc_setsize(file, new_size); 1343 } 1344 1345 static int 1346 xfs_falloc_allocate_range( 1347 struct file *file, 1348 int mode, 1349 loff_t offset, 1350 loff_t len) 1351 { 1352 struct inode *inode = file_inode(file); 1353 loff_t new_size = 0; 1354 int error; 1355 1356 /* 1357 * If always_cow mode we can't use preallocations and thus should not 1358 * create them. 1359 */ 1360 if (xfs_is_always_cow_inode(XFS_I(inode))) 1361 return -EOPNOTSUPP; 1362 1363 error = xfs_falloc_newsize(file, mode, offset, len, &new_size); 1364 if (error) 1365 return error; 1366 1367 error = xfs_alloc_file_space(XFS_I(inode), offset, len); 1368 if (error) 1369 return error; 1370 return xfs_falloc_setsize(file, new_size); 1371 } 1372 1373 #define XFS_FALLOC_FL_SUPPORTED \ 1374 (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \ 1375 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \ 1376 FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \ 1377 FALLOC_FL_UNSHARE_RANGE) 1378 1379 STATIC long 1380 __xfs_file_fallocate( 1381 struct file *file, 1382 int mode, 1383 loff_t offset, 1384 loff_t len, 1385 struct xfs_zone_alloc_ctx *ac) 1386 { 1387 struct inode *inode = file_inode(file); 1388 struct xfs_inode *ip = XFS_I(inode); 1389 long error; 1390 uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; 1391 1392 xfs_ilock(ip, iolock); 1393 error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); 1394 if (error) 1395 goto out_unlock; 1396 1397 /* 1398 * Must wait for all AIO to complete before we continue as AIO can 1399 * change the file size on completion without holding any locks we 1400 * currently hold. We must do this first because AIO can update both 1401 * the on disk and in memory inode sizes, and the operations that follow 1402 * require the in-memory size to be fully up-to-date. 1403 */ 1404 inode_dio_wait(inode); 1405 1406 error = file_modified(file); 1407 if (error) 1408 goto out_unlock; 1409 1410 switch (mode & FALLOC_FL_MODE_MASK) { 1411 case FALLOC_FL_PUNCH_HOLE: 1412 error = xfs_free_file_space(ip, offset, len, ac); 1413 break; 1414 case FALLOC_FL_COLLAPSE_RANGE: 1415 error = xfs_falloc_collapse_range(file, offset, len, ac); 1416 break; 1417 case FALLOC_FL_INSERT_RANGE: 1418 error = xfs_falloc_insert_range(file, offset, len); 1419 break; 1420 case FALLOC_FL_ZERO_RANGE: 1421 error = xfs_falloc_zero_range(file, mode, offset, len, ac); 1422 break; 1423 case FALLOC_FL_UNSHARE_RANGE: 1424 error = xfs_falloc_unshare_range(file, mode, offset, len); 1425 break; 1426 case FALLOC_FL_ALLOCATE_RANGE: 1427 error = xfs_falloc_allocate_range(file, mode, offset, len); 1428 break; 1429 default: 1430 error = -EOPNOTSUPP; 1431 break; 1432 } 1433 1434 if (!error && xfs_file_sync_writes(file)) 1435 error = xfs_log_force_inode(ip); 1436 1437 out_unlock: 1438 xfs_iunlock(ip, iolock); 1439 return error; 1440 } 1441 1442 static long 1443 xfs_file_zoned_fallocate( 1444 struct file *file, 1445 int mode, 1446 loff_t offset, 1447 loff_t len) 1448 { 1449 struct xfs_zone_alloc_ctx ac = { }; 1450 struct xfs_inode *ip = XFS_I(file_inode(file)); 1451 struct xfs_mount *mp = ip->i_mount; 1452 xfs_filblks_t count_fsb; 1453 int error; 1454 1455 /* 1456 * If full zeroing is forced by the error injection knob, we need a 1457 * space reservation that covers the entire range. See the comment in 1458 * xfs_zoned_write_space_reserve for the rationale for the calculation. 1459 * Otherwise just reserve space for the two boundary blocks. 1460 */ 1461 count_fsb = XFS_ZONED_ZERO_EDGE_SPACE_RES; 1462 if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ZERO_RANGE && 1463 XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_ZERO_RANGE)) 1464 count_fsb += XFS_B_TO_FSB(mp, len) + 1; 1465 1466 error = xfs_zoned_space_reserve(mp, count_fsb, XFS_ZR_RESERVED, &ac); 1467 if (error) 1468 return error; 1469 error = __xfs_file_fallocate(file, mode, offset, len, &ac); 1470 xfs_zoned_space_unreserve(mp, &ac); 1471 return error; 1472 } 1473 1474 static long 1475 xfs_file_fallocate( 1476 struct file *file, 1477 int mode, 1478 loff_t offset, 1479 loff_t len) 1480 { 1481 struct inode *inode = file_inode(file); 1482 1483 if (!S_ISREG(inode->i_mode)) 1484 return -EINVAL; 1485 if (mode & ~XFS_FALLOC_FL_SUPPORTED) 1486 return -EOPNOTSUPP; 1487 1488 /* 1489 * For zoned file systems, zeroing the first and last block of a hole 1490 * punch requires allocating a new block to rewrite the remaining data 1491 * and new zeroes out of place. Get a reservations for those before 1492 * taking the iolock. Dip into the reserved pool because we are 1493 * expected to be able to punch a hole even on a completely full 1494 * file system. 1495 */ 1496 if (xfs_is_zoned_inode(XFS_I(inode)) && 1497 (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | 1498 FALLOC_FL_COLLAPSE_RANGE))) 1499 return xfs_file_zoned_fallocate(file, mode, offset, len); 1500 return __xfs_file_fallocate(file, mode, offset, len, NULL); 1501 } 1502 1503 STATIC int 1504 xfs_file_fadvise( 1505 struct file *file, 1506 loff_t start, 1507 loff_t end, 1508 int advice) 1509 { 1510 struct xfs_inode *ip = XFS_I(file_inode(file)); 1511 int ret; 1512 int lockflags = 0; 1513 1514 /* 1515 * Operations creating pages in page cache need protection from hole 1516 * punching and similar ops 1517 */ 1518 if (advice == POSIX_FADV_WILLNEED) { 1519 lockflags = XFS_IOLOCK_SHARED; 1520 xfs_ilock(ip, lockflags); 1521 } 1522 ret = generic_fadvise(file, start, end, advice); 1523 if (lockflags) 1524 xfs_iunlock(ip, lockflags); 1525 return ret; 1526 } 1527 1528 STATIC loff_t 1529 xfs_file_remap_range( 1530 struct file *file_in, 1531 loff_t pos_in, 1532 struct file *file_out, 1533 loff_t pos_out, 1534 loff_t len, 1535 unsigned int remap_flags) 1536 { 1537 struct inode *inode_in = file_inode(file_in); 1538 struct xfs_inode *src = XFS_I(inode_in); 1539 struct inode *inode_out = file_inode(file_out); 1540 struct xfs_inode *dest = XFS_I(inode_out); 1541 struct xfs_mount *mp = src->i_mount; 1542 loff_t remapped = 0; 1543 xfs_extlen_t cowextsize; 1544 int ret; 1545 1546 if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) 1547 return -EINVAL; 1548 1549 if (!xfs_has_reflink(mp)) 1550 return -EOPNOTSUPP; 1551 1552 if (xfs_is_shutdown(mp)) 1553 return -EIO; 1554 1555 /* Prepare and then clone file data. */ 1556 ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, 1557 &len, remap_flags); 1558 if (ret || len == 0) 1559 return ret; 1560 1561 trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); 1562 1563 ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len, 1564 &remapped); 1565 if (ret) 1566 goto out_unlock; 1567 1568 /* 1569 * Carry the cowextsize hint from src to dest if we're sharing the 1570 * entire source file to the entire destination file, the source file 1571 * has a cowextsize hint, and the destination file does not. 1572 */ 1573 cowextsize = 0; 1574 if (pos_in == 0 && len == i_size_read(inode_in) && 1575 (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && 1576 pos_out == 0 && len >= i_size_read(inode_out) && 1577 !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)) 1578 cowextsize = src->i_cowextsize; 1579 1580 ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, 1581 remap_flags); 1582 if (ret) 1583 goto out_unlock; 1584 1585 if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) 1586 xfs_log_force_inode(dest); 1587 out_unlock: 1588 xfs_iunlock2_remapping(src, dest); 1589 if (ret) 1590 trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); 1591 /* 1592 * If the caller did not set CAN_SHORTEN, then it is not prepared to 1593 * handle partial results -- either the whole remap succeeds, or we 1594 * must say why it did not. In this case, any error should be returned 1595 * to the caller. 1596 */ 1597 if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) 1598 return ret; 1599 return remapped > 0 ? remapped : ret; 1600 } 1601 1602 STATIC int 1603 xfs_file_open( 1604 struct inode *inode, 1605 struct file *file) 1606 { 1607 if (xfs_is_shutdown(XFS_M(inode->i_sb))) 1608 return -EIO; 1609 file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; 1610 if (xfs_get_atomic_write_min(XFS_I(inode)) > 0) 1611 file->f_mode |= FMODE_CAN_ATOMIC_WRITE; 1612 return generic_file_open(inode, file); 1613 } 1614 1615 STATIC int 1616 xfs_dir_open( 1617 struct inode *inode, 1618 struct file *file) 1619 { 1620 struct xfs_inode *ip = XFS_I(inode); 1621 unsigned int mode; 1622 int error; 1623 1624 if (xfs_is_shutdown(ip->i_mount)) 1625 return -EIO; 1626 error = generic_file_open(inode, file); 1627 if (error) 1628 return error; 1629 1630 /* 1631 * If there are any blocks, read-ahead block 0 as we're almost 1632 * certain to have the next operation be a read there. 1633 */ 1634 mode = xfs_ilock_data_map_shared(ip); 1635 if (ip->i_df.if_nextents > 0) 1636 error = xfs_dir3_data_readahead(ip, 0, 0); 1637 xfs_iunlock(ip, mode); 1638 return error; 1639 } 1640 1641 /* 1642 * Don't bother propagating errors. We're just doing cleanup, and the caller 1643 * ignores the return value anyway. 1644 */ 1645 STATIC int 1646 xfs_file_release( 1647 struct inode *inode, 1648 struct file *file) 1649 { 1650 struct xfs_inode *ip = XFS_I(inode); 1651 struct xfs_mount *mp = ip->i_mount; 1652 1653 /* 1654 * If this is a read-only mount or the file system has been shut down, 1655 * don't generate I/O. 1656 */ 1657 if (xfs_is_readonly(mp) || xfs_is_shutdown(mp)) 1658 return 0; 1659 1660 /* 1661 * If we previously truncated this file and removed old data in the 1662 * process, we want to initiate "early" writeout on the last close. 1663 * This is an attempt to combat the notorious NULL files problem which 1664 * is particularly noticeable from a truncate down, buffered (re-)write 1665 * (delalloc), followed by a crash. What we are effectively doing here 1666 * is significantly reducing the time window where we'd otherwise be 1667 * exposed to that problem. 1668 */ 1669 if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) { 1670 xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED); 1671 if (ip->i_delayed_blks > 0) 1672 filemap_flush(inode->i_mapping); 1673 } 1674 1675 /* 1676 * XFS aggressively preallocates post-EOF space to generate contiguous 1677 * allocations for writers that append to the end of the file. 1678 * 1679 * To support workloads that close and reopen the file frequently, these 1680 * preallocations usually persist after a close unless it is the first 1681 * close for the inode. This is a tradeoff to generate tightly packed 1682 * data layouts for unpacking tarballs or similar archives that write 1683 * one file after another without going back to it while keeping the 1684 * preallocation for files that have recurring open/write/close cycles. 1685 * 1686 * This heuristic is skipped for inodes with the append-only flag as 1687 * that flag is rather pointless for inodes written only once. 1688 * 1689 * There is no point in freeing blocks here for open but unlinked files 1690 * as they will be taken care of by the inactivation path soon. 1691 * 1692 * When releasing a read-only context, don't flush data or trim post-EOF 1693 * blocks. This avoids open/read/close workloads from removing EOF 1694 * blocks that other writers depend upon to reduce fragmentation. 1695 * 1696 * Inodes on the zoned RT device never have preallocations, so skip 1697 * taking the locks below. 1698 */ 1699 if (!inode->i_nlink || 1700 !(file->f_mode & FMODE_WRITE) || 1701 (ip->i_diflags & XFS_DIFLAG_APPEND) || 1702 xfs_is_zoned_inode(ip)) 1703 return 0; 1704 1705 /* 1706 * If we can't get the iolock just skip truncating the blocks past EOF 1707 * because we could deadlock with the mmap_lock otherwise. We'll get 1708 * another chance to drop them once the last reference to the inode is 1709 * dropped, so we'll never leak blocks permanently. 1710 */ 1711 if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && 1712 xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1713 if (xfs_can_free_eofblocks(ip) && 1714 !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED)) 1715 xfs_free_eofblocks(ip); 1716 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 1717 } 1718 1719 return 0; 1720 } 1721 1722 STATIC int 1723 xfs_file_readdir( 1724 struct file *file, 1725 struct dir_context *ctx) 1726 { 1727 struct inode *inode = file_inode(file); 1728 xfs_inode_t *ip = XFS_I(inode); 1729 size_t bufsize; 1730 1731 /* 1732 * The Linux API doesn't pass down the total size of the buffer 1733 * we read into down to the filesystem. With the filldir concept 1734 * it's not needed for correct information, but the XFS dir2 leaf 1735 * code wants an estimate of the buffer size to calculate it's 1736 * readahead window and size the buffers used for mapping to 1737 * physical blocks. 1738 * 1739 * Try to give it an estimate that's good enough, maybe at some 1740 * point we can change the ->readdir prototype to include the 1741 * buffer size. For now we use the current glibc buffer size. 1742 */ 1743 bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size); 1744 1745 return xfs_readdir(NULL, ip, ctx, bufsize); 1746 } 1747 1748 STATIC loff_t 1749 xfs_file_llseek( 1750 struct file *file, 1751 loff_t offset, 1752 int whence) 1753 { 1754 struct inode *inode = file->f_mapping->host; 1755 1756 if (xfs_is_shutdown(XFS_I(inode)->i_mount)) 1757 return -EIO; 1758 1759 switch (whence) { 1760 default: 1761 return generic_file_llseek(file, offset, whence); 1762 case SEEK_HOLE: 1763 offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops); 1764 break; 1765 case SEEK_DATA: 1766 offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops); 1767 break; 1768 } 1769 1770 if (offset < 0) 1771 return offset; 1772 return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 1773 } 1774 1775 static inline vm_fault_t 1776 xfs_dax_fault_locked( 1777 struct vm_fault *vmf, 1778 unsigned int order, 1779 bool write_fault) 1780 { 1781 vm_fault_t ret; 1782 unsigned long pfn; 1783 1784 if (!IS_ENABLED(CONFIG_FS_DAX)) { 1785 ASSERT(0); 1786 return VM_FAULT_SIGBUS; 1787 } 1788 ret = dax_iomap_fault(vmf, order, &pfn, NULL, 1789 (write_fault && !vmf->cow_page) ? 1790 &xfs_dax_write_iomap_ops : 1791 &xfs_read_iomap_ops); 1792 if (ret & VM_FAULT_NEEDDSYNC) 1793 ret = dax_finish_sync_fault(vmf, order, pfn); 1794 return ret; 1795 } 1796 1797 static vm_fault_t 1798 xfs_dax_read_fault( 1799 struct vm_fault *vmf, 1800 unsigned int order) 1801 { 1802 struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file)); 1803 vm_fault_t ret; 1804 1805 trace_xfs_read_fault(ip, order); 1806 1807 xfs_ilock(ip, XFS_MMAPLOCK_SHARED); 1808 ret = xfs_dax_fault_locked(vmf, order, false); 1809 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); 1810 1811 return ret; 1812 } 1813 1814 /* 1815 * Locking for serialisation of IO during page faults. This results in a lock 1816 * ordering of: 1817 * 1818 * mmap_lock (MM) 1819 * sb_start_pagefault(vfs, freeze) 1820 * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) 1821 * page_lock (MM) 1822 * i_lock (XFS - extent map serialisation) 1823 */ 1824 static vm_fault_t 1825 __xfs_write_fault( 1826 struct vm_fault *vmf, 1827 unsigned int order, 1828 struct xfs_zone_alloc_ctx *ac) 1829 { 1830 struct inode *inode = file_inode(vmf->vma->vm_file); 1831 struct xfs_inode *ip = XFS_I(inode); 1832 unsigned int lock_mode = XFS_MMAPLOCK_SHARED; 1833 vm_fault_t ret; 1834 1835 trace_xfs_write_fault(ip, order); 1836 1837 sb_start_pagefault(inode->i_sb); 1838 file_update_time(vmf->vma->vm_file); 1839 1840 /* 1841 * Normally we only need the shared mmaplock, but if a reflink remap is 1842 * in progress we take the exclusive lock to wait for the remap to 1843 * finish before taking a write fault. 1844 */ 1845 xfs_ilock(ip, XFS_MMAPLOCK_SHARED); 1846 if (xfs_iflags_test(ip, XFS_IREMAPPING)) { 1847 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); 1848 xfs_ilock(ip, XFS_MMAPLOCK_EXCL); 1849 lock_mode = XFS_MMAPLOCK_EXCL; 1850 } 1851 1852 if (IS_DAX(inode)) 1853 ret = xfs_dax_fault_locked(vmf, order, true); 1854 else 1855 ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops, 1856 ac); 1857 xfs_iunlock(ip, lock_mode); 1858 1859 sb_end_pagefault(inode->i_sb); 1860 return ret; 1861 } 1862 1863 static vm_fault_t 1864 xfs_write_fault_zoned( 1865 struct vm_fault *vmf, 1866 unsigned int order) 1867 { 1868 struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file)); 1869 unsigned int len = folio_size(page_folio(vmf->page)); 1870 struct xfs_zone_alloc_ctx ac = { }; 1871 int error; 1872 vm_fault_t ret; 1873 1874 /* 1875 * This could over-allocate as it doesn't check for truncation. 1876 * 1877 * But as the overallocation is limited to less than a folio and will be 1878 * release instantly that's just fine. 1879 */ 1880 error = xfs_zoned_space_reserve(ip->i_mount, 1881 XFS_B_TO_FSB(ip->i_mount, len), 0, &ac); 1882 if (error < 0) 1883 return vmf_fs_error(error); 1884 ret = __xfs_write_fault(vmf, order, &ac); 1885 xfs_zoned_space_unreserve(ip->i_mount, &ac); 1886 return ret; 1887 } 1888 1889 static vm_fault_t 1890 xfs_write_fault( 1891 struct vm_fault *vmf, 1892 unsigned int order) 1893 { 1894 if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file)))) 1895 return xfs_write_fault_zoned(vmf, order); 1896 return __xfs_write_fault(vmf, order, NULL); 1897 } 1898 1899 static inline bool 1900 xfs_is_write_fault( 1901 struct vm_fault *vmf) 1902 { 1903 return (vmf->flags & FAULT_FLAG_WRITE) && 1904 (vmf->vma->vm_flags & VM_SHARED); 1905 } 1906 1907 static vm_fault_t 1908 xfs_filemap_fault( 1909 struct vm_fault *vmf) 1910 { 1911 struct inode *inode = file_inode(vmf->vma->vm_file); 1912 1913 /* DAX can shortcut the normal fault path on write faults! */ 1914 if (IS_DAX(inode)) { 1915 if (xfs_is_write_fault(vmf)) 1916 return xfs_write_fault(vmf, 0); 1917 return xfs_dax_read_fault(vmf, 0); 1918 } 1919 1920 trace_xfs_read_fault(XFS_I(inode), 0); 1921 return filemap_fault(vmf); 1922 } 1923 1924 static vm_fault_t 1925 xfs_filemap_huge_fault( 1926 struct vm_fault *vmf, 1927 unsigned int order) 1928 { 1929 if (!IS_DAX(file_inode(vmf->vma->vm_file))) 1930 return VM_FAULT_FALLBACK; 1931 1932 /* DAX can shortcut the normal fault path on write faults! */ 1933 if (xfs_is_write_fault(vmf)) 1934 return xfs_write_fault(vmf, order); 1935 return xfs_dax_read_fault(vmf, order); 1936 } 1937 1938 static vm_fault_t 1939 xfs_filemap_page_mkwrite( 1940 struct vm_fault *vmf) 1941 { 1942 return xfs_write_fault(vmf, 0); 1943 } 1944 1945 /* 1946 * pfn_mkwrite was originally intended to ensure we capture time stamp updates 1947 * on write faults. In reality, it needs to serialise against truncate and 1948 * prepare memory for writing so handle is as standard write fault. 1949 */ 1950 static vm_fault_t 1951 xfs_filemap_pfn_mkwrite( 1952 struct vm_fault *vmf) 1953 { 1954 return xfs_write_fault(vmf, 0); 1955 } 1956 1957 static const struct vm_operations_struct xfs_file_vm_ops = { 1958 .fault = xfs_filemap_fault, 1959 .huge_fault = xfs_filemap_huge_fault, 1960 .map_pages = filemap_map_pages, 1961 .page_mkwrite = xfs_filemap_page_mkwrite, 1962 .pfn_mkwrite = xfs_filemap_pfn_mkwrite, 1963 }; 1964 1965 STATIC int 1966 xfs_file_mmap_prepare( 1967 struct vm_area_desc *desc) 1968 { 1969 struct file *file = desc->file; 1970 struct inode *inode = file_inode(file); 1971 struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode)); 1972 1973 /* 1974 * We don't support synchronous mappings for non-DAX files and 1975 * for DAX files if underneath dax_device is not synchronous. 1976 */ 1977 if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), 1978 target->bt_daxdev)) 1979 return -EOPNOTSUPP; 1980 1981 file_accessed(file); 1982 desc->vm_ops = &xfs_file_vm_ops; 1983 if (IS_DAX(inode)) 1984 desc->vm_flags |= VM_HUGEPAGE; 1985 return 0; 1986 } 1987 1988 const struct file_operations xfs_file_operations = { 1989 .llseek = xfs_file_llseek, 1990 .read_iter = xfs_file_read_iter, 1991 .write_iter = xfs_file_write_iter, 1992 .splice_read = xfs_file_splice_read, 1993 .splice_write = iter_file_splice_write, 1994 .iopoll = iocb_bio_iopoll, 1995 .unlocked_ioctl = xfs_file_ioctl, 1996 #ifdef CONFIG_COMPAT 1997 .compat_ioctl = xfs_file_compat_ioctl, 1998 #endif 1999 .mmap_prepare = xfs_file_mmap_prepare, 2000 .open = xfs_file_open, 2001 .release = xfs_file_release, 2002 .fsync = xfs_file_fsync, 2003 .get_unmapped_area = thp_get_unmapped_area, 2004 .fallocate = xfs_file_fallocate, 2005 .fadvise = xfs_file_fadvise, 2006 .remap_file_range = xfs_file_remap_range, 2007 .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | 2008 FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE | 2009 FOP_DONTCACHE, 2010 }; 2011 2012 const struct file_operations xfs_dir_file_operations = { 2013 .open = xfs_dir_open, 2014 .read = generic_read_dir, 2015 .iterate_shared = xfs_file_readdir, 2016 .llseek = generic_file_llseek, 2017 .unlocked_ioctl = xfs_file_ioctl, 2018 #ifdef CONFIG_COMPAT 2019 .compat_ioctl = xfs_file_compat_ioctl, 2020 #endif 2021 .fsync = xfs_dir_fsync, 2022 }; 2023