// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/file.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/file.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 fs regular file handling primitives
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/dax.h>
#include <linux/filelock.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

/*
 * Returns %true if the given DIO request should be attempted with DIO, or
 * %false if it should fall back to buffered I/O.
 *
 * DIO isn't well specified; when it's unsupported (either due to the request
 * being misaligned, or due to the file not supporting DIO at all), filesystems
 * either fall back to buffered I/O or return EINVAL.  For files that don't use
 * any special features like encryption or verity, ext4 has traditionally
 * returned EINVAL for misaligned DIO.  iomap_dio_rw() uses this convention too.
 * In this case, we should attempt the DIO, *not* fall back to buffered I/O.
 *
 * In contrast, in cases where DIO is unsupported due to ext4 features, ext4
 * traditionally falls back to buffered I/O.
 *
 * This function implements the traditional ext4 behavior in all these cases.
 */
static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	u32 dio_align = ext4_dio_alignment(inode);

	if (dio_align == 0)
		return false;

	if (dio_align == 1)
		return true;

	return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
}

static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}

	if (!ext4_should_use_dio(iocb, to)) {
		inode_unlock_shared(inode);
		/*
		 * Fallback to buffered I/O if the operation being performed on
		 * the inode is not supported by direct I/O. The IOCB_DIRECT
		 * flag needs to be cleared here in order to ensure that the
		 * direct I/O path within generic_file_read_iter() is not
		 * taken.
		 */
		iocb->ki_flags &= ~IOCB_DIRECT;
		return generic_file_read_iter(iocb, to);
	}

	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}

#ifdef CONFIG_FS_DAX
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}
	/*
	 * Recheck under inode lock - at this point we are sure it cannot
	 * change anymore
	 */
	if (!IS_DAX(inode)) {
		inode_unlock_shared(inode);
		/* Fallback to buffered IO in case we cannot support DAX */
		return generic_file_read_iter(iocb, to);
	}
	ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}
#endif

static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
		return -EIO;

	if (!iov_iter_count(to))
		return 0; /* skip atime */

#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
		return ext4_dax_read_iter(iocb, to);
#endif
	if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_read_iter(iocb, to);

	return generic_file_read_iter(iocb, to);
}

static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos,
				     struct pipe_inode_info *pipe,
				     size_t len, unsigned int flags)
{
	struct inode *inode = file_inode(in);

	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
		return -EIO;
	return filemap_splice_read(in, ppos, pipe, len, flags);
}

/*
 * Called when an inode is released. Note that this is different
 * from ext4_file_open: open gets called at every open, but release
 * gets called only when /all/ the files are closed.
 */
static int ext4_release_file(struct inode *inode, struct file *filp)
{
	if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
		ext4_alloc_da_blocks(inode);
		ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
	}
	/* if we are the last writer on the inode, drop the block reservation */
	if ((filp->f_mode & FMODE_WRITE) &&
	    (atomic_read(&inode->i_writecount) == 1) &&
	    !EXT4_I(inode)->i_reserved_data_blocks) {
		down_write(&EXT4_I(inode)->i_data_sem);
		ext4_discard_preallocations(inode);
		up_write(&EXT4_I(inode)->i_data_sem);
	}
	if (is_dx(inode) && filp->private_data)
		ext4_htree_free_dir_info(filp->private_data);

	return 0;
}

/*
 * This tests whether the IO in question is block-aligned or not.
 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
 * are converted to written only after the IO is complete.  Until they are
 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
 * it needs to zero out portions of the start and/or end block.  If 2 AIO
 * threads are at work on the same unwritten block, they must be synchronized
 * or one thread will zero the other's data, causing corruption.
 */
static bool
ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
{
	struct super_block *sb = inode->i_sb;
	unsigned long blockmask = sb->s_blocksize - 1;

	if ((pos | iov_iter_alignment(from)) & blockmask)
		return true;

	return false;
}

/* Does the IO extend beyond i_size or the on-disk size (i_disksize)? */
static bool
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
{
	if (offset + len > i_size_read(inode) ||
	    offset + len > EXT4_I(inode)->i_disksize)
		return true;
	return false;
}

/* Is IO overwriting allocated or initialized blocks? */
static bool ext4_overwrite_io(struct inode *inode,
			      loff_t pos, loff_t len, bool *unwritten)
{
	struct ext4_map_blocks map;
	unsigned int blkbits = inode->i_blkbits;
	int err, blklen;

	if (pos + len > i_size_read(inode))
		return false;

	map.m_lblk = pos >> blkbits;
	map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
	blklen = map.m_len;

	err = ext4_map_blocks(NULL, inode, &map, 0);
	if (err != blklen)
		return false;
	/*
	 * 'err == blklen' means that all of the blocks have been preallocated,
	 * regardless of whether they have been initialized or not. We need to
	 * check m_flags to distinguish the unwritten extents.
	 */
	*unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
	return true;
}

static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
					 struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		return ret;

	/*
	 * If we have encountered a bitmap-format file, the size limit
	 * is smaller than s_maxbytes, which is for extent-mapped files.
	 */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

		if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
			return -EFBIG;
		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
	}

	return iov_iter_count(from);
}

static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret, count;

	count = ext4_generic_write_checks(iocb, from);
	if (count <= 0)
		return count;

	ret = file_modified(iocb->ki_filp);
	if (ret)
		return ret;
	return count;
}

static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
					struct iov_iter *from)
{
	ssize_t ret;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EOPNOTSUPP;

	inode_lock(inode);
	ret = ext4_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	ret = generic_perform_write(iocb, from);

out:
	inode_unlock(inode);
	if (unlikely(ret <= 0))
		return ret;
	return generic_write_sync(iocb, ret);
}

static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
					   ssize_t written, ssize_t count)
{
	handle_t *handle;

	lockdep_assert_held_write(&inode->i_rwsem);
	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (ext4_update_inode_size(inode, offset + written)) {
		int ret = ext4_mark_inode_dirty(handle, inode);

		if (unlikely(ret)) {
			ext4_journal_stop(handle);
			return ret;
		}
	}

	if ((written == count) && inode->i_nlink)
		ext4_orphan_del(handle, inode);
	ext4_journal_stop(handle);

	return written;
}

/*
 * Clean up the inode after DIO or DAX extending write has completed and the
 * inode size has been updated using ext4_handle_inode_extension().
 */
static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc)
{
	lockdep_assert_held_write(&inode->i_rwsem);
	if (need_trunc) {
		ext4_truncate_failed_write(inode);
		/*
		 * If the truncate operation failed early, then the inode may
		 * still be on the orphan list. In that case, we need to try
		 * to remove the inode from the in-memory linked list.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
		return;
	}
	/*
	 * If i_disksize got extended either due to writeback of delalloc
	 * blocks or extending truncate while the DIO was running we could fail
	 * to cleanup the orphan list in ext4_handle_inode_extension(). Do it
	 * now.
	 */
	if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) {
		handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

		if (IS_ERR(handle)) {
			/*
			 * The write has successfully completed. Not much to
			 * do with the error here so just cleanup the orphan
			 * list and hope for the best.
			 */
			ext4_orphan_del(NULL, inode);
			return;
		}
		ext4_orphan_del(handle, inode);
		ext4_journal_stop(handle);
	}
}

static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
				 int error, unsigned int flags)
{
	loff_t pos = iocb->ki_pos;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) &&
	    (iocb->ki_flags & IOCB_ATOMIC))
		error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos,
							      size);
	else if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
		error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
	if (error)
		return error;
	/*
	 * Note that EXT4_I(inode)->i_disksize can get extended up to
	 * inode->i_size while the I/O was running due to writeback of delalloc
	 * blocks. But the code in ext4_iomap_alloc() is careful to use
	 * zeroed/unwritten extents if this is possible; thus we won't leave
	 * uninitialized blocks in a file even if we didn't succeed in writing
	 * as much as we intended. Also we can race with truncate or write
	 * expanding the file so we have to be a bit careful here.
	 */
	if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) &&
	    pos + size <= i_size_read(inode))
		return 0;
	error = ext4_handle_inode_extension(inode, pos, size, size);
	return error < 0 ? error : 0;
}

static const struct iomap_dio_ops ext4_dio_write_ops = {
	.end_io = ext4_dio_write_end_io,
};

/*
 * The intention here is to start with shared lock acquired then see if any
 * condition requires an exclusive inode lock. If yes, then we restart the
 * whole operation by releasing the shared lock and acquiring exclusive lock.
 *
 * - For unaligned IO we never take the shared lock as it may cause data
 *   corruption when two unaligned IOs try to modify the same block, e.g.
 *   while zeroing.
 *
 * - For extending writes we don't take the shared lock, since they require
 *   updating inode i_disksize and/or orphan handling with exclusive lock.
 *
 * - Shared locking is mostly used for overwrites, of both initialized and
 *   unwritten blocks.
 *
 * - Otherwise we switch to the exclusive i_rwsem lock.
 */
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
				     bool *ilock_shared, bool *extend,
				     int *dio_flags)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	loff_t offset;
	size_t count;
	ssize_t ret;
	bool overwrite, unaligned_io, unwritten;

restart:
	ret = ext4_generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	offset = iocb->ki_pos;
	count = ret;

	unaligned_io = ext4_unaligned_io(inode, from, offset);
	*extend = ext4_extending_io(inode, offset, count);
	overwrite = ext4_overwrite_io(inode, offset, count, &unwritten);

	/*
	 * Determine whether we need to upgrade to an exclusive lock. This is
	 * required to change security info in file_modified(), for extending
	 * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten
	 * extents (as partial block zeroing may be required).
	 *
	 * Note that unaligned writes are allowed under shared lock so long as
	 * they are pure overwrites. Otherwise, concurrent unaligned writes risk
	 * data corruption due to partial block zeroing in the dio layer, and so
	 * the I/O must occur exclusively.
	 */
	if (*ilock_shared &&
	    ((!IS_NOSEC(inode) || *extend || !overwrite ||
	     (unaligned_io && unwritten)))) {
		if (iocb->ki_flags & IOCB_NOWAIT) {
			ret = -EAGAIN;
			goto out;
		}
		inode_unlock_shared(inode);
		*ilock_shared = false;
		inode_lock(inode);
		goto restart;
	}

	/*
	 * Now that locking is settled, determine dio flags and exclusivity
	 * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce
	 * behavior already. The inode lock is already held exclusive if the
	 * write is non-overwrite or extending, so drain all outstanding dio
	 * and set the force wait dio flag.
	 */
	if (!*ilock_shared && (unaligned_io || *extend)) {
		if (iocb->ki_flags & IOCB_NOWAIT) {
			ret = -EAGAIN;
			goto out;
		}
		if (unaligned_io && (!overwrite || unwritten))
			inode_dio_wait(inode);
		*dio_flags = IOMAP_DIO_FORCE_WAIT;
	}

	ret = file_modified(file);
	if (ret < 0)
		goto out;

	return count;
out:
	if (*ilock_shared)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);
	return ret;
}

static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret;
	handle_t *handle;
	struct inode *inode = file_inode(iocb->ki_filp);
	loff_t offset = iocb->ki_pos;
	size_t count = iov_iter_count(from);
	bool extend = false;
	bool ilock_shared = true;
	int dio_flags = 0;

	/*
	 * Quick check here without any i_rwsem lock to see if it is extending
	 * IO. A more reliable check is done in ext4_dio_write_checks() with
	 * proper locking in place.
	 */
	if (offset + count > i_size_read(inode))
		ilock_shared = false;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (ilock_shared) {
			if (!inode_trylock_shared(inode))
				return -EAGAIN;
		} else {
			if (!inode_trylock(inode))
				return -EAGAIN;
		}
	} else {
		if (ilock_shared)
			inode_lock_shared(inode);
		else
			inode_lock(inode);
	}

	/* Fallback to buffered I/O if the inode does not support direct I/O. */
	if (!ext4_should_use_dio(iocb, from)) {
		if (ilock_shared)
			inode_unlock_shared(inode);
		else
			inode_unlock(inode);
		return ext4_buffered_write_iter(iocb, from);
	}

	/*
	 * Prevent inline data from being created since we are going to
	 * allocate blocks for DIO. We know the inode does not currently have
	 * inline data because ext4_should_use_dio() checked for it, but we
	 * have to clear the state flag before the write checks because a lock
	 * cycle could introduce races with other writers.
	 */
	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);

	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend,
				    &dio_flags);
	if (ret <= 0)
		return ret;

	offset = iocb->ki_pos;
	count = ret;

	if (extend) {
		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}

		ret = ext4_orphan_add(handle, inode);
		ext4_journal_stop(handle);
		if (ret)
			goto out;
	}

	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
			   dio_flags, NULL, 0);
	if (ret == -ENOTBLK)
		ret = 0;
	if (extend) {
		/*
		 * We always perform extending DIO write synchronously so by
		 * now the IO is completed and ext4_handle_inode_extension()
		 * was called. Cleanup the inode in case of error or race with
		 * writeback of delalloc blocks.
		 */
		WARN_ON_ONCE(ret == -EIOCBQUEUED);
		ext4_inode_extension_cleanup(inode, ret < 0);
	}

out:
	if (ilock_shared)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);

	if (ret >= 0 && iov_iter_count(from)) {
		ssize_t err;
		loff_t endbyte;

		/*
		 * There is no support for atomic writes on buffered I/O yet;
		 * we should never fall back to buffered I/O for DIO atomic
		 * writes.
		 */
		WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC);

		offset = iocb->ki_pos;
		err = ext4_buffered_write_iter(iocb, from);
		if (err < 0)
			return err;

		/*
		 * We need to ensure that the pages within the page cache for
		 * the range covered by this I/O are written to disk and
		 * invalidated. This is an attempt to preserve the expected
		 * direct I/O semantics in the case we fall back to buffered
		 * I/O to complete the I/O request.
		 */
		ret += err;
		endbyte = offset + err - 1;
		err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
						   offset, endbyte);
		if (!err)
			invalidate_mapping_pages(iocb->ki_filp->f_mapping,
						 offset >> PAGE_SHIFT,
						 endbyte >> PAGE_SHIFT);
	}

	return ret;
}

#ifdef CONFIG_FS_DAX
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret;
	size_t count;
	loff_t offset;
	handle_t *handle;
	bool extend = false;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	ret = ext4_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	offset = iocb->ki_pos;
	count = iov_iter_count(from);

	if (offset + count > EXT4_I(inode)->i_disksize) {
		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}

		ret = ext4_orphan_add(handle, inode);
		if (ret) {
			ext4_journal_stop(handle);
			goto out;
		}

		extend = true;
		ext4_journal_stop(handle);
	}

	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);

	if (extend) {
		ret = ext4_handle_inode_extension(inode, offset, ret, count);
		ext4_inode_extension_cleanup(inode, ret < (ssize_t)count);
	}
out:
	inode_unlock(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
#endif

static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	int ret;
	struct inode *inode = file_inode(iocb->ki_filp);

	ret = ext4_emergency_state(inode->i_sb);
	if (unlikely(ret))
		return ret;

#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
		return ext4_dax_write_iter(iocb, from);
#endif

	if (iocb->ki_flags & IOCB_ATOMIC) {
		size_t len = iov_iter_count(from);

		if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
		    len > EXT4_SB(inode->i_sb)->s_awu_max)
			return -EINVAL;

		ret = generic_atomic_write_valid(iocb, from);
		if (ret)
			return ret;
	}

	if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_write_iter(iocb, from);
	else
		return ext4_buffered_write_iter(iocb, from);
}

#ifdef CONFIG_FS_DAX
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	int error = 0;
	vm_fault_t result;
	int retries = 0;
	handle_t *handle = NULL;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct super_block *sb = inode->i_sb;

	/*
	 * We have to distinguish real writes from writes which will result in a
	 * COW page; COW writes should *not* poke the journal (the file will not
	 * be changed). Doing so would cause unintended failures when mounted
	 * read-only.
	 *
	 * We check for VM_SHARED rather than vmf->cow_page since the latter is
	 * unset for order != 0 (i.e. only in do_cow_fault); for
	 * other sizes, dax_iomap_fault will handle splitting / fallback so that
	 * we eventually come back with a COW page.
	 */
	bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
		(vmf->vma->vm_flags & VM_SHARED);
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	unsigned long pfn;

	if (write) {
		sb_start_pagefault(sb);
		file_update_time(vmf->vma->vm_file);
		filemap_invalidate_lock_shared(mapping);
retry:
		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
					       EXT4_DATA_TRANS_BLOCKS(sb));
		if (IS_ERR(handle)) {
			filemap_invalidate_unlock_shared(mapping);
			sb_end_pagefault(sb);
			return VM_FAULT_SIGBUS;
		}
	} else {
		filemap_invalidate_lock_shared(mapping);
	}
	result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);
	if (write) {
		ext4_journal_stop(handle);

		if ((result & VM_FAULT_ERROR) && error == -ENOSPC &&
		    ext4_should_retry_alloc(sb, &retries))
			goto retry;
		/* Handling synchronous page fault? */
		if (result & VM_FAULT_NEEDDSYNC)
			result = dax_finish_sync_fault(vmf, order, pfn);
		filemap_invalidate_unlock_shared(mapping);
		sb_end_pagefault(sb);
	} else {
		filemap_invalidate_unlock_shared(mapping);
	}

	return result;
}

static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
	return ext4_dax_huge_fault(vmf, 0);
}

static const struct vm_operations_struct ext4_dax_vm_ops = {
	.fault		= ext4_dax_fault,
	.huge_fault	= ext4_dax_huge_fault,
	.page_mkwrite	= ext4_dax_fault,
	.pfn_mkwrite	= ext4_dax_fault,
};
#else
#define ext4_dax_vm_ops	ext4_file_vm_ops
#endif

static const struct vm_operations_struct ext4_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= ext4_page_mkwrite,
};

static int ext4_file_mmap_prepare(struct vm_area_desc *desc)
{
	int ret;
	struct file *file = desc->file;
	struct inode *inode = file->f_mapping->host;
	struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;

	if (file->f_mode & FMODE_WRITE)
		ret = ext4_emergency_state(inode->i_sb);
	else
		ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0;
	if (unlikely(ret))
		return ret;

	/*
	 * We don't support synchronous mappings for non-DAX files and
	 * for DAX files if the underlying dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev))
		return -EOPNOTSUPP;

	file_accessed(file);
	if (IS_DAX(file_inode(file))) {
		desc->vm_ops = &ext4_dax_vm_ops;
		desc->vm_flags |= VM_HUGEPAGE;
	} else {
		desc->vm_ops = &ext4_file_vm_ops;
	}
	return 0;
}

static int ext4_sample_last_mounted(struct super_block *sb,
				    struct vfsmount *mnt)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct path path;
	char buf[64], *cp;
	handle_t *handle;
	int err;

	if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
		return 0;

	if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
	    !sb_start_intwrite_trylock(sb))
		return 0;

	ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
	/*
	 * Sample where the filesystem has been mounted and
	 * store it in the superblock for sysadmin convenience
	 * when trying to sort through large numbers of block
	 * devices or filesystem images.
	 */
	path.mnt = mnt;
	path.dentry = mnt->mnt_root;
	cp = d_path(&path, buf, sizeof(buf));
	err = 0;
	if (IS_ERR(cp))
		goto out;

	handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
	err = PTR_ERR(handle);
	if (IS_ERR(handle))
		goto out;
	BUFFER_TRACE(sbi->s_sbh, "get_write_access");
	err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
					    EXT4_JTR_NONE);
	if (err)
		goto out_journal;
	lock_buffer(sbi->s_sbh);
	strtomem_pad(sbi->s_es->s_last_mounted, cp, 0);
	ext4_superblock_csum_set(sb);
	unlock_buffer(sbi->s_sbh);
	ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
out_journal:
	ext4_journal_stop(handle);
out:
	sb_end_intwrite(sb);
	return err;
}

static int ext4_file_open(struct inode *inode, struct file *filp)
{
	int ret;

	if (filp->f_mode & FMODE_WRITE)
		ret = ext4_emergency_state(inode->i_sb);
	else
		ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0;
	if (unlikely(ret))
		return ret;

	ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
	if (ret)
		return ret;

	ret = fscrypt_file_open(inode, filp);
	if (ret)
		return ret;

	ret = fsverity_file_open(inode, filp);
	if (ret)
		return ret;

	/*
	 * Set up the jbd2_inode if we are opening the inode for
	 * writing and the journal is present
	 */
	if (filp->f_mode & FMODE_WRITE) {
		ret = ext4_inode_attach_jinode(inode);
		if (ret < 0)
			return ret;
	}

	if (ext4_inode_can_atomic_write(inode))
		filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;

	filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
	return dquot_file_open(inode, filp);
}

/*
 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
 * by calling generic_file_llseek_size() with the appropriate maxbytes
 * value for each.
 */
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	loff_t maxbytes = ext4_get_maxbytes(inode);

	switch (whence) {
	default:
		return generic_file_llseek_size(file, offset, whence,
						maxbytes, i_size_read(inode));
	case SEEK_HOLE:
		inode_lock_shared(inode);
		offset = iomap_seek_hole(inode, offset,
					 &ext4_iomap_report_ops);
		inode_unlock_shared(inode);
		break;
	case SEEK_DATA:
		inode_lock_shared(inode);
		offset = iomap_seek_data(inode, offset,
					 &ext4_iomap_report_ops);
		inode_unlock_shared(inode);
		break;
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, maxbytes);
}

const struct file_operations ext4_file_operations = {
	.llseek		= ext4_llseek,
	.read_iter	= ext4_file_read_iter,
	.write_iter	= ext4_file_write_iter,
	.iopoll		= iocb_bio_iopoll,
	.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext4_compat_ioctl,
#endif
	.mmap_prepare	= ext4_file_mmap_prepare,
	.open		= ext4_file_open,
	.release	= ext4_release_file,
	.fsync		= ext4_sync_file,
	.get_unmapped_area = thp_get_unmapped_area,
	.splice_read	= ext4_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= ext4_fallocate,
	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
			  FOP_DIO_PARALLEL_WRITE | FOP_DONTCACHE,
	.setlease	= generic_setlease,
};

const struct inode_operations ext4_file_inode_operations = {
	.setattr	= ext4_setattr,
	.getattr	= ext4_file_getattr,
	.listxattr	= ext4_listxattr,
	.get_inode_acl	= ext4_get_acl,
	.set_acl	= ext4_set_acl,
	.fiemap		= ext4_fiemap,
	.fileattr_get	= ext4_fileattr_get,
	.fileattr_set	= ext4_fileattr_set,
};