// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/file.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 * from
 *
 * linux/fs/minix/file.c
 *
 * Copyright (C) 1991, 1992  Linus Torvalds
 *
 * ext4 fs regular file handling primitives
 *
 * 64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/dax.h>
#include <linux/filelock.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

/*
 * Returns %true if the given DIO request should be attempted with DIO, or
 * %false if it should fall back to buffered I/O.
 *
 * DIO isn't well specified; when it's unsupported (either due to the request
 * being misaligned, or due to the file not supporting DIO at all), filesystems
 * either fall back to buffered I/O or return EINVAL.  For files that don't use
 * any special features like encryption or verity, ext4 has traditionally
 * returned EINVAL for misaligned DIO.  iomap_dio_rw() uses this convention too.
 * In this case, we should attempt the DIO, *not* fall back to buffered I/O.
 *
 * In contrast, in cases where DIO is unsupported due to ext4 features, ext4
 * traditionally falls back to buffered I/O.
 *
 * This function implements the traditional ext4 behavior in all these cases.
 */
static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	u32 dio_align = ext4_dio_alignment(inode);

	if (dio_align == 0)
		return false;

	if (dio_align == 1)
		return true;

	return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
}

static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}

	if (!ext4_should_use_dio(iocb, to)) {
		inode_unlock_shared(inode);
		/*
		 * Fallback to buffered I/O if the operation being performed on
		 * the inode is not supported by direct I/O. The IOCB_DIRECT
		 * flag needs to be cleared here in order to ensure that the
		 * direct I/O path within generic_file_read_iter() is not
		 * taken.
		 */
		iocb->ki_flags &= ~IOCB_DIRECT;
		return generic_file_read_iter(iocb, to);
	}

	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}
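
/*
 * Illustrative example of the fallback logic above (editorial note; values
 * are typical rather than guaranteed): on an ordinary file
 * ext4_dio_alignment() reports 1, so even a misaligned O_DIRECT read is
 * handed to iomap_dio_rw(), which returns -EINVAL per the convention
 * documented above ext4_should_use_dio(). When a feature such as fs-verity
 * or data journalling makes DIO unsupported, the reported alignment is 0
 * and the request silently falls back to buffered I/O with IOCB_DIRECT
 * cleared.
 */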

#ifdef CONFIG_FS_DAX
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}
	/*
	 * Recheck under inode lock - at this point we are sure it cannot
	 * change anymore
	 */
	if (!IS_DAX(inode)) {
		inode_unlock_shared(inode);
		/* Fallback to buffered IO in case we cannot support DAX */
		return generic_file_read_iter(iocb, to);
	}
	ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}
#endif

static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
		return -EIO;

	if (!iov_iter_count(to))
		return 0; /* skip atime */

#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
		return ext4_dax_read_iter(iocb, to);
#endif
	if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_read_iter(iocb, to);

	return generic_file_read_iter(iocb, to);
}

static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos,
				     struct pipe_inode_info *pipe,
				     size_t len, unsigned int flags)
{
	struct inode *inode = file_inode(in);

	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
		return -EIO;
	return filemap_splice_read(in, ppos, pipe, len, flags);
}

/*
 * Called when an inode is released. Note that this is different
 * from ext4_file_open: open gets called at every open, but release
 * gets called only when /all/ the files are closed.
 */
static int ext4_release_file(struct inode *inode, struct file *filp)
{
	if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
		ext4_alloc_da_blocks(inode);
		ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
	}
	/* if we are the last writer on the inode, drop the block reservation */
	if ((filp->f_mode & FMODE_WRITE) &&
	    (atomic_read(&inode->i_writecount) == 1) &&
	    !EXT4_I(inode)->i_reserved_data_blocks) {
		down_write(&EXT4_I(inode)->i_data_sem);
		ext4_discard_preallocations(inode);
		up_write(&EXT4_I(inode)->i_data_sem);
	}
	if (is_dx(inode) && filp->private_data)
		ext4_htree_free_dir_info(filp->private_data);

	return 0;
}

/*
 * This tests whether the IO in question is block-aligned or not.
 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
 * are converted to written only after the IO is complete.  Until they are
 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
 * it needs to zero out portions of the start and/or end block.  If 2 AIO
 * threads are at work on the same unwritten block, they must be synchronized
 * or one thread will zero the other's data, causing corruption.
 */
static bool
ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
{
	struct super_block *sb = inode->i_sb;
	unsigned long blockmask = sb->s_blocksize - 1;

	if ((pos | iov_iter_alignment(from)) & blockmask)
		return true;

	return false;
}
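
/*
 * A concrete example of the check above (illustrative): with a 4096-byte
 * block size, blockmask is 0xfff, so a direct write of 8192 bytes at offset
 * 4096 from a block-aligned buffer is considered aligned, while the same
 * write starting at offset 2048 is not and must follow the exclusive-locking
 * rules applied in ext4_dio_write_checks().
 */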

static bool
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
{
	if (offset + len > i_size_read(inode) ||
	    offset + len > EXT4_I(inode)->i_disksize)
		return true;
	return false;
}

/* Is IO overwriting allocated or initialized blocks? */
static bool ext4_overwrite_io(struct inode *inode,
			      loff_t pos, loff_t len, bool *unwritten)
{
	struct ext4_map_blocks map;
	unsigned int blkbits = inode->i_blkbits;
	int err, blklen;

	if (pos + len > i_size_read(inode))
		return false;

	map.m_lblk = pos >> blkbits;
	map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
	blklen = map.m_len;

	err = ext4_map_blocks(NULL, inode, &map, 0);
	if (err != blklen)
		return false;
	/*
	 * 'err == blklen' means that all of the blocks have been preallocated,
	 * regardless of whether they have been initialized or not. We need to
	 * check m_flags to distinguish the unwritten extents.
	 */
	*unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
	return true;
}

static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
					 struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		return ret;

	/*
	 * If we have encountered a bitmap-format file, the size limit
	 * is smaller than s_maxbytes, which is for extent-mapped files.
	 */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

		if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
			return -EFBIG;
		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
	}

	return iov_iter_count(from);
}
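
/*
 * Worked example for the bitmap-format limit above (illustrative): if a
 * write to an indirect-block-mapped file starts below s_bitmap_maxbytes but
 * would cross it, iov_iter_truncate() shortens the request so the caller
 * sees a short write that stops exactly at the limit; only a write that
 * starts at or beyond the limit fails outright with -EFBIG.
 */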

static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret, count;

	count = ext4_generic_write_checks(iocb, from);
	if (count <= 0)
		return count;

	ret = file_modified(iocb->ki_filp);
	if (ret)
		return ret;
	return count;
}

static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
					struct iov_iter *from)
{
	ssize_t ret;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EOPNOTSUPP;

	inode_lock(inode);
	ret = ext4_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	ret = generic_perform_write(iocb, from);

out:
	inode_unlock(inode);
	if (unlikely(ret <= 0))
		return ret;
	return generic_write_sync(iocb, ret);
}

static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
					   ssize_t written, ssize_t count)
{
	handle_t *handle;

	lockdep_assert_held_write(&inode->i_rwsem);
	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (ext4_update_inode_size(inode, offset + written)) {
		int ret = ext4_mark_inode_dirty(handle, inode);

		if (unlikely(ret)) {
			ext4_journal_stop(handle);
			return ret;
		}
	}

	if ((written == count) && inode->i_nlink)
		ext4_orphan_del(handle, inode);
	ext4_journal_stop(handle);

	return written;
}

/*
 * Clean up the inode after DIO or DAX extending write has completed and the
 * inode size has been updated using ext4_handle_inode_extension().
 */
static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc)
{
	lockdep_assert_held_write(&inode->i_rwsem);
	if (need_trunc) {
		ext4_truncate_failed_write(inode);
		/*
		 * If the truncate operation failed early, then the inode may
		 * still be on the orphan list. In that case, we need to try to
		 * remove the inode from the in-memory linked list.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
		return;
	}
	/*
	 * If i_disksize got extended either due to writeback of delalloc
	 * blocks or extending truncate while the DIO was running, we could
	 * fail to clean up the orphan list in ext4_handle_inode_extension().
	 * Do it now.
	 */
	if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) {
		handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

		if (IS_ERR(handle)) {
			/*
			 * The write has successfully completed. Not much to
			 * do with the error here so just clean up the orphan
			 * list and hope for the best.
			 */
			ext4_orphan_del(NULL, inode);
			return;
		}
		ext4_orphan_del(handle, inode);
		ext4_journal_stop(handle);
	}
}
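
/*
 * Roughly how the two helpers above are used by the write paths below: before
 * an extending DIO or DAX write, the inode is put on the orphan list so that,
 * should we crash mid-write, recovery can trim any blocks allocated beyond
 * i_size. On completion ext4_handle_inode_extension() updates the on-disk
 * size and drops the orphan entry, and ext4_inode_extension_cleanup() handles
 * the error and race cases the completion path could not.
 */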

static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
				 int error, unsigned int flags)
{
	loff_t pos = iocb->ki_pos;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) &&
	    (iocb->ki_flags & IOCB_ATOMIC))
		error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos,
							      size);
	else if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
		error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
	if (error)
		return error;
	/*
	 * Note that EXT4_I(inode)->i_disksize can get extended up to
	 * inode->i_size while the I/O was running due to writeback of delalloc
	 * blocks. But the code in ext4_iomap_alloc() is careful to use
	 * zeroed/unwritten extents if this is possible; thus we won't leave
	 * uninitialized blocks in a file even if we didn't succeed in writing
	 * as much as we intended. Also we can race with truncate or write
	 * expanding the file so we have to be a bit careful here.
	 */
	if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) &&
	    pos + size <= i_size_read(inode))
		return 0;
	error = ext4_handle_inode_extension(inode, pos, size, size);
	return error < 0 ? error : 0;
}

static const struct iomap_dio_ops ext4_dio_write_ops = {
	.end_io = ext4_dio_write_end_io,
};

/*
 * The intention here is to start with a shared lock acquired, then see if any
 * condition requires an exclusive inode lock. If yes, then we restart the
 * whole operation by releasing the shared lock and acquiring the exclusive
 * lock.
 *
 * - For unaligned I/O we never take the shared lock, as it may cause data
 *   corruption when two unaligned I/Os try to modify the same block, e.g.
 *   while zeroing.
 *
 * - For extending writes we don't take the shared lock either, since they
 *   require updating the inode's i_disksize and/or orphan handling under the
 *   exclusive lock.
 *
 * - Shared locking is used mostly for overwrites, of both initialized and
 *   unwritten blocks. When overwriting unwritten blocks we protect extent
 *   splitting with i_data_sem in ext4_inode_info, so we can also release the
 *   exclusive i_rwsem lock.
 *
 * - Otherwise we switch to the exclusive i_rwsem lock.
 */
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
				     bool *ilock_shared, bool *extend,
				     bool *unwritten, int *dio_flags)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	loff_t offset;
	size_t count;
	ssize_t ret;
	bool overwrite, unaligned_io;

restart:
	ret = ext4_generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	offset = iocb->ki_pos;
	count = ret;

	unaligned_io = ext4_unaligned_io(inode, from, offset);
	*extend = ext4_extending_io(inode, offset, count);
	overwrite = ext4_overwrite_io(inode, offset, count, unwritten);

	/*
	 * Determine whether we need to upgrade to an exclusive lock. This is
	 * required to change security info in file_modified(), for extending
	 * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten
	 * extents (as partial block zeroing may be required).
	 *
	 * Note that unaligned writes are allowed under shared lock so long as
	 * they are pure overwrites. Otherwise, concurrent unaligned writes risk
	 * data corruption due to partial block zeroing in the dio layer, and so
	 * the I/O must occur exclusively.
	 */
	if (*ilock_shared &&
	    ((!IS_NOSEC(inode) || *extend || !overwrite ||
	     (unaligned_io && *unwritten)))) {
		if (iocb->ki_flags & IOCB_NOWAIT) {
			ret = -EAGAIN;
			goto out;
		}
		inode_unlock_shared(inode);
		*ilock_shared = false;
		inode_lock(inode);
		goto restart;
	}

	/*
	 * Now that locking is settled, determine dio flags and exclusivity
	 * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce
	 * behavior already. The inode lock is already held exclusive if the
	 * write is non-overwrite or extending, so drain all outstanding dio and
	 * set the force wait dio flag.
	 */
	if (!*ilock_shared && (unaligned_io || *extend)) {
		if (iocb->ki_flags & IOCB_NOWAIT) {
			ret = -EAGAIN;
			goto out;
		}
		if (unaligned_io && (!overwrite || *unwritten))
			inode_dio_wait(inode);
		*dio_flags = IOMAP_DIO_FORCE_WAIT;
	}

	ret = file_modified(file);
	if (ret < 0)
		goto out;

	return count;
out:
	if (*ilock_shared)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);
	return ret;
}
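
/*
 * Example of the restart path above (illustrative): a write that looked like
 * a non-extending overwrite when ext4_dio_write_iter() sampled i_size without
 * the lock may turn out to be extending once the shared lock is held (e.g.
 * after racing with a truncate). In that case *extend is set here, the shared
 * lock is dropped, the exclusive lock is taken, and the checks rerun from the
 * restart label.
 */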

static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret;
	handle_t *handle;
	struct inode *inode = file_inode(iocb->ki_filp);
	loff_t offset = iocb->ki_pos;
	size_t count = iov_iter_count(from);
	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
	bool extend = false, unwritten = false;
	bool ilock_shared = true;
	int dio_flags = 0;

	/*
	 * Quick check here without any i_rwsem lock to see if it is extending
	 * IO. A more reliable check is done in ext4_dio_write_checks() with
	 * proper locking in place.
	 */
	if (offset + count > i_size_read(inode))
		ilock_shared = false;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (ilock_shared) {
			if (!inode_trylock_shared(inode))
				return -EAGAIN;
		} else {
			if (!inode_trylock(inode))
				return -EAGAIN;
		}
	} else {
		if (ilock_shared)
			inode_lock_shared(inode);
		else
			inode_lock(inode);
	}

	/* Fallback to buffered I/O if the inode does not support direct I/O. */
	if (!ext4_should_use_dio(iocb, from)) {
		if (ilock_shared)
			inode_unlock_shared(inode);
		else
			inode_unlock(inode);
		return ext4_buffered_write_iter(iocb, from);
	}

	/*
	 * Prevent inline data from being created since we are going to allocate
	 * blocks for DIO. We know the inode does not currently have inline data
	 * because ext4_should_use_dio() checked for it, but we have to clear
	 * the state flag before the write checks because a lock cycle could
	 * introduce races with other writers.
	 */
	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);

	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend,
				    &unwritten, &dio_flags);
	if (ret <= 0)
		return ret;

	offset = iocb->ki_pos;
	count = ret;

	if (extend) {
		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}

		ret = ext4_orphan_add(handle, inode);
		ext4_journal_stop(handle);
		if (ret)
			goto out;
	}

	if (ilock_shared && !unwritten)
		iomap_ops = &ext4_iomap_overwrite_ops;
	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
			   dio_flags, NULL, 0);
	if (ret == -ENOTBLK)
		ret = 0;
	if (extend) {
		/*
		 * We always perform extending DIO write synchronously so by
		 * now the IO is completed and ext4_handle_inode_extension()
		 * was called. Cleanup the inode in case of error or race with
		 * writeback of delalloc blocks.
		 */
		WARN_ON_ONCE(ret == -EIOCBQUEUED);
		ext4_inode_extension_cleanup(inode, ret < 0);
	}

out:
	if (ilock_shared)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);

	if (ret >= 0 && iov_iter_count(from)) {
		ssize_t err;
		loff_t endbyte;

		/*
		 * There is no support for atomic writes on buffered I/O yet;
		 * we should never fall back to buffered I/O for DIO atomic
		 * writes.
		 */
		WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC);

		offset = iocb->ki_pos;
		err = ext4_buffered_write_iter(iocb, from);
		if (err < 0)
			return err;

		/*
		 * We need to ensure that the pages within the page cache for
		 * the range covered by this I/O are written to disk and
		 * invalidated. This is an attempt to preserve the expected
		 * direct I/O semantics in case we fall back to buffered I/O
		 * to complete the I/O request.
		 */
		ret += err;
		endbyte = offset + err - 1;
		err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
						   offset, endbyte);
		if (!err)
			invalidate_mapping_pages(iocb->ki_filp->f_mapping,
						 offset >> PAGE_SHIFT,
						 endbyte >> PAGE_SHIFT);
	}

	return ret;
}
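
/*
 * Note on the buffered fallback above (editorial, based on iomap behaviour):
 * iomap_dio_rw() returns -ENOTBLK when it cannot safely invalidate the page
 * cache for the target range. Resetting ret to 0 lets the tail of
 * ext4_dio_write_iter() push the remaining bytes through
 * ext4_buffered_write_iter(), after which the written range is flushed and
 * dropped from the page cache to approximate the usual O_DIRECT semantics.
 */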

#ifdef CONFIG_FS_DAX
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret;
	size_t count;
	loff_t offset;
	handle_t *handle;
	bool extend = false;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	ret = ext4_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	offset = iocb->ki_pos;
	count = iov_iter_count(from);

	if (offset + count > EXT4_I(inode)->i_disksize) {
		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}

		ret = ext4_orphan_add(handle, inode);
		if (ret) {
			ext4_journal_stop(handle);
			goto out;
		}

		extend = true;
		ext4_journal_stop(handle);
	}

	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);

	if (extend) {
		ret = ext4_handle_inode_extension(inode, offset, ret, count);
		ext4_inode_extension_cleanup(inode, ret < (ssize_t)count);
	}
out:
	inode_unlock(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
#endif

static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	int ret;
	struct inode *inode = file_inode(iocb->ki_filp);

	ret = ext4_emergency_state(inode->i_sb);
	if (unlikely(ret))
		return ret;

#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
		return ext4_dax_write_iter(iocb, from);
#endif

	if (iocb->ki_flags & IOCB_ATOMIC) {
		size_t len = iov_iter_count(from);

		if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
		    len > EXT4_SB(inode->i_sb)->s_awu_max)
			return -EINVAL;

		ret = generic_atomic_write_valid(iocb, from);
		if (ret)
			return ret;
	}

	if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_write_iter(iocb, from);
	else
		return ext4_buffered_write_iter(iocb, from);
}

#ifdef CONFIG_FS_DAX
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	int error = 0;
	vm_fault_t result;
	int retries = 0;
	handle_t *handle = NULL;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct super_block *sb = inode->i_sb;

	/*
	 * We have to distinguish real writes from writes which will result in a
	 * COW page; COW writes should *not* poke the journal (the file will not
	 * be changed). Doing so would cause unintended failures when mounted
	 * read-only.
	 *
	 * We check for VM_SHARED rather than vmf->cow_page since the latter is
	 * unset for order != 0 (i.e. only in do_cow_fault); for
	 * other sizes, dax_iomap_fault will handle splitting / fallback so that
	 * we eventually come back with a COW page.
	 */
	bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
		(vmf->vma->vm_flags & VM_SHARED);
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	unsigned long pfn;

	if (write) {
		sb_start_pagefault(sb);
		file_update_time(vmf->vma->vm_file);
		filemap_invalidate_lock_shared(mapping);
retry:
		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
					       EXT4_DATA_TRANS_BLOCKS(sb));
		if (IS_ERR(handle)) {
			filemap_invalidate_unlock_shared(mapping);
			sb_end_pagefault(sb);
			return VM_FAULT_SIGBUS;
		}
	} else {
		filemap_invalidate_lock_shared(mapping);
	}
	result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);
	if (write) {
		ext4_journal_stop(handle);

		if ((result & VM_FAULT_ERROR) && error == -ENOSPC &&
		    ext4_should_retry_alloc(sb, &retries))
			goto retry;
		/* Handling synchronous page fault? */
		if (result & VM_FAULT_NEEDDSYNC)
			result = dax_finish_sync_fault(vmf, order, pfn);
		filemap_invalidate_unlock_shared(mapping);
		sb_end_pagefault(sb);
	} else {
		filemap_invalidate_unlock_shared(mapping);
	}

	return result;
}

static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
	return ext4_dax_huge_fault(vmf, 0);
}

static const struct vm_operations_struct ext4_dax_vm_ops = {
	.fault		= ext4_dax_fault,
	.huge_fault	= ext4_dax_huge_fault,
	.page_mkwrite	= ext4_dax_fault,
	.pfn_mkwrite	= ext4_dax_fault,
};
#else
#define ext4_dax_vm_ops	ext4_file_vm_ops
#endif

static const struct vm_operations_struct ext4_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= ext4_page_mkwrite,
};

static int ext4_file_mmap_prepare(struct vm_area_desc *desc)
{
	int ret;
	struct file *file = desc->file;
	struct inode *inode = file->f_mapping->host;
	struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;

	if (file->f_mode & FMODE_WRITE)
		ret = ext4_emergency_state(inode->i_sb);
	else
		ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0;
	if (unlikely(ret))
		return ret;

	/*
	 * We don't support synchronous mappings for non-DAX files and
	 * for DAX files if underneath dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev))
		return -EOPNOTSUPP;

	file_accessed(file);
	if (IS_DAX(file_inode(file))) {
		desc->vm_ops = &ext4_dax_vm_ops;
		desc->vm_flags |= VM_HUGEPAGE;
	} else {
		desc->vm_ops = &ext4_file_vm_ops;
	}
	return 0;
}

static int ext4_sample_last_mounted(struct super_block *sb,
				    struct vfsmount *mnt)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct path path;
	char buf[64], *cp;
	handle_t *handle;
	int err;

	if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
		return 0;

	if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
	    !sb_start_intwrite_trylock(sb))
		return 0;

	ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
	/*
	 * Sample where the filesystem has been mounted and
	 * store it in the superblock for sysadmin convenience
	 * when trying to sort through large numbers of block
	 * devices or filesystem images.
	 */
	memset(buf, 0, sizeof(buf));
	path.mnt = mnt;
	path.dentry = mnt->mnt_root;
	cp = d_path(&path, buf, sizeof(buf));
	err = 0;
	if (IS_ERR(cp))
		goto out;

	handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
	err = PTR_ERR(handle);
	if (IS_ERR(handle))
		goto out;
	BUFFER_TRACE(sbi->s_sbh, "get_write_access");
	err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
					    EXT4_JTR_NONE);
	if (err)
		goto out_journal;
	lock_buffer(sbi->s_sbh);
	strtomem_pad(sbi->s_es->s_last_mounted, cp, 0);
	ext4_superblock_csum_set(sb);
	unlock_buffer(sbi->s_sbh);
	ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
out_journal:
	ext4_journal_stop(handle);
out:
	sb_end_intwrite(sb);
	return err;
}

static int ext4_file_open(struct inode *inode, struct file *filp)
{
	int ret;

	if (filp->f_mode & FMODE_WRITE)
		ret = ext4_emergency_state(inode->i_sb);
	else
		ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0;
	if (unlikely(ret))
		return ret;

	ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
	if (ret)
		return ret;

	ret = fscrypt_file_open(inode, filp);
	if (ret)
		return ret;

	ret = fsverity_file_open(inode, filp);
	if (ret)
		return ret;

	/*
	 * Set up the jbd2_inode if we are opening the inode for
	 * writing and the journal is present
	 */
	if (filp->f_mode & FMODE_WRITE) {
		ret = ext4_inode_attach_jinode(inode);
		if (ret < 0)
			return ret;
	}

	if (ext4_inode_can_atomic_write(inode))
		filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;

	filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
	return dquot_file_open(inode, filp);
}

/*
 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
 * by calling generic_file_llseek_size() with the appropriate maxbytes
 * value for each.
 */
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	loff_t maxbytes = ext4_get_maxbytes(inode);

	switch (whence) {
	default:
		return generic_file_llseek_size(file, offset, whence,
						maxbytes, i_size_read(inode));
	case SEEK_HOLE:
		inode_lock_shared(inode);
		offset = iomap_seek_hole(inode, offset,
					 &ext4_iomap_report_ops);
		inode_unlock_shared(inode);
		break;
	case SEEK_DATA:
		inode_lock_shared(inode);
		offset = iomap_seek_data(inode, offset,
					 &ext4_iomap_report_ops);
		inode_unlock_shared(inode);
		break;
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, maxbytes);
}

const struct file_operations ext4_file_operations = {
	.llseek		= ext4_llseek,
	.read_iter	= ext4_file_read_iter,
	.write_iter	= ext4_file_write_iter,
	.iopoll		= iocb_bio_iopoll,
	.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext4_compat_ioctl,
#endif
	.mmap_prepare	= ext4_file_mmap_prepare,
	.open		= ext4_file_open,
	.release	= ext4_release_file,
	.fsync		= ext4_sync_file,
	.get_unmapped_area = thp_get_unmapped_area,
	.splice_read	= ext4_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= ext4_fallocate,
	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
			  FOP_DIO_PARALLEL_WRITE |
			  FOP_DONTCACHE,
	.setlease	= generic_setlease,
};

const struct inode_operations ext4_file_inode_operations = {
	.setattr	= ext4_setattr,
	.getattr	= ext4_file_getattr,
	.listxattr	= ext4_listxattr,
	.get_inode_acl	= ext4_get_acl,
	.set_acl	= ext4_set_acl,
	.fiemap		= ext4_fiemap,
	.fileattr_get	= ext4_fileattr_get,
	.fileattr_set	= ext4_fileattr_set,
};