file.c (cdc627e65c7eb8d105f0b9e9695106e54eea1a6e) | file.c (9aa29a20b70097213d10e03a452366ceea72fc02) |
---|---|
1// SPDX-License-Identifier: GPL-2.0 2/* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6#include <linux/fs.h> 7#include <linux/pagemap.h> 8#include <linux/time.h> 9#include <linux/init.h> 10#include <linux/string.h> 11#include <linux/backing-dev.h> 12#include <linux/falloc.h> 13#include <linux/writeback.h> 14#include <linux/compat.h> 15#include <linux/slab.h> 16#include <linux/btrfs.h> 17#include <linux/uio.h> 18#include <linux/iversion.h> 19#include <linux/fsverity.h> | 1// SPDX-License-Identifier: GPL-2.0 2/* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6#include <linux/fs.h> 7#include <linux/pagemap.h> 8#include <linux/time.h> 9#include <linux/init.h> 10#include <linux/string.h> 11#include <linux/backing-dev.h> 12#include <linux/falloc.h> 13#include <linux/writeback.h> 14#include <linux/compat.h> 15#include <linux/slab.h> 16#include <linux/btrfs.h> 17#include <linux/uio.h> 18#include <linux/iversion.h> 19#include <linux/fsverity.h> |
20#include <linux/iomap.h> | |
21#include "ctree.h" | 20#include "ctree.h" |
21#include "direct-io.h" |
|
22#include "disk-io.h" 23#include "transaction.h" 24#include "btrfs_inode.h" 25#include "tree-log.h" 26#include "locking.h" 27#include "qgroup.h" 28#include "compression.h" 29#include "delalloc-space.h" --- 1105 unchanged lines hidden (view full) --- 1135 ts = inode_get_ctime(inode); 1136 if (!timespec64_equal(&ts, &now)) 1137 inode_set_ctime_to_ts(inode, now); 1138 1139 if (IS_I_VERSION(inode)) 1140 inode_inc_iversion(inode); 1141} 1142 | 22#include "disk-io.h" 23#include "transaction.h" 24#include "btrfs_inode.h" 25#include "tree-log.h" 26#include "locking.h" 27#include "qgroup.h" 28#include "compression.h" 29#include "delalloc-space.h" --- 1105 unchanged lines hidden (view full) --- 1135 ts = inode_get_ctime(inode); 1136 if (!timespec64_equal(&ts, &now)) 1137 inode_set_ctime_to_ts(inode, now); 1138 1139 if (IS_I_VERSION(inode)) 1140 inode_inc_iversion(inode); 1141} 1142 |
1143static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, 1144 size_t count) | 1143int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) |
1145{ 1146 struct file *file = iocb->ki_filp; 1147 struct inode *inode = file_inode(file); 1148 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 1149 loff_t pos = iocb->ki_pos; 1150 int ret; 1151 loff_t oldsize; 1152 loff_t start_pos; --- 29 unchanged lines hidden (view full) --- 1182 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); 1183 if (ret) 1184 return ret; 1185 } 1186 1187 return 0; 1188} 1189 | 1144{ 1145 struct file *file = iocb->ki_filp; 1146 struct inode *inode = file_inode(file); 1147 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 1148 loff_t pos = iocb->ki_pos; 1149 int ret; 1150 loff_t oldsize; 1151 loff_t start_pos; --- 29 unchanged lines hidden (view full) --- 1181 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); 1182 if (ret) 1183 return ret; 1184 } 1185 1186 return 0; 1187} 1188 |
1190static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, 1191 struct iov_iter *i) | 1189ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) |
1192{ 1193 struct file *file = iocb->ki_filp; 1194 loff_t pos; 1195 struct inode *inode = file_inode(file); 1196 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 1197 struct page **pages = NULL; 1198 struct extent_changeset *data_reserved = NULL; 1199 u64 release_bytes = 0; --- 246 unchanged lines hidden (view full) --- 1446 pagecache_isize_extended(inode, old_isize, iocb->ki_pos); 1447 iocb->ki_pos += num_written; 1448 } 1449out: 1450 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 1451 return num_written ? num_written : ret; 1452} 1453 | 1190{ 1191 struct file *file = iocb->ki_filp; 1192 loff_t pos; 1193 struct inode *inode = file_inode(file); 1194 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 1195 struct page **pages = NULL; 1196 struct extent_changeset *data_reserved = NULL; 1197 u64 release_bytes = 0; --- 246 unchanged lines hidden (view full) --- 1444 pagecache_isize_extended(inode, old_isize, iocb->ki_pos); 1445 iocb->ki_pos += num_written; 1446 } 1447out: 1448 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 1449 return num_written ? num_written : ret; 1450} 1451 |
/* Check that a direct IO request is sector aligned: returns -EINVAL when either the file offset or the value reported by iov_iter_alignment() has bits set under (fs_info->sectorsize - 1), otherwise 0. NOTE(review): the mask arithmetic assumes sectorsize is a power of two - confirm. */ 1454static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, 1455 const struct iov_iter *iter, loff_t offset) 1456{ 1457 const u32 blocksize_mask = fs_info->sectorsize - 1; 1458 1459 if (offset & blocksize_mask) 1460 return -EINVAL; 1461 1462 if (iov_iter_alignment(iter) & blocksize_mask) 1463 return -EINVAL; 1464 1465 return 0; 1466} 1467 /* Direct IO write entry point. Takes the inode lock (shared only while the write stays within i_size and IS_NOSEC() holds, try-lock for IOCB_NOWAIT), runs generic_write_checks() and btrfs_write_check(), then issues the write via btrfs_dio_write() with page faults disabled on the iov_iter. Falls back to btrfs_buffered_write() followed by flush, wait and page cache invalidation of the written range when check_direct_IO() rejects the request, when 'ret' is -ENOTBLK, or when data is still left in 'from'. Returns a negative errno when 'ret' ends up negative, otherwise the number of bytes written; a lock failure and a generic_write_checks() result <= 0 are returned directly. */ 1468static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) 1469{ 1470 struct file *file = iocb->ki_filp; 1471 struct inode *inode = file_inode(file); 1472 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 1473 loff_t pos; 1474 ssize_t written = 0; 1475 ssize_t written_buffered; 1476 size_t prev_left = 0; 1477 loff_t endbyte; 1478 ssize_t ret; 1479 unsigned int ilock_flags = 0; 1480 struct iomap_dio *dio; 1481 1482 if (iocb->ki_flags & IOCB_NOWAIT) 1483 ilock_flags |= BTRFS_ILOCK_TRY; 1484 1485 /* 1486 * If the write DIO is within EOF, use a shared lock and also only if 1487 * security bits will likely not be dropped by file_remove_privs() called 1488 * from btrfs_write_check(). Either will need to be rechecked after the 1489 * lock was acquired. 1490 */ 1491 if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) 1492 ilock_flags |= BTRFS_ILOCK_SHARED; 1493 1494relock: 1495 ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); 1496 if (ret < 0) 1497 return ret; 1498 1499 /* Shared lock cannot be used with security bits set. 
*/ 1500 if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { 1501 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 1502 ilock_flags &= ~BTRFS_ILOCK_SHARED; 1503 goto relock; 1504 } 1505 1506 ret = generic_write_checks(iocb, from); 1507 if (ret <= 0) { 1508 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 1509 return ret; 1510 } 1511 1512 ret = btrfs_write_check(iocb, from, ret); 1513 if (ret < 0) { 1514 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 1515 goto out; 1516 } 1517 1518 pos = iocb->ki_pos; 1519 /* 1520 * Re-check since file size may have changed just before taking the 1521 * lock or pos may have changed because of O_APPEND in generic_write_checks() 1522 */ 1523 if ((ilock_flags & BTRFS_ILOCK_SHARED) && 1524 pos + iov_iter_count(from) > i_size_read(inode)) { 1525 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 1526 ilock_flags &= ~BTRFS_ILOCK_SHARED; 1527 goto relock; 1528 } 1529 1530 if (check_direct_IO(fs_info, from, pos)) { 1531 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 1532 goto buffered; 1533 } 1534 1535 /* 1536 * The iov_iter can be mapped to the same file range we are writing to. 1537 * If that's the case, then we will deadlock in the iomap code, because 1538 * it first calls our callback btrfs_dio_iomap_begin(), which will create 1539 * an ordered extent, and after that it will fault in the pages that the 1540 * iov_iter refers to. During the fault in we end up in the readahead 1541 * pages code (starting at btrfs_readahead()), which will lock the range, 1542 * find that ordered extent and then wait for it to complete (at 1543 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since 1544 * obviously the ordered extent can never complete as we didn't submit 1545 * yet the respective bio(s). This always happens when the buffer is 1546 * memory mapped to the same file range, since the iomap DIO code always 1547 * invalidates pages in the target file range (after starting and waiting 1548 * for any writeback). 
1549 * 1550 * So here we disable page faults in the iov_iter and then retry if we 1551 * got -EFAULT, faulting in the pages before the retry. 1552 */ 1553 from->nofault = true; 1554 dio = btrfs_dio_write(iocb, from, written); 1555 from->nofault = false; 1556 1557 /* 1558 * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync 1559 * iocb, and that needs to lock the inode. So unlock it before calling 1560 * iomap_dio_complete() to avoid a deadlock. 1561 */ 1562 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 1563 1564 if (IS_ERR_OR_NULL(dio)) 1565 ret = PTR_ERR_OR_ZERO(dio); 1566 else 1567 ret = iomap_dio_complete(dio); 1568 1569 /* No increment (+=) because iomap returns a cumulative value. */ 1570 if (ret > 0) 1571 written = ret; 1572 1573 if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) { 1574 const size_t left = iov_iter_count(from); 1575 /* 1576 * We have more data left to write. Try to fault in as many as 1577 * possible of the remainder pages and retry. The inode was already 1578 * unlocked above, so the retry jumps back to 'relock' to take the 1579 * lock and run the write checks again. 1580 * 1581 * Also, in case the iov refers to pages in the file range of the 1582 * file we want to write to (due to a mmap), we could enter an 1583 * infinite loop if we retry after faulting the pages in, since 1584 * iomap will invalidate any pages in the range early on, before 1585 * it tries to fault in the pages of the iov. So we keep track of 1586 * how much was left of iov in the previous EFAULT and fall back 1587 * to buffered IO in case we haven't made any progress. 1588 */ 1589 if (left == prev_left) { 1590 ret = -ENOTBLK; 1591 } else { 1592 fault_in_iov_iter_readable(from, left); 1593 prev_left = left; 1594 goto relock; 1595 } 1596 } 1597 1598 /* 1599 * If 'ret' is -ENOTBLK or we have not written all data, then it means 1600 * we must fall back to buffered IO. 
1601 */ 1602 if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from)) 1603 goto out; 1604 1605buffered: 1606 /* 1607 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller 1608 * it must retry the operation in a context where blocking is acceptable, 1609 * because even if we end up not blocking during the buffered IO attempt 1610 * below, we will block when flushing and waiting for the IO. 1611 */ 1612 if (iocb->ki_flags & IOCB_NOWAIT) { 1613 ret = -EAGAIN; 1614 goto out; 1615 } 1616 1617 pos = iocb->ki_pos; 1618 written_buffered = btrfs_buffered_write(iocb, from); 1619 if (written_buffered < 0) { 1620 ret = written_buffered; 1621 goto out; 1622 } 1623 /* 1624 * Ensure all data is persisted. We want the next direct IO read to be 1625 * able to read what was just written. 1626 */ 1627 endbyte = pos + written_buffered - 1; 1628 ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte); 1629 if (ret) 1630 goto out; 1631 ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); 1632 if (ret) 1633 goto out; 1634 written += written_buffered; 1635 iocb->ki_pos = pos + written_buffered; 1636 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, 1637 endbyte >> PAGE_SHIFT); 1638out: 1639 return ret < 0 ? ret : written; 1640} 1641 | |
1642static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, 1643 const struct btrfs_ioctl_encoded_io_args *encoded) 1644{ 1645 struct file *file = iocb->ki_filp; 1646 struct inode *inode = file_inode(file); 1647 loff_t count; 1648 ssize_t ret; 1649 --- 2259 unchanged lines hidden (view full) --- 3909 filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; 3910 3911 ret = fsverity_file_open(inode, filp); 3912 if (ret) 3913 return ret; 3914 return generic_file_open(inode, filp); 3915} 3916 | 1452static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, 1453 const struct btrfs_ioctl_encoded_io_args *encoded) 1454{ 1455 struct file *file = iocb->ki_filp; 1456 struct inode *inode = file_inode(file); 1457 loff_t count; 1458 ssize_t ret; 1459 --- 2259 unchanged lines hidden (view full) --- 3719 filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; 3720 3721 ret = fsverity_file_open(inode, filp); 3722 if (ret) 3723 return ret; 3724 return generic_file_open(inode, filp); 3725} 3726 |
/* Validate a direct IO read request: returns the error from check_direct_IO() if that check fails; for iovec-backed iterators it additionally returns -EINVAL when any two segments share the same iov_base; otherwise returns 0. */ 3917static int check_direct_read(struct btrfs_fs_info *fs_info, 3918 const struct iov_iter *iter, loff_t offset) 3919{ 3920 int ret; 3921 int i, seg; 3922 3923 ret = check_direct_IO(fs_info, iter, offset); 3924 if (ret < 0) 3925 return ret; 3926 3927 if (!iter_is_iovec(iter)) 3928 return 0; 3929 3930 for (seg = 0; seg < iter->nr_segs; seg++) { 3931 for (i = seg + 1; i < iter->nr_segs; i++) { 3932 const struct iovec *iov1 = iter_iov(iter) + seg; 3933 const struct iovec *iov2 = iter_iov(iter) + i; 3934 3935 if (iov1->iov_base == iov2->iov_base) 3936 return -EINVAL; 3937 } 3938 } 3939 return 0; 3940} 3941 /* Direct IO read helper. Returns 0 without reading anything when fsverity is active on the inode or check_direct_read() reports a problem (NOTE(review): presumably so the caller can do a buffered read instead - confirm in btrfs_file_read_iter()). Otherwise holds the inode lock in shared mode around btrfs_dio_read(), retrying after faulting in the destination pages for as long as progress is made. Returns a negative errno or the cumulative number of bytes read. */ 3942static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) 3943{ 3944 struct inode *inode = file_inode(iocb->ki_filp); 3945 size_t prev_left = 0; 3946 ssize_t read = 0; 3947 ssize_t ret; 3948 3949 if (fsverity_active(inode)) 3950 return 0; 3951 3952 if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos)) 3953 return 0; 3954 3955 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); 3956again: 3957 /* 3958 * This is similar to what we do for direct IO writes, see the comment 3959 * at btrfs_direct_write(), but we also disable page faults in addition 3960 * to disabling them only at the iov_iter level. This is because when 3961 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(), 3962 * which can still trigger page fault ins despite having set ->nofault 3963 * to true of our 'to' iov_iter. 3964 * 3965 * The difference to direct IO writes is that we deadlock when trying 3966 * to lock the extent range in the inode's tree during the page reads 3967 * triggered by the fault in (while for writes it is due to waiting for 3968 * our own ordered extent). This is because for direct IO reads, 3969 * btrfs_dio_iomap_begin() returns with the extent range locked, which 3970 * is only unlocked in the endio callback (end_bio_extent_readpage()). 
3971 */ 3972 pagefault_disable(); 3973 to->nofault = true; 3974 ret = btrfs_dio_read(iocb, to, read); 3975 to->nofault = false; 3976 pagefault_enable(); 3977 3978 /* No increment (+=) because iomap returns a cumulative value. */ 3979 if (ret > 0) 3980 read = ret; 3981 3982 if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) { 3983 const size_t left = iov_iter_count(to); 3984 3985 if (left == prev_left) { 3986 /* 3987 * We didn't make any progress since the last attempt, 3988 * fall back to a buffered read for the remainder of the 3989 * range. This is just to avoid any possibility of looping 3990 * for too long. Here we only stop retrying and return what was read so far; NOTE(review): the buffered read itself is not done in this function, presumably the caller does it - confirm. 3991 */ 3992 ret = read; 3993 } else { 3994 /* 3995 * We made some progress since the last retry or this is 3996 * the first time we are retrying. Fault in as many pages 3997 * as possible and retry. 3998 */ 3999 fault_in_iov_iter_writeable(to, left); 4000 prev_left = left; 4001 goto again; 4002 } 4003 } 4004 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); 4005 return ret < 0 ? ret : read; 4006} 4007 | |
4008static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 4009{ 4010 ssize_t ret = 0; 4011 4012 if (iocb->ki_flags & IOCB_DIRECT) { 4013 ret = btrfs_direct_read(iocb, to); 4014 if (ret < 0 || !iov_iter_count(to) || 4015 iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp))) --- 51 unchanged lines hidden --- | 3727static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 3728{ 3729 ssize_t ret = 0; 3730 3731 if (iocb->ki_flags & IOCB_DIRECT) { 3732 ret = btrfs_direct_read(iocb, to); 3733 if (ret < 0 || !iov_iter_count(to) || 3734 iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp))) --- 51 unchanged lines hidden --- |