file.c (cdc627e65c7eb8d105f0b9e9695106e54eea1a6e) file.c (9aa29a20b70097213d10e03a452366ceea72fc02)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 */
5
6#include <linux/fs.h>
7#include <linux/pagemap.h>
8#include <linux/time.h>
9#include <linux/init.h>
10#include <linux/string.h>
11#include <linux/backing-dev.h>
12#include <linux/falloc.h>
13#include <linux/writeback.h>
14#include <linux/compat.h>
15#include <linux/slab.h>
16#include <linux/btrfs.h>
17#include <linux/uio.h>
18#include <linux/iversion.h>
19#include <linux/fsverity.h>
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 */
5
6#include <linux/fs.h>
7#include <linux/pagemap.h>
8#include <linux/time.h>
9#include <linux/init.h>
10#include <linux/string.h>
11#include <linux/backing-dev.h>
12#include <linux/falloc.h>
13#include <linux/writeback.h>
14#include <linux/compat.h>
15#include <linux/slab.h>
16#include <linux/btrfs.h>
17#include <linux/uio.h>
18#include <linux/iversion.h>
19#include <linux/fsverity.h>
20#include <linux/iomap.h>
21#include "ctree.h"
20#include "ctree.h"
21#include "direct-io.h"
22#include "disk-io.h"
23#include "transaction.h"
24#include "btrfs_inode.h"
25#include "tree-log.h"
26#include "locking.h"
27#include "qgroup.h"
28#include "compression.h"
29#include "delalloc-space.h"

--- 1105 unchanged lines hidden (view full) ---

1135 ts = inode_get_ctime(inode);
1136 if (!timespec64_equal(&ts, &now))
1137 inode_set_ctime_to_ts(inode, now);
1138
1139 if (IS_I_VERSION(inode))
1140 inode_inc_iversion(inode);
1141}
1142
22#include "disk-io.h"
23#include "transaction.h"
24#include "btrfs_inode.h"
25#include "tree-log.h"
26#include "locking.h"
27#include "qgroup.h"
28#include "compression.h"
29#include "delalloc-space.h"

--- 1105 unchanged lines hidden (view full) ---

1135 ts = inode_get_ctime(inode);
1136 if (!timespec64_equal(&ts, &now))
1137 inode_set_ctime_to_ts(inode, now);
1138
1139 if (IS_I_VERSION(inode))
1140 inode_inc_iversion(inode);
1141}
1142
1143static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
1144 size_t count)
1143int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count)
1145{
1146 struct file *file = iocb->ki_filp;
1147 struct inode *inode = file_inode(file);
1148 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1149 loff_t pos = iocb->ki_pos;
1150 int ret;
1151 loff_t oldsize;
1152 loff_t start_pos;

--- 29 unchanged lines hidden (view full) ---

1182 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1183 if (ret)
1184 return ret;
1185 }
1186
1187 return 0;
1188}
1189
1144{
1145 struct file *file = iocb->ki_filp;
1146 struct inode *inode = file_inode(file);
1147 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1148 loff_t pos = iocb->ki_pos;
1149 int ret;
1150 loff_t oldsize;
1151 loff_t start_pos;

--- 29 unchanged lines hidden (view full) ---

1181 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1182 if (ret)
1183 return ret;
1184 }
1185
1186 return 0;
1187}
1188
1190static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
1191 struct iov_iter *i)
1189ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
1192{
1193 struct file *file = iocb->ki_filp;
1194 loff_t pos;
1195 struct inode *inode = file_inode(file);
1196 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1197 struct page **pages = NULL;
1198 struct extent_changeset *data_reserved = NULL;
1199 u64 release_bytes = 0;

--- 246 unchanged lines hidden (view full) ---

1446 pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1447 iocb->ki_pos += num_written;
1448 }
1449out:
1450 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1451 return num_written ? num_written : ret;
1452}
1453
1190{
1191 struct file *file = iocb->ki_filp;
1192 loff_t pos;
1193 struct inode *inode = file_inode(file);
1194 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1195 struct page **pages = NULL;
1196 struct extent_changeset *data_reserved = NULL;
1197 u64 release_bytes = 0;

--- 246 unchanged lines hidden (view full) ---

1444 pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1445 iocb->ki_pos += num_written;
1446 }
1447out:
1448 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1449 return num_written ? num_written : ret;
1450}
1451
1454static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
1455 const struct iov_iter *iter, loff_t offset)
1456{
1457 const u32 blocksize_mask = fs_info->sectorsize - 1;
1458
1459 if (offset & blocksize_mask)
1460 return -EINVAL;
1461
1462 if (iov_iter_alignment(iter) & blocksize_mask)
1463 return -EINVAL;
1464
1465 return 0;
1466}
1467
/*
 * Write the data described by 'from' to the file at iocb->ki_pos using
 * direct IO, falling back to buffered IO when direct IO cannot be used.
 *
 * The inode lock is taken with BTRFS_ILOCK_TRY for IOCB_NOWAIT iocbs and in
 * shared mode when the write ends at or before i_size and IS_NOSEC() holds;
 * both conditions are re-checked after the lock is acquired.
 *
 * The buffered fallback (label 'buffered') is used when check_direct_IO()
 * rejects the request, when the direct IO attempt ends with -ENOTBLK, or
 * when it returns without error but data is still left in 'from'. For
 * IOCB_NOWAIT iocbs the fallback returns -EAGAIN instead. After a buffered
 * write the written range is flushed, waited on and dropped from the page
 * cache.
 *
 * Returns the number of bytes written (direct plus buffered) or a negative
 * errno. A result <= 0 from generic_write_checks() is returned as is.
 */
1468static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
1469{
1470 struct file *file = iocb->ki_filp;
1471 struct inode *inode = file_inode(file);
1472 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1473 loff_t pos;
1474 ssize_t written = 0;
1475 ssize_t written_buffered;
1476 size_t prev_left = 0;
1477 loff_t endbyte;
1478 ssize_t ret;
1479 unsigned int ilock_flags = 0;
1480 struct iomap_dio *dio;
1481
1482 if (iocb->ki_flags & IOCB_NOWAIT)
1483 ilock_flags |= BTRFS_ILOCK_TRY;
1484
1485 /*
1486 * If the write DIO is within EOF, use a shared lock and also only if
1487 * security bits will likely not be dropped by file_remove_privs() called
1488 * from btrfs_write_check(). Either will need to be rechecked after the
1489 * lock was acquired.
1490 */
1491 if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
1492 ilock_flags |= BTRFS_ILOCK_SHARED;
1493
1494relock:
1495 ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1496 if (ret < 0)
1497 return ret;
1498
1499 /* Shared lock cannot be used with security bits set. */
1500 if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
1501 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1502 ilock_flags &= ~BTRFS_ILOCK_SHARED;
1503 goto relock;
1504 }
1505
1506 ret = generic_write_checks(iocb, from);
1507 if (ret <= 0) {
1508 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1509 return ret;
1510 }
1511
1512 ret = btrfs_write_check(iocb, from, ret);
1513 if (ret < 0) {
1514 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1515 goto out;
1516 }
1517
1518 pos = iocb->ki_pos;
1519 /*
1520 * Re-check since file size may have changed just before taking the
1521 * lock or pos may have changed because of O_APPEND in generic_write_checks()
1522 */
1523 if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
1524 pos + iov_iter_count(from) > i_size_read(inode)) {
1525 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1526 ilock_flags &= ~BTRFS_ILOCK_SHARED;
1527 goto relock;
1528 }
1529
1530 if (check_direct_IO(fs_info, from, pos)) {
1531 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1532 goto buffered;
1533 }
1534
1535 /*
1536 * The iov_iter can be mapped to the same file range we are writing to.
1537 * If that's the case, then we will deadlock in the iomap code, because
1538 * it first calls our callback btrfs_dio_iomap_begin(), which will create
1539 * an ordered extent, and after that it will fault in the pages that the
1540 * iov_iter refers to. During the fault in we end up in the readahead
1541 * pages code (starting at btrfs_readahead()), which will lock the range,
1542 * find that ordered extent and then wait for it to complete (at
1543 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
1544 * obviously the ordered extent can never complete as we didn't submit
1545 * yet the respective bio(s). This always happens when the buffer is
1546 * memory mapped to the same file range, since the iomap DIO code always
1547 * invalidates pages in the target file range (after starting and waiting
1548 * for any writeback).
1549 *
1550 * So here we disable page faults in the iov_iter and then retry if we
1551 * got -EFAULT, faulting in the pages before the retry.
1552 */
1553 from->nofault = true;
1554 dio = btrfs_dio_write(iocb, from, written);
1555 from->nofault = false;
1556
1557 /*
1558 * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
1559 * iocb, and that needs to lock the inode. So unlock it before calling
1560 * iomap_dio_complete() to avoid a deadlock.
1561 */
1562 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1563
1564 if (IS_ERR_OR_NULL(dio))
1565 ret = PTR_ERR_OR_ZERO(dio);
1566 else
1567 ret = iomap_dio_complete(dio);
1568
1569 /* No increment (+=) because iomap returns a cumulative value. */
1570 if (ret > 0)
1571 written = ret;
1572
1573 if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
1574 const size_t left = iov_iter_count(from);
1575 /*
1576 * We have more data left to write. Try to fault in as many as
1577 * possible of the remainder pages and retry. The inode was already
1578 * unlocked above, so the retry goes back to 'relock' to take the
1579 * lock and redo the write checks before the next attempt.
1580 *
1581 * Also, in case the iov refers to pages in the file range of the
1582 * file we want to write to (due to a mmap), we could enter an
1583 * infinite loop if we retry after faulting the pages in, since
1584 * iomap will invalidate any pages in the range early on, before
1585 * it tries to fault in the pages of the iov. So we keep track of
1586 * how much was left of iov in the previous EFAULT and fallback
1587 * to buffered IO in case we haven't made any progress.
1588 */
1589 if (left == prev_left) {
1590 ret = -ENOTBLK;
1591 } else {
1592 fault_in_iov_iter_readable(from, left);
1593 prev_left = left;
1594 goto relock;
1595 }
1596 }
1597
1598 /*
1599 * If 'ret' is -ENOTBLK or we have not written all data, then it means
1600 * we must fallback to buffered IO.
1601 */
1602 if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
1603 goto out;
1604
1605buffered:
1606 /*
1607 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
1608 * it must retry the operation in a context where blocking is acceptable,
1609 * because even if we end up not blocking during the buffered IO attempt
1610 * below, we will block when flushing and waiting for the IO.
1611 */
1612 if (iocb->ki_flags & IOCB_NOWAIT) {
1613 ret = -EAGAIN;
1614 goto out;
1615 }
1616
1617 pos = iocb->ki_pos;
1618 written_buffered = btrfs_buffered_write(iocb, from);
1619 if (written_buffered < 0) {
1620 ret = written_buffered;
1621 goto out;
1622 }
1623 /*
1624 * Ensure all data is persisted. We want the next direct IO read to be
1625 * able to read what was just written.
1626 */
1627 endbyte = pos + written_buffered - 1;
1628 ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
1629 if (ret)
1630 goto out;
1631 ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1632 if (ret)
1633 goto out;
1634 written += written_buffered;
1635 iocb->ki_pos = pos + written_buffered;
1636 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
1637 endbyte >> PAGE_SHIFT);
1638out:
1639 return ret < 0 ? ret : written;
1640}
1641
1642static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
1643 const struct btrfs_ioctl_encoded_io_args *encoded)
1644{
1645 struct file *file = iocb->ki_filp;
1646 struct inode *inode = file_inode(file);
1647 loff_t count;
1648 ssize_t ret;
1649

--- 2259 unchanged lines hidden (view full) ---

3909 filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
3910
3911 ret = fsverity_file_open(inode, filp);
3912 if (ret)
3913 return ret;
3914 return generic_file_open(inode, filp);
3915}
3916
1452static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
1453 const struct btrfs_ioctl_encoded_io_args *encoded)
1454{
1455 struct file *file = iocb->ki_filp;
1456 struct inode *inode = file_inode(file);
1457 loff_t count;
1458 ssize_t ret;
1459

--- 2259 unchanged lines hidden (view full) ---

3719 filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
3720
3721 ret = fsverity_file_open(inode, filp);
3722 if (ret)
3723 return ret;
3724 return generic_file_open(inode, filp);
3725}
3726
3917static int check_direct_read(struct btrfs_fs_info *fs_info,
3918 const struct iov_iter *iter, loff_t offset)
3919{
3920 int ret;
3921 int i, seg;
3922
3923 ret = check_direct_IO(fs_info, iter, offset);
3924 if (ret < 0)
3925 return ret;
3926
3927 if (!iter_is_iovec(iter))
3928 return 0;
3929
3930 for (seg = 0; seg < iter->nr_segs; seg++) {
3931 for (i = seg + 1; i < iter->nr_segs; i++) {
3932 const struct iovec *iov1 = iter_iov(iter) + seg;
3933 const struct iovec *iov2 = iter_iov(iter) + i;
3934
3935 if (iov1->iov_base == iov2->iov_base)
3936 return -EINVAL;
3937 }
3938 }
3939 return 0;
3940}
3941
3942static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
3943{
3944 struct inode *inode = file_inode(iocb->ki_filp);
3945 size_t prev_left = 0;
3946 ssize_t read = 0;
3947 ssize_t ret;
3948
3949 if (fsverity_active(inode))
3950 return 0;
3951
3952 if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
3953 return 0;
3954
3955 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3956again:
3957 /*
3958 * This is similar to what we do for direct IO writes, see the comment
3959 * at btrfs_direct_write(), but we also disable page faults in addition
3960 * to disabling them only at the iov_iter level. This is because when
3961 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
3962 * which can still trigger page fault ins despite having set ->nofault
3963 * to true of our 'to' iov_iter.
3964 *
3965 * The difference to direct IO writes is that we deadlock when trying
3966 * to lock the extent range in the inode's tree during he page reads
3967 * triggered by the fault in (while for writes it is due to waiting for
3968 * our own ordered extent). This is because for direct IO reads,
3969 * btrfs_dio_iomap_begin() returns with the extent range locked, which
3970 * is only unlocked in the endio callback (end_bio_extent_readpage()).
3971 */
3972 pagefault_disable();
3973 to->nofault = true;
3974 ret = btrfs_dio_read(iocb, to, read);
3975 to->nofault = false;
3976 pagefault_enable();
3977
3978 /* No increment (+=) because iomap returns a cumulative value. */
3979 if (ret > 0)
3980 read = ret;
3981
3982 if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
3983 const size_t left = iov_iter_count(to);
3984
3985 if (left == prev_left) {
3986 /*
3987 * We didn't make any progress since the last attempt,
3988 * fallback to a buffered read for the remainder of the
3989 * range. This is just to avoid any possibility of looping
3990 * for too long.
3991 */
3992 ret = read;
3993 } else {
3994 /*
3995 * We made some progress since the last retry or this is
3996 * the first time we are retrying. Fault in as many pages
3997 * as possible and retry.
3998 */
3999 fault_in_iov_iter_writeable(to, left);
4000 prev_left = left;
4001 goto again;
4002 }
4003 }
4004 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
4005 return ret < 0 ? ret : read;
4006}
4007
4008static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
4009{
4010 ssize_t ret = 0;
4011
4012 if (iocb->ki_flags & IOCB_DIRECT) {
4013 ret = btrfs_direct_read(iocb, to);
4014 if (ret < 0 || !iov_iter_count(to) ||
4015 iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))

--- 51 unchanged lines hidden ---
3727static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3728{
3729 ssize_t ret = 0;
3730
3731 if (iocb->ki_flags & IOCB_DIRECT) {
3732 ret = btrfs_direct_read(iocb, to);
3733 if (ret < 0 || !iov_iter_count(to) ||
3734 iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))

--- 51 unchanged lines hidden ---