11c6fdbd8SKent Overstreet // SPDX-License-Identifier: GPL-2.0 21c6fdbd8SKent Overstreet #ifndef NO_BCACHEFS_FS 31c6fdbd8SKent Overstreet 41c6fdbd8SKent Overstreet #include "bcachefs.h" 57b3f84eaSKent Overstreet #include "alloc_foreground.h" 607a1006aSKent Overstreet #include "bkey_buf.h" 71c6fdbd8SKent Overstreet #include "btree_update.h" 81c6fdbd8SKent Overstreet #include "buckets.h" 91c6fdbd8SKent Overstreet #include "clock.h" 101c6fdbd8SKent Overstreet #include "error.h" 11e2d9912cSKent Overstreet #include "extents.h" 1208c07feaSKent Overstreet #include "extent_update.h" 131c6fdbd8SKent Overstreet #include "fs.h" 141c6fdbd8SKent Overstreet #include "fs-io.h" 151c6fdbd8SKent Overstreet #include "fsck.h" 161c6fdbd8SKent Overstreet #include "inode.h" 171c6fdbd8SKent Overstreet #include "journal.h" 181c6fdbd8SKent Overstreet #include "io.h" 191c6fdbd8SKent Overstreet #include "keylist.h" 201c6fdbd8SKent Overstreet #include "quota.h" 2176426098SKent Overstreet #include "reflink.h" 221c6fdbd8SKent Overstreet #include "trace.h" 231c6fdbd8SKent Overstreet 241c6fdbd8SKent Overstreet #include <linux/aio.h> 251c6fdbd8SKent Overstreet #include <linux/backing-dev.h> 261c6fdbd8SKent Overstreet #include <linux/falloc.h> 271c6fdbd8SKent Overstreet #include <linux/migrate.h> 281c6fdbd8SKent Overstreet #include <linux/mmu_context.h> 291c6fdbd8SKent Overstreet #include <linux/pagevec.h> 309ba2eb25SKent Overstreet #include <linux/rmap.h> 311c6fdbd8SKent Overstreet #include <linux/sched/signal.h> 321c6fdbd8SKent Overstreet #include <linux/task_io_accounting_ops.h> 331c6fdbd8SKent Overstreet #include <linux/uio.h> 341c6fdbd8SKent Overstreet #include <linux/writeback.h> 351c6fdbd8SKent Overstreet 361c6fdbd8SKent Overstreet #include <trace/events/writeback.h> 371c6fdbd8SKent Overstreet 38*30bff594SKent Overstreet static inline loff_t folio_end_pos(struct folio *folio) 39*30bff594SKent Overstreet { 40*30bff594SKent Overstreet return folio_pos(folio) + folio_size(folio); 41*30bff594SKent Overstreet } 42*30bff594SKent Overstreet 43*30bff594SKent Overstreet static inline size_t folio_sectors(struct folio *folio) 44*30bff594SKent Overstreet { 45*30bff594SKent Overstreet return PAGE_SECTORS << folio_order(folio); 46*30bff594SKent Overstreet } 47*30bff594SKent Overstreet 48*30bff594SKent Overstreet static inline loff_t folio_sector(struct folio *folio) 49*30bff594SKent Overstreet { 50*30bff594SKent Overstreet return folio_pos(folio) >> 9; 51*30bff594SKent Overstreet } 52*30bff594SKent Overstreet 53*30bff594SKent Overstreet static inline loff_t folio_end_sector(struct folio *folio) 54*30bff594SKent Overstreet { 55*30bff594SKent Overstreet return folio_end_pos(folio) >> 9; 56*30bff594SKent Overstreet } 57*30bff594SKent Overstreet 58a8b3a677SKent Overstreet struct nocow_flush { 59a8b3a677SKent Overstreet struct closure *cl; 60a8b3a677SKent Overstreet struct bch_dev *ca; 61a8b3a677SKent Overstreet struct bio bio; 62a8b3a677SKent Overstreet }; 63a8b3a677SKent Overstreet 64a8b3a677SKent Overstreet static void nocow_flush_endio(struct bio *_bio) 65a8b3a677SKent Overstreet { 66a8b3a677SKent Overstreet 67a8b3a677SKent Overstreet struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); 68a8b3a677SKent Overstreet 69a8b3a677SKent Overstreet closure_put(bio->cl); 70a8b3a677SKent Overstreet percpu_ref_put(&bio->ca->io_ref); 71a8b3a677SKent Overstreet bio_put(&bio->bio); 72a8b3a677SKent Overstreet } 73a8b3a677SKent Overstreet 74a8b3a677SKent Overstreet static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, 75a8b3a677SKent Overstreet struct bch_inode_info *inode, 76a8b3a677SKent Overstreet struct closure *cl) 77a8b3a677SKent Overstreet { 78a8b3a677SKent Overstreet struct nocow_flush *bio; 79a8b3a677SKent Overstreet struct bch_dev *ca; 80a8b3a677SKent Overstreet struct bch_devs_mask devs; 81a8b3a677SKent Overstreet unsigned dev; 82a8b3a677SKent Overstreet 83a8b3a677SKent Overstreet dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); 84a8b3a677SKent Overstreet if (dev == BCH_SB_MEMBERS_MAX) 85a8b3a677SKent Overstreet return; 86a8b3a677SKent Overstreet 87a8b3a677SKent Overstreet devs = inode->ei_devs_need_flush; 88a8b3a677SKent Overstreet memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); 89a8b3a677SKent Overstreet 90a8b3a677SKent Overstreet for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { 91a8b3a677SKent Overstreet rcu_read_lock(); 92a8b3a677SKent Overstreet ca = rcu_dereference(c->devs[dev]); 93a8b3a677SKent Overstreet if (ca && !percpu_ref_tryget(&ca->io_ref)) 94a8b3a677SKent Overstreet ca = NULL; 95a8b3a677SKent Overstreet rcu_read_unlock(); 96a8b3a677SKent Overstreet 97a8b3a677SKent Overstreet if (!ca) 98a8b3a677SKent Overstreet continue; 99a8b3a677SKent Overstreet 100a8b3a677SKent Overstreet bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, 101a8b3a677SKent Overstreet REQ_OP_FLUSH, 102a8b3a677SKent Overstreet GFP_KERNEL, 103a8b3a677SKent Overstreet &c->nocow_flush_bioset), 104a8b3a677SKent Overstreet struct nocow_flush, bio); 105a8b3a677SKent Overstreet bio->cl = cl; 106a8b3a677SKent Overstreet bio->ca = ca; 107a8b3a677SKent Overstreet bio->bio.bi_end_io = nocow_flush_endio; 108a8b3a677SKent Overstreet closure_bio_submit(&bio->bio, cl); 109a8b3a677SKent Overstreet } 110a8b3a677SKent Overstreet } 111a8b3a677SKent Overstreet 112a8b3a677SKent Overstreet static int bch2_inode_flush_nocow_writes(struct bch_fs *c, 113a8b3a677SKent Overstreet struct bch_inode_info *inode) 114a8b3a677SKent Overstreet { 115a8b3a677SKent Overstreet struct closure cl; 116a8b3a677SKent Overstreet 117a8b3a677SKent Overstreet closure_init_stack(&cl); 118a8b3a677SKent Overstreet bch2_inode_flush_nocow_writes_async(c, inode, &cl); 119a8b3a677SKent Overstreet closure_sync(&cl); 120a8b3a677SKent Overstreet 121a8b3a677SKent Overstreet return 0; 122a8b3a677SKent Overstreet } 123a8b3a677SKent Overstreet 1247f5e31e1SKent Overstreet static inline bool bio_full(struct bio *bio, unsigned len) 1257f5e31e1SKent Overstreet { 1267f5e31e1SKent Overstreet if (bio->bi_vcnt >= bio->bi_max_vecs) 1277f5e31e1SKent Overstreet return true; 1287f5e31e1SKent Overstreet if (bio->bi_iter.bi_size > UINT_MAX - len) 1297f5e31e1SKent Overstreet return true; 1307f5e31e1SKent Overstreet return false; 1317f5e31e1SKent Overstreet } 1327f5e31e1SKent Overstreet 133eb8e6e9cSKent Overstreet static inline struct address_space *faults_disabled_mapping(void) 134eb8e6e9cSKent Overstreet { 135eb8e6e9cSKent Overstreet return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); 136eb8e6e9cSKent Overstreet } 137eb8e6e9cSKent Overstreet 138eb8e6e9cSKent Overstreet static inline void set_fdm_dropped_locks(void) 139eb8e6e9cSKent Overstreet { 140eb8e6e9cSKent Overstreet current->faults_disabled_mapping = 141eb8e6e9cSKent Overstreet (void *) (((unsigned long) current->faults_disabled_mapping)|1); 142eb8e6e9cSKent Overstreet } 143eb8e6e9cSKent Overstreet 144eb8e6e9cSKent Overstreet static inline bool fdm_dropped_locks(void) 145eb8e6e9cSKent Overstreet { 146eb8e6e9cSKent Overstreet return ((unsigned long) current->faults_disabled_mapping) & 1; 147eb8e6e9cSKent Overstreet } 148eb8e6e9cSKent Overstreet 1491c6fdbd8SKent Overstreet struct quota_res { 1501c6fdbd8SKent Overstreet u64 sectors; 1511c6fdbd8SKent Overstreet }; 1521c6fdbd8SKent Overstreet 1539a3df993SKent Overstreet struct bch_writepage_io { 1541c6fdbd8SKent Overstreet struct bch_inode_info *inode; 1551c6fdbd8SKent Overstreet 1561c6fdbd8SKent Overstreet /* must be last: */ 1571c6fdbd8SKent Overstreet struct bch_write_op op; 1581c6fdbd8SKent Overstreet }; 1591c6fdbd8SKent Overstreet 1601c6fdbd8SKent Overstreet struct dio_write { 1611c6fdbd8SKent Overstreet struct kiocb *req; 162182c7bbfSKent Overstreet struct address_space *mapping; 163182c7bbfSKent Overstreet struct bch_inode_info *inode; 164ed484030SKent Overstreet struct mm_struct *mm; 1651c6fdbd8SKent Overstreet unsigned loop:1, 1666b1b186aSKent Overstreet extending:1, 1671c6fdbd8SKent Overstreet sync:1, 168a1ee777bSKent Overstreet flush:1, 1691c6fdbd8SKent Overstreet free_iov:1; 1701c6fdbd8SKent Overstreet struct quota_res quota_res; 171042a1f26SKent Overstreet u64 written; 1721c6fdbd8SKent Overstreet 1731c6fdbd8SKent Overstreet struct iov_iter iter; 1741c6fdbd8SKent Overstreet struct iovec inline_vecs[2]; 1751c6fdbd8SKent Overstreet 1761c6fdbd8SKent Overstreet /* must be last: */ 1779a3df993SKent Overstreet struct bch_write_op op; 1781c6fdbd8SKent Overstreet }; 1791c6fdbd8SKent Overstreet 1801c6fdbd8SKent Overstreet struct dio_read { 1811c6fdbd8SKent Overstreet struct closure cl; 1821c6fdbd8SKent Overstreet struct kiocb *req; 1831c6fdbd8SKent Overstreet long ret; 184b4725cc1SKent Overstreet bool should_dirty; 1851c6fdbd8SKent Overstreet struct bch_read_bio rbio; 1861c6fdbd8SKent Overstreet }; 1871c6fdbd8SKent Overstreet 1881c6fdbd8SKent Overstreet /* pagecache_block must be held */ 189a023127aSKent Overstreet static noinline int write_invalidate_inode_pages_range(struct address_space *mapping, 1901c6fdbd8SKent Overstreet loff_t start, loff_t end) 1911c6fdbd8SKent Overstreet { 1921c6fdbd8SKent Overstreet int ret; 1931c6fdbd8SKent Overstreet 1941c6fdbd8SKent Overstreet /* 1951c6fdbd8SKent Overstreet * XXX: the way this is currently implemented, we can spin if a process 1961c6fdbd8SKent Overstreet * is continually redirtying a specific page 1971c6fdbd8SKent Overstreet */ 1981c6fdbd8SKent Overstreet do { 1991c6fdbd8SKent Overstreet if (!mapping->nrpages) 2001c6fdbd8SKent Overstreet return 0; 2011c6fdbd8SKent Overstreet 2021c6fdbd8SKent Overstreet ret = filemap_write_and_wait_range(mapping, start, end); 2031c6fdbd8SKent Overstreet if (ret) 2041c6fdbd8SKent Overstreet break; 2051c6fdbd8SKent Overstreet 2061c6fdbd8SKent Overstreet if (!mapping->nrpages) 2071c6fdbd8SKent Overstreet return 0; 2081c6fdbd8SKent Overstreet 2091c6fdbd8SKent Overstreet ret = invalidate_inode_pages2_range(mapping, 2101c6fdbd8SKent Overstreet start >> PAGE_SHIFT, 2111c6fdbd8SKent Overstreet end >> PAGE_SHIFT); 2121c6fdbd8SKent Overstreet } while (ret == -EBUSY); 2131c6fdbd8SKent Overstreet 2141c6fdbd8SKent Overstreet return ret; 2151c6fdbd8SKent Overstreet } 2161c6fdbd8SKent Overstreet 2171c6fdbd8SKent Overstreet /* quotas */ 2181c6fdbd8SKent Overstreet 2191c6fdbd8SKent Overstreet #ifdef CONFIG_BCACHEFS_QUOTA 2201c6fdbd8SKent Overstreet 2216b1b186aSKent Overstreet static void __bch2_quota_reservation_put(struct bch_fs *c, 2221c6fdbd8SKent Overstreet struct bch_inode_info *inode, 2231c6fdbd8SKent Overstreet struct quota_res *res) 2241c6fdbd8SKent Overstreet { 2251c6fdbd8SKent Overstreet BUG_ON(res->sectors > inode->ei_quota_reserved); 2261c6fdbd8SKent Overstreet 2271c6fdbd8SKent Overstreet bch2_quota_acct(c, inode->ei_qid, Q_SPC, 22826609b61SKent Overstreet -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); 2291c6fdbd8SKent Overstreet inode->ei_quota_reserved -= res->sectors; 2301c6fdbd8SKent Overstreet res->sectors = 0; 2311c6fdbd8SKent Overstreet } 2321c6fdbd8SKent Overstreet 2336b1b186aSKent Overstreet static void bch2_quota_reservation_put(struct bch_fs *c, 2346b1b186aSKent Overstreet struct bch_inode_info *inode, 2356b1b186aSKent Overstreet struct quota_res *res) 2366b1b186aSKent Overstreet { 2376b1b186aSKent Overstreet if (res->sectors) { 2386b1b186aSKent Overstreet mutex_lock(&inode->ei_quota_lock); 2396b1b186aSKent Overstreet __bch2_quota_reservation_put(c, inode, res); 2406b1b186aSKent Overstreet mutex_unlock(&inode->ei_quota_lock); 2416b1b186aSKent Overstreet } 2426b1b186aSKent Overstreet } 2436b1b186aSKent Overstreet 2441c6fdbd8SKent Overstreet static int bch2_quota_reservation_add(struct bch_fs *c, 2451c6fdbd8SKent Overstreet struct bch_inode_info *inode, 2461c6fdbd8SKent Overstreet struct quota_res *res, 247e8540e56SKent Overstreet u64 sectors, 2481c6fdbd8SKent Overstreet bool check_enospc) 2491c6fdbd8SKent Overstreet { 2501c6fdbd8SKent Overstreet int ret; 2511c6fdbd8SKent Overstreet 2521c6fdbd8SKent Overstreet mutex_lock(&inode->ei_quota_lock); 2531c6fdbd8SKent Overstreet ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, 25426609b61SKent Overstreet check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); 2551c6fdbd8SKent Overstreet if (likely(!ret)) { 2561c6fdbd8SKent Overstreet inode->ei_quota_reserved += sectors; 2571c6fdbd8SKent Overstreet res->sectors += sectors; 2581c6fdbd8SKent Overstreet } 2591c6fdbd8SKent Overstreet mutex_unlock(&inode->ei_quota_lock); 2601c6fdbd8SKent Overstreet 2611c6fdbd8SKent Overstreet return ret; 2621c6fdbd8SKent Overstreet } 2631c6fdbd8SKent Overstreet 2641c6fdbd8SKent Overstreet #else 2651c6fdbd8SKent Overstreet 2666b1b186aSKent Overstreet static void __bch2_quota_reservation_put(struct bch_fs *c, 2676b1b186aSKent Overstreet struct bch_inode_info *inode, 2686b1b186aSKent Overstreet struct quota_res *res) {} 2696b1b186aSKent Overstreet 2701c6fdbd8SKent Overstreet static void bch2_quota_reservation_put(struct bch_fs *c, 2711c6fdbd8SKent Overstreet struct bch_inode_info *inode, 2726b1b186aSKent Overstreet struct quota_res *res) {} 2731c6fdbd8SKent Overstreet 2741c6fdbd8SKent Overstreet static int bch2_quota_reservation_add(struct bch_fs *c, 2751c6fdbd8SKent Overstreet struct bch_inode_info *inode, 2761c6fdbd8SKent Overstreet struct quota_res *res, 2771c6fdbd8SKent Overstreet unsigned sectors, 2781c6fdbd8SKent Overstreet bool check_enospc) 2791c6fdbd8SKent Overstreet { 2801c6fdbd8SKent Overstreet return 0; 2811c6fdbd8SKent Overstreet } 2821c6fdbd8SKent Overstreet 2831c6fdbd8SKent Overstreet #endif 2841c6fdbd8SKent Overstreet 2851c6fdbd8SKent Overstreet /* i_size updates: */ 2861c6fdbd8SKent Overstreet 2872ea90048SKent Overstreet struct inode_new_size { 2882ea90048SKent Overstreet loff_t new_size; 2892ea90048SKent Overstreet u64 now; 2902ea90048SKent Overstreet unsigned fields; 2912ea90048SKent Overstreet }; 2922ea90048SKent Overstreet 2931c6fdbd8SKent Overstreet static int inode_set_size(struct bch_inode_info *inode, 2941c6fdbd8SKent Overstreet struct bch_inode_unpacked *bi, 2951c6fdbd8SKent Overstreet void *p) 2961c6fdbd8SKent Overstreet { 2972ea90048SKent Overstreet struct inode_new_size *s = p; 2981c6fdbd8SKent Overstreet 2992ea90048SKent Overstreet bi->bi_size = s->new_size; 3002ea90048SKent Overstreet if (s->fields & ATTR_ATIME) 3012ea90048SKent Overstreet bi->bi_atime = s->now; 3022ea90048SKent Overstreet if (s->fields & ATTR_MTIME) 3032ea90048SKent Overstreet bi->bi_mtime = s->now; 3042ea90048SKent Overstreet if (s->fields & ATTR_CTIME) 3052ea90048SKent Overstreet bi->bi_ctime = s->now; 3061c6fdbd8SKent Overstreet 3071c6fdbd8SKent Overstreet return 0; 3081c6fdbd8SKent Overstreet } 3091c6fdbd8SKent Overstreet 31076426098SKent Overstreet int __must_check bch2_write_inode_size(struct bch_fs *c, 3111c6fdbd8SKent Overstreet struct bch_inode_info *inode, 3122ea90048SKent Overstreet loff_t new_size, unsigned fields) 3131c6fdbd8SKent Overstreet { 3142ea90048SKent Overstreet struct inode_new_size s = { 3152ea90048SKent Overstreet .new_size = new_size, 3162ea90048SKent Overstreet .now = bch2_current_time(c), 3172ea90048SKent Overstreet .fields = fields, 3182ea90048SKent Overstreet }; 3192ea90048SKent Overstreet 3202ea90048SKent Overstreet return bch2_write_inode(c, inode, inode_set_size, &s, fields); 3211c6fdbd8SKent Overstreet } 3221c6fdbd8SKent Overstreet 3236b1b186aSKent Overstreet static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, 324190fa7afSKent Overstreet struct quota_res *quota_res, s64 sectors) 3251c6fdbd8SKent Overstreet { 326b33bf1bcSKent Overstreet bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, 327b33bf1bcSKent Overstreet "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", 328b33bf1bcSKent Overstreet inode->v.i_ino, (u64) inode->v.i_blocks, sectors, 329b33bf1bcSKent Overstreet inode->ei_inode.bi_sectors); 330b44a66a6SKent Overstreet inode->v.i_blocks += sectors; 331b44a66a6SKent Overstreet 3321c6fdbd8SKent Overstreet #ifdef CONFIG_BCACHEFS_QUOTA 3331c6fdbd8SKent Overstreet if (quota_res && sectors > 0) { 3341c6fdbd8SKent Overstreet BUG_ON(sectors > quota_res->sectors); 3351c6fdbd8SKent Overstreet BUG_ON(sectors > inode->ei_quota_reserved); 3361c6fdbd8SKent Overstreet 3371c6fdbd8SKent Overstreet quota_res->sectors -= sectors; 3381c6fdbd8SKent Overstreet inode->ei_quota_reserved -= sectors; 3391c6fdbd8SKent Overstreet } else { 34026609b61SKent Overstreet bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); 3411c6fdbd8SKent Overstreet } 3421c6fdbd8SKent Overstreet #endif 3436b1b186aSKent Overstreet } 3446b1b186aSKent Overstreet 3456b1b186aSKent Overstreet static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, 3466b1b186aSKent Overstreet struct quota_res *quota_res, s64 sectors) 3476b1b186aSKent Overstreet { 3486b1b186aSKent Overstreet if (sectors) { 3496b1b186aSKent Overstreet mutex_lock(&inode->ei_quota_lock); 3506b1b186aSKent Overstreet __i_sectors_acct(c, inode, quota_res, sectors); 3511c6fdbd8SKent Overstreet mutex_unlock(&inode->ei_quota_lock); 3521c6fdbd8SKent Overstreet } 3536b1b186aSKent Overstreet } 3541c6fdbd8SKent Overstreet 3551c6fdbd8SKent Overstreet /* page state: */ 3561c6fdbd8SKent Overstreet 3571c6fdbd8SKent Overstreet /* stored in page->private: */ 3581c6fdbd8SKent Overstreet 3593342ac13SKent Overstreet struct bch_folio_sector { 360b44a66a6SKent Overstreet /* Uncompressed, fully allocated replicas (or on disk reservation): */ 361b44a66a6SKent Overstreet unsigned nr_replicas:4; 362f81b648dSKent Overstreet 363b44a66a6SKent Overstreet /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ 364b44a66a6SKent Overstreet unsigned replicas_reserved:4; 3651c6fdbd8SKent Overstreet 366f57a6a5dSKent Overstreet /* i_sectors: */ 367f57a6a5dSKent Overstreet enum { 368f57a6a5dSKent Overstreet SECTOR_UNALLOCATED, 3692ba5d38bSKent Overstreet SECTOR_RESERVED, 370f57a6a5dSKent Overstreet SECTOR_DIRTY, 371b44a66a6SKent Overstreet SECTOR_DIRTY_RESERVED, 372f57a6a5dSKent Overstreet SECTOR_ALLOCATED, 373b44a66a6SKent Overstreet } state:8; 3741c6fdbd8SKent Overstreet }; 3751c6fdbd8SKent Overstreet 3763342ac13SKent Overstreet struct bch_folio { 3773826ee0bSKent Overstreet spinlock_t lock; 3787f5e31e1SKent Overstreet atomic_t write_count; 3793342ac13SKent Overstreet /* 3803342ac13SKent Overstreet * Is the sector state up to date with the btree? 3813342ac13SKent Overstreet * (Not the data itself) 3823342ac13SKent Overstreet */ 383e6ec361fSKent Overstreet bool uptodate; 3843342ac13SKent Overstreet struct bch_folio_sector s[PAGE_SECTORS]; 385f57a6a5dSKent Overstreet }; 386f57a6a5dSKent Overstreet 387*30bff594SKent Overstreet static inline struct bch_folio *__bch2_folio(struct folio *folio) 3881c6fdbd8SKent Overstreet { 389*30bff594SKent Overstreet return folio_has_private(folio) 390*30bff594SKent Overstreet ? (struct bch_folio *) folio_get_private(folio) 391f57a6a5dSKent Overstreet : NULL; 392f57a6a5dSKent Overstreet } 3931c6fdbd8SKent Overstreet 394*30bff594SKent Overstreet static inline struct bch_folio *bch2_folio(struct folio *folio) 395f57a6a5dSKent Overstreet { 396*30bff594SKent Overstreet EBUG_ON(!folio_test_locked(folio)); 3971c6fdbd8SKent Overstreet 398*30bff594SKent Overstreet return __bch2_folio(folio); 399f57a6a5dSKent Overstreet } 400f57a6a5dSKent Overstreet 401*30bff594SKent Overstreet /* for newly allocated folios: */ 402*30bff594SKent Overstreet static void __bch2_folio_release(struct folio *folio) 403f57a6a5dSKent Overstreet { 404*30bff594SKent Overstreet kfree(folio_detach_private(folio)); 405f57a6a5dSKent Overstreet } 406f57a6a5dSKent Overstreet 407*30bff594SKent Overstreet static void bch2_folio_release(struct folio *folio) 408f57a6a5dSKent Overstreet { 409*30bff594SKent Overstreet EBUG_ON(!folio_test_locked(folio)); 410*30bff594SKent Overstreet __bch2_folio_release(folio); 411f57a6a5dSKent Overstreet } 412f57a6a5dSKent Overstreet 413*30bff594SKent Overstreet /* for newly allocated folios: */ 414*30bff594SKent Overstreet static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) 415f57a6a5dSKent Overstreet { 4163342ac13SKent Overstreet struct bch_folio *s; 417f57a6a5dSKent Overstreet 418f57a6a5dSKent Overstreet s = kzalloc(sizeof(*s), GFP_NOFS|gfp); 419f57a6a5dSKent Overstreet if (!s) 420f57a6a5dSKent Overstreet return NULL; 421f57a6a5dSKent Overstreet 4223826ee0bSKent Overstreet spin_lock_init(&s->lock); 423*30bff594SKent Overstreet folio_attach_private(folio, s); 4241c6fdbd8SKent Overstreet return s; 4251c6fdbd8SKent Overstreet } 4261c6fdbd8SKent Overstreet 427*30bff594SKent Overstreet static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) 428f57a6a5dSKent Overstreet { 429*30bff594SKent Overstreet return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); 430f57a6a5dSKent Overstreet } 431f57a6a5dSKent Overstreet 43279203111SKent Overstreet static unsigned bkey_to_sector_state(struct bkey_s_c k) 433b44a66a6SKent Overstreet { 43479203111SKent Overstreet if (bkey_extent_is_reservation(k)) 435b44a66a6SKent Overstreet return SECTOR_RESERVED; 43679203111SKent Overstreet if (bkey_extent_is_allocation(k.k)) 437b44a66a6SKent Overstreet return SECTOR_ALLOCATED; 438b44a66a6SKent Overstreet return SECTOR_UNALLOCATED; 439b44a66a6SKent Overstreet } 440b44a66a6SKent Overstreet 441*30bff594SKent Overstreet static void __bch2_folio_set(struct folio *folio, 442e6ec361fSKent Overstreet unsigned pg_offset, unsigned pg_len, 443e6ec361fSKent Overstreet unsigned nr_ptrs, unsigned state) 444e6ec361fSKent Overstreet { 445*30bff594SKent Overstreet struct bch_folio *s = bch2_folio_create(folio, __GFP_NOFAIL); 446e6ec361fSKent Overstreet unsigned i; 447e6ec361fSKent Overstreet 448e6ec361fSKent Overstreet BUG_ON(pg_offset >= PAGE_SECTORS); 449e6ec361fSKent Overstreet BUG_ON(pg_offset + pg_len > PAGE_SECTORS); 450e6ec361fSKent Overstreet 451e6ec361fSKent Overstreet spin_lock(&s->lock); 452e6ec361fSKent Overstreet 453e6ec361fSKent Overstreet for (i = pg_offset; i < pg_offset + pg_len; i++) { 454e6ec361fSKent Overstreet s->s[i].nr_replicas = nr_ptrs; 455e6ec361fSKent Overstreet s->s[i].state = state; 456e6ec361fSKent Overstreet } 457e6ec361fSKent Overstreet 458e6ec361fSKent Overstreet if (i == PAGE_SECTORS) 459e6ec361fSKent Overstreet s->uptodate = true; 460e6ec361fSKent Overstreet 461e6ec361fSKent Overstreet spin_unlock(&s->lock); 462e6ec361fSKent Overstreet } 463e6ec361fSKent Overstreet 4643342ac13SKent Overstreet /* 4653342ac13SKent Overstreet * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the 4663342ac13SKent Overstreet * extents btree: 4673342ac13SKent Overstreet */ 4683342ac13SKent Overstreet static int bch2_folio_set(struct bch_fs *c, subvol_inum inum, 469*30bff594SKent Overstreet struct folio **folios, unsigned nr_folios) 470e6ec361fSKent Overstreet { 471e6ec361fSKent Overstreet struct btree_trans trans; 472e6ec361fSKent Overstreet struct btree_iter iter; 473e6ec361fSKent Overstreet struct bkey_s_c k; 474*30bff594SKent Overstreet u64 offset = folio_sector(folios[0]); 475*30bff594SKent Overstreet unsigned folio_idx = 0; 476e6ec361fSKent Overstreet u32 snapshot; 477e6ec361fSKent Overstreet int ret; 478e6ec361fSKent Overstreet 479e6ec361fSKent Overstreet bch2_trans_init(&trans, c, 0, 0); 480e6ec361fSKent Overstreet retry: 481e6ec361fSKent Overstreet bch2_trans_begin(&trans); 482e6ec361fSKent Overstreet 483e6ec361fSKent Overstreet ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); 484e6ec361fSKent Overstreet if (ret) 485e6ec361fSKent Overstreet goto err; 486e6ec361fSKent Overstreet 487e6ec361fSKent Overstreet for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, 488e6ec361fSKent Overstreet SPOS(inum.inum, offset, snapshot), 489e6ec361fSKent Overstreet BTREE_ITER_SLOTS, k, ret) { 490e6ec361fSKent Overstreet unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); 49179203111SKent Overstreet unsigned state = bkey_to_sector_state(k); 492e6ec361fSKent Overstreet 493*30bff594SKent Overstreet while (folio_idx < nr_folios) { 494*30bff594SKent Overstreet struct folio *folio = folios[folio_idx]; 495*30bff594SKent Overstreet u64 folio_start = folio_sector(folio); 496*30bff594SKent Overstreet u64 folio_end = folio_end_sector(folio); 497*30bff594SKent Overstreet unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start; 498*30bff594SKent Overstreet unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start; 499e6ec361fSKent Overstreet 500*30bff594SKent Overstreet BUG_ON(k.k->p.offset < folio_start); 501*30bff594SKent Overstreet BUG_ON(bkey_start_offset(k.k) > folio_end); 502e6ec361fSKent Overstreet 503*30bff594SKent Overstreet if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) 504*30bff594SKent Overstreet __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); 505e6ec361fSKent Overstreet 506*30bff594SKent Overstreet if (k.k->p.offset < folio_end) 507e6ec361fSKent Overstreet break; 508*30bff594SKent Overstreet folio_idx++; 509e6ec361fSKent Overstreet } 510e6ec361fSKent Overstreet 511*30bff594SKent Overstreet if (folio_idx == nr_folios) 512e6ec361fSKent Overstreet break; 513e6ec361fSKent Overstreet } 514e6ec361fSKent Overstreet 515e6ec361fSKent Overstreet offset = iter.pos.offset; 516e6ec361fSKent Overstreet bch2_trans_iter_exit(&trans, &iter); 517e6ec361fSKent Overstreet err: 518549d173cSKent Overstreet if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 519e6ec361fSKent Overstreet goto retry; 520e6ec361fSKent Overstreet bch2_trans_exit(&trans); 521e6ec361fSKent Overstreet 522e6ec361fSKent Overstreet return ret; 523e6ec361fSKent Overstreet } 524e6ec361fSKent Overstreet 525b44a66a6SKent Overstreet static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) 526b44a66a6SKent Overstreet { 527b44a66a6SKent Overstreet struct bvec_iter iter; 528b44a66a6SKent Overstreet struct bio_vec bv; 529b44a66a6SKent Overstreet unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v 530b44a66a6SKent Overstreet ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); 53179203111SKent Overstreet unsigned state = bkey_to_sector_state(k); 532b44a66a6SKent Overstreet 533e6ec361fSKent Overstreet bio_for_each_segment(bv, bio, iter) 534*30bff594SKent Overstreet __bch2_folio_set(page_folio(bv.bv_page), bv.bv_offset >> 9, 535e6ec361fSKent Overstreet bv.bv_len >> 9, nr_ptrs, state); 536b44a66a6SKent Overstreet } 537b44a66a6SKent Overstreet 538dcfc593fSKent Overstreet static void mark_pagecache_unallocated(struct bch_inode_info *inode, 539dcfc593fSKent Overstreet u64 start, u64 end) 540dcfc593fSKent Overstreet { 541dcfc593fSKent Overstreet pgoff_t index = start >> PAGE_SECTORS_SHIFT; 542dcfc593fSKent Overstreet pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; 543dcfc593fSKent Overstreet struct folio_batch fbatch; 544dcfc593fSKent Overstreet unsigned i, j; 545dcfc593fSKent Overstreet 546dcfc593fSKent Overstreet if (end <= start) 547dcfc593fSKent Overstreet return; 548dcfc593fSKent Overstreet 549dcfc593fSKent Overstreet folio_batch_init(&fbatch); 550dcfc593fSKent Overstreet 551dcfc593fSKent Overstreet while (filemap_get_folios(inode->v.i_mapping, 552dcfc593fSKent Overstreet &index, end_index, &fbatch)) { 553dcfc593fSKent Overstreet for (i = 0; i < folio_batch_count(&fbatch); i++) { 554dcfc593fSKent Overstreet struct folio *folio = fbatch.folios[i]; 555*30bff594SKent Overstreet u64 folio_start = folio->index << PAGE_SECTORS_SHIFT; 556*30bff594SKent Overstreet u64 folio_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; 557*30bff594SKent Overstreet unsigned folio_offset = max(start, folio_start) - folio_start; 558*30bff594SKent Overstreet unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; 5593342ac13SKent Overstreet struct bch_folio *s; 560dcfc593fSKent Overstreet 561*30bff594SKent Overstreet BUG_ON(end <= folio_start); 562*30bff594SKent Overstreet BUG_ON(folio_offset >= PAGE_SECTORS); 563*30bff594SKent Overstreet BUG_ON(folio_offset + folio_len > PAGE_SECTORS); 564dcfc593fSKent Overstreet 565dcfc593fSKent Overstreet folio_lock(folio); 566*30bff594SKent Overstreet s = bch2_folio(folio); 567dcfc593fSKent Overstreet 568dcfc593fSKent Overstreet if (s) { 569dcfc593fSKent Overstreet spin_lock(&s->lock); 570*30bff594SKent Overstreet for (j = folio_offset; j < folio_offset + folio_len; j++) 571dcfc593fSKent Overstreet s->s[j].nr_replicas = 0; 572dcfc593fSKent Overstreet spin_unlock(&s->lock); 573dcfc593fSKent Overstreet } 574dcfc593fSKent Overstreet 575dcfc593fSKent Overstreet folio_unlock(folio); 576dcfc593fSKent Overstreet } 577dcfc593fSKent Overstreet folio_batch_release(&fbatch); 578dcfc593fSKent Overstreet cond_resched(); 579dcfc593fSKent Overstreet } 580dcfc593fSKent Overstreet } 581dcfc593fSKent Overstreet 582dcfc593fSKent Overstreet static void mark_pagecache_reserved(struct bch_inode_info *inode, 583dcfc593fSKent Overstreet u64 start, u64 end) 584dcfc593fSKent Overstreet { 585dcfc593fSKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 586dcfc593fSKent Overstreet pgoff_t index = start >> PAGE_SECTORS_SHIFT; 587dcfc593fSKent Overstreet pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; 588dcfc593fSKent Overstreet struct folio_batch fbatch; 589dcfc593fSKent Overstreet s64 i_sectors_delta = 0; 590dcfc593fSKent Overstreet unsigned i, j; 591dcfc593fSKent Overstreet 592dcfc593fSKent Overstreet if (end <= start) 593dcfc593fSKent Overstreet return; 594dcfc593fSKent Overstreet 595dcfc593fSKent Overstreet folio_batch_init(&fbatch); 596dcfc593fSKent Overstreet 597dcfc593fSKent Overstreet while (filemap_get_folios(inode->v.i_mapping, 598dcfc593fSKent Overstreet &index, end_index, &fbatch)) { 599dcfc593fSKent Overstreet for (i = 0; i < folio_batch_count(&fbatch); i++) { 600dcfc593fSKent Overstreet struct folio *folio = fbatch.folios[i]; 601*30bff594SKent Overstreet u64 folio_start = folio->index << PAGE_SECTORS_SHIFT; 602*30bff594SKent Overstreet u64 folio_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; 603*30bff594SKent Overstreet unsigned folio_offset = max(start, folio_start) - folio_start; 604*30bff594SKent Overstreet unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; 6053342ac13SKent Overstreet struct bch_folio *s; 606dcfc593fSKent Overstreet 607*30bff594SKent Overstreet BUG_ON(end <= folio_start); 608*30bff594SKent Overstreet BUG_ON(folio_offset >= PAGE_SECTORS); 609*30bff594SKent Overstreet BUG_ON(folio_offset + folio_len > PAGE_SECTORS); 610dcfc593fSKent Overstreet 611dcfc593fSKent Overstreet folio_lock(folio); 612*30bff594SKent Overstreet s = bch2_folio(folio); 613dcfc593fSKent Overstreet 614dcfc593fSKent Overstreet if (s) { 615dcfc593fSKent Overstreet spin_lock(&s->lock); 616*30bff594SKent Overstreet for (j = folio_offset; j < folio_offset + folio_len; j++) 617dcfc593fSKent Overstreet switch (s->s[j].state) { 618dcfc593fSKent Overstreet case SECTOR_UNALLOCATED: 619dcfc593fSKent Overstreet s->s[j].state = SECTOR_RESERVED; 620dcfc593fSKent Overstreet break; 621dcfc593fSKent Overstreet case SECTOR_DIRTY: 622dcfc593fSKent Overstreet s->s[j].state = SECTOR_DIRTY_RESERVED; 623dcfc593fSKent Overstreet i_sectors_delta--; 624dcfc593fSKent Overstreet break; 625dcfc593fSKent Overstreet default: 626dcfc593fSKent Overstreet break; 627dcfc593fSKent Overstreet } 628dcfc593fSKent Overstreet spin_unlock(&s->lock); 629dcfc593fSKent Overstreet } 630dcfc593fSKent Overstreet 631dcfc593fSKent Overstreet folio_unlock(folio); 632dcfc593fSKent Overstreet } 633dcfc593fSKent Overstreet folio_batch_release(&fbatch); 634dcfc593fSKent Overstreet cond_resched(); 635dcfc593fSKent Overstreet } 636dcfc593fSKent Overstreet 637dcfc593fSKent Overstreet i_sectors_acct(c, inode, NULL, i_sectors_delta); 638dcfc593fSKent Overstreet } 639dcfc593fSKent Overstreet 640e1036a2aSKent Overstreet static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) 641e1036a2aSKent Overstreet { 642e1036a2aSKent Overstreet /* XXX: this should not be open coded */ 643e1036a2aSKent Overstreet return inode->ei_inode.bi_data_replicas 644e1036a2aSKent Overstreet ? inode->ei_inode.bi_data_replicas - 1 645e1036a2aSKent Overstreet : c->opts.data_replicas; 646e1036a2aSKent Overstreet } 647e1036a2aSKent Overstreet 6483342ac13SKent Overstreet static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, 649f57a6a5dSKent Overstreet unsigned nr_replicas) 650f57a6a5dSKent Overstreet { 651f57a6a5dSKent Overstreet return max(0, (int) nr_replicas - 652f57a6a5dSKent Overstreet s->nr_replicas - 653f57a6a5dSKent Overstreet s->replicas_reserved); 654f57a6a5dSKent Overstreet } 655f57a6a5dSKent Overstreet 656*30bff594SKent Overstreet static int bch2_get_folio_disk_reservation(struct bch_fs *c, 657f57a6a5dSKent Overstreet struct bch_inode_info *inode, 658*30bff594SKent Overstreet struct folio *folio, bool check_enospc) 6591c6fdbd8SKent Overstreet { 660*30bff594SKent Overstreet struct bch_folio *s = bch2_folio_create(folio, 0); 661e1036a2aSKent Overstreet unsigned nr_replicas = inode_nr_replicas(c, inode); 662f57a6a5dSKent Overstreet struct disk_reservation disk_res = { 0 }; 663f57a6a5dSKent Overstreet unsigned i, disk_res_sectors = 0; 664f81b648dSKent Overstreet int ret; 6651c6fdbd8SKent Overstreet 666f57a6a5dSKent Overstreet if (!s) 667f57a6a5dSKent Overstreet return -ENOMEM; 6681c6fdbd8SKent Overstreet 669f57a6a5dSKent Overstreet for (i = 0; i < ARRAY_SIZE(s->s); i++) 670f57a6a5dSKent Overstreet disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); 671f57a6a5dSKent Overstreet 672f57a6a5dSKent Overstreet if (!disk_res_sectors) 673f57a6a5dSKent Overstreet return 0; 674f57a6a5dSKent Overstreet 675f57a6a5dSKent Overstreet ret = bch2_disk_reservation_get(c, &disk_res, 676f57a6a5dSKent Overstreet disk_res_sectors, 1, 677f57a6a5dSKent Overstreet !check_enospc 678f57a6a5dSKent Overstreet ? BCH_DISK_RESERVATION_NOFAIL 679f57a6a5dSKent Overstreet : 0); 6801c6fdbd8SKent Overstreet if (unlikely(ret)) 681f81b648dSKent Overstreet return ret; 682f81b648dSKent Overstreet 683f57a6a5dSKent Overstreet for (i = 0; i < ARRAY_SIZE(s->s); i++) 684f57a6a5dSKent Overstreet s->s[i].replicas_reserved += 685f57a6a5dSKent Overstreet sectors_to_reserve(&s->s[i], nr_replicas); 686f57a6a5dSKent Overstreet 687f57a6a5dSKent Overstreet return 0; 6881c6fdbd8SKent Overstreet } 6891c6fdbd8SKent Overstreet 690*30bff594SKent Overstreet struct bch2_folio_reservation { 691d1542e03SKent Overstreet struct disk_reservation disk; 692d1542e03SKent Overstreet struct quota_res quota; 693d1542e03SKent Overstreet }; 694d1542e03SKent Overstreet 695*30bff594SKent Overstreet static void bch2_folio_reservation_init(struct bch_fs *c, 696f57a6a5dSKent Overstreet struct bch_inode_info *inode, 697*30bff594SKent Overstreet struct bch2_folio_reservation *res) 698d1542e03SKent Overstreet { 699d1542e03SKent Overstreet memset(res, 0, sizeof(*res)); 700d1542e03SKent Overstreet 701d1542e03SKent Overstreet res->disk.nr_replicas = inode_nr_replicas(c, inode); 702d1542e03SKent Overstreet } 703d1542e03SKent Overstreet 704*30bff594SKent Overstreet static void bch2_folio_reservation_put(struct bch_fs *c, 705d1542e03SKent Overstreet struct bch_inode_info *inode, 706*30bff594SKent Overstreet struct bch2_folio_reservation *res) 707d1542e03SKent Overstreet { 708d1542e03SKent Overstreet bch2_disk_reservation_put(c, &res->disk); 709d1542e03SKent Overstreet bch2_quota_reservation_put(c, inode, &res->quota); 710d1542e03SKent Overstreet } 711d1542e03SKent Overstreet 712*30bff594SKent Overstreet static int bch2_folio_reservation_get(struct bch_fs *c, 713*30bff594SKent Overstreet struct bch_inode_info *inode, 714*30bff594SKent Overstreet struct folio *folio, 715*30bff594SKent Overstreet struct bch2_folio_reservation *res, 716bd954215SKent Overstreet unsigned offset, unsigned len) 717f57a6a5dSKent Overstreet { 718*30bff594SKent Overstreet struct bch_folio *s = bch2_folio_create(folio, 0); 719d1542e03SKent Overstreet unsigned i, disk_sectors = 0, quota_sectors = 0; 720f57a6a5dSKent Overstreet int ret; 721f57a6a5dSKent Overstreet 722f57a6a5dSKent Overstreet if (!s) 723f57a6a5dSKent Overstreet return -ENOMEM; 724f57a6a5dSKent Overstreet 725e6ec361fSKent Overstreet BUG_ON(!s->uptodate); 726e6ec361fSKent Overstreet 7274b0a66d5SKent Overstreet for (i = round_down(offset, block_bytes(c)) >> 9; 7284b0a66d5SKent Overstreet i < round_up(offset + len, block_bytes(c)) >> 9; 729d1542e03SKent Overstreet i++) { 730d1542e03SKent Overstreet disk_sectors += sectors_to_reserve(&s->s[i], 731d1542e03SKent Overstreet res->disk.nr_replicas); 732d1542e03SKent Overstreet quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; 7331c6fdbd8SKent Overstreet } 7341c6fdbd8SKent Overstreet 735d1542e03SKent Overstreet if (disk_sectors) { 736bd954215SKent Overstreet ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); 737d1542e03SKent Overstreet if (unlikely(ret)) 738d1542e03SKent Overstreet return ret; 739d1542e03SKent Overstreet } 740d1542e03SKent Overstreet 741d1542e03SKent Overstreet if (quota_sectors) { 742d1542e03SKent Overstreet ret = bch2_quota_reservation_add(c, inode, &res->quota, 743bd954215SKent Overstreet quota_sectors, true); 744d1542e03SKent Overstreet if (unlikely(ret)) { 745d1542e03SKent Overstreet struct disk_reservation tmp = { 746d1542e03SKent Overstreet .sectors = disk_sectors 747d1542e03SKent Overstreet }; 748d1542e03SKent Overstreet 749d1542e03SKent Overstreet bch2_disk_reservation_put(c, &tmp); 750d1542e03SKent Overstreet res->disk.sectors -= disk_sectors; 751d1542e03SKent Overstreet return ret; 752d1542e03SKent Overstreet } 753d1542e03SKent Overstreet } 754d1542e03SKent Overstreet 755d1542e03SKent Overstreet return 0; 756f57a6a5dSKent Overstreet } 757f57a6a5dSKent Overstreet 758*30bff594SKent Overstreet static void bch2_clear_folio_bits(struct folio *folio) 7591c6fdbd8SKent Overstreet { 760*30bff594SKent Overstreet struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); 7611c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 762*30bff594SKent Overstreet struct bch_folio *s = bch2_folio(folio); 763d1542e03SKent Overstreet struct disk_reservation disk_res = { 0 }; 764f57a6a5dSKent Overstreet int i, dirty_sectors = 0; 7651c6fdbd8SKent Overstreet 766f57a6a5dSKent Overstreet if (!s) 7671c6fdbd8SKent Overstreet return; 7681c6fdbd8SKent Overstreet 769*30bff594SKent Overstreet EBUG_ON(!folio_test_locked(folio)); 770*30bff594SKent Overstreet EBUG_ON(folio_test_writeback(folio)); 7713826ee0bSKent Overstreet 772f57a6a5dSKent Overstreet for (i = 0; i < ARRAY_SIZE(s->s); i++) { 773d1542e03SKent Overstreet disk_res.sectors += s->s[i].replicas_reserved; 774d1542e03SKent Overstreet s->s[i].replicas_reserved = 0; 775d1542e03SKent Overstreet 776b44a66a6SKent Overstreet switch (s->s[i].state) { 777b44a66a6SKent Overstreet case SECTOR_DIRTY: 778f57a6a5dSKent Overstreet s->s[i].state = SECTOR_UNALLOCATED; 779b44a66a6SKent Overstreet --dirty_sectors; 780b44a66a6SKent Overstreet break; 781b44a66a6SKent Overstreet case SECTOR_DIRTY_RESERVED: 782b44a66a6SKent Overstreet s->s[i].state = SECTOR_RESERVED; 783b44a66a6SKent Overstreet break; 784b44a66a6SKent Overstreet default: 785b44a66a6SKent Overstreet break; 786f57a6a5dSKent Overstreet } 787f57a6a5dSKent Overstreet } 788adfcfaf0SKent Overstreet 789d1542e03SKent Overstreet bch2_disk_reservation_put(c, &disk_res); 790d1542e03SKent Overstreet 791b44a66a6SKent Overstreet i_sectors_acct(c, inode, NULL, dirty_sectors); 792adfcfaf0SKent Overstreet 793*30bff594SKent Overstreet bch2_folio_release(folio); 7941c6fdbd8SKent Overstreet } 7951c6fdbd8SKent Overstreet 796*30bff594SKent Overstreet static void bch2_set_folio_dirty(struct bch_fs *c, 797*30bff594SKent Overstreet struct bch_inode_info *inode, 798*30bff594SKent Overstreet struct folio *folio, 799*30bff594SKent Overstreet struct bch2_folio_reservation *res, 800d1542e03SKent Overstreet unsigned offset, unsigned len) 8011c6fdbd8SKent Overstreet { 802*30bff594SKent Overstreet struct bch_folio *s = bch2_folio(folio); 803f57a6a5dSKent Overstreet unsigned i, dirty_sectors = 0; 8041c6fdbd8SKent Overstreet 805*30bff594SKent Overstreet WARN_ON((u64) folio_pos(folio) + offset + len > 806877dfb34SKent Overstreet round_up((u64) i_size_read(&inode->v), block_bytes(c))); 807fb472ac5SKent Overstreet 8083826ee0bSKent Overstreet spin_lock(&s->lock); 8093826ee0bSKent Overstreet 8104b0a66d5SKent Overstreet for (i = round_down(offset, block_bytes(c)) >> 9; 8114b0a66d5SKent Overstreet i < round_up(offset + len, block_bytes(c)) >> 9; 812d1542e03SKent Overstreet i++) { 813d1542e03SKent Overstreet unsigned sectors = sectors_to_reserve(&s->s[i], 814d1542e03SKent Overstreet res->disk.nr_replicas); 8151c6fdbd8SKent Overstreet 816406d6d5aSKent Overstreet /* 817406d6d5aSKent Overstreet * This can happen if we race with the error path in 818406d6d5aSKent Overstreet * bch2_writepage_io_done(): 819406d6d5aSKent Overstreet */ 820406d6d5aSKent Overstreet sectors = min_t(unsigned, sectors, res->disk.sectors); 821406d6d5aSKent Overstreet 822d1542e03SKent Overstreet s->s[i].replicas_reserved += sectors; 823d1542e03SKent Overstreet res->disk.sectors -= sectors; 824adfcfaf0SKent Overstreet 825b44a66a6SKent Overstreet switch (s->s[i].state) { 826b44a66a6SKent Overstreet case SECTOR_UNALLOCATED: 827b44a66a6SKent Overstreet s->s[i].state = SECTOR_DIRTY; 828f57a6a5dSKent Overstreet dirty_sectors++; 829b44a66a6SKent Overstreet break; 830b44a66a6SKent Overstreet case SECTOR_RESERVED: 831b44a66a6SKent Overstreet s->s[i].state = SECTOR_DIRTY_RESERVED; 832b44a66a6SKent Overstreet break; 833b44a66a6SKent Overstreet default: 834b44a66a6SKent Overstreet break; 835b44a66a6SKent Overstreet } 836f57a6a5dSKent Overstreet } 837f57a6a5dSKent Overstreet 8383826ee0bSKent Overstreet spin_unlock(&s->lock); 8393826ee0bSKent Overstreet 840d1542e03SKent Overstreet i_sectors_acct(c, inode, &res->quota, dirty_sectors); 8411c6fdbd8SKent Overstreet 842*30bff594SKent Overstreet if (!folio_test_dirty(folio)) 843*30bff594SKent Overstreet filemap_dirty_folio(inode->v.i_mapping, folio); 8441c6fdbd8SKent Overstreet } 8451c6fdbd8SKent Overstreet 8461c6fdbd8SKent Overstreet vm_fault_t bch2_page_fault(struct vm_fault *vmf) 8471c6fdbd8SKent Overstreet { 8481c6fdbd8SKent Overstreet struct file *file = vmf->vma->vm_file; 849eb8e6e9cSKent Overstreet struct address_space *mapping = file->f_mapping; 850eb8e6e9cSKent Overstreet struct address_space *fdm = faults_disabled_mapping(); 8511c6fdbd8SKent Overstreet struct bch_inode_info *inode = file_bch_inode(file); 8521c6fdbd8SKent Overstreet int ret; 8531c6fdbd8SKent Overstreet 854eb8e6e9cSKent Overstreet if (fdm == mapping) 855eb8e6e9cSKent Overstreet return VM_FAULT_SIGBUS; 856eb8e6e9cSKent Overstreet 857eb8e6e9cSKent Overstreet /* Lock ordering: */ 858eb8e6e9cSKent Overstreet if (fdm > mapping) { 859eb8e6e9cSKent Overstreet struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); 860eb8e6e9cSKent Overstreet 861a7ecd30cSKent Overstreet if (bch2_pagecache_add_tryget(inode)) 862eb8e6e9cSKent Overstreet goto got_lock; 863eb8e6e9cSKent Overstreet 864a7ecd30cSKent Overstreet bch2_pagecache_block_put(fdm_host); 865eb8e6e9cSKent Overstreet 866a7ecd30cSKent Overstreet bch2_pagecache_add_get(inode); 867a7ecd30cSKent Overstreet bch2_pagecache_add_put(inode); 868eb8e6e9cSKent Overstreet 869a7ecd30cSKent Overstreet bch2_pagecache_block_get(fdm_host); 870eb8e6e9cSKent Overstreet 871eb8e6e9cSKent Overstreet /* Signal that lock has been dropped: */ 872eb8e6e9cSKent Overstreet set_fdm_dropped_locks(); 873eb8e6e9cSKent Overstreet return VM_FAULT_SIGBUS; 874eb8e6e9cSKent Overstreet } 875eb8e6e9cSKent Overstreet 876a7ecd30cSKent Overstreet bch2_pagecache_add_get(inode); 877eb8e6e9cSKent Overstreet got_lock: 8781c6fdbd8SKent Overstreet ret = filemap_fault(vmf); 879a7ecd30cSKent Overstreet bch2_pagecache_add_put(inode); 8801c6fdbd8SKent Overstreet 8811c6fdbd8SKent Overstreet return ret; 8821c6fdbd8SKent Overstreet } 8831c6fdbd8SKent Overstreet 8841c6fdbd8SKent Overstreet vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) 8851c6fdbd8SKent Overstreet { 886*30bff594SKent Overstreet struct folio *folio = page_folio(vmf->page); 8871c6fdbd8SKent Overstreet struct file *file = vmf->vma->vm_file; 8881c6fdbd8SKent Overstreet struct bch_inode_info *inode = file_bch_inode(file); 8891c6fdbd8SKent Overstreet struct address_space *mapping = file->f_mapping; 8901c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 891*30bff594SKent Overstreet struct bch2_folio_reservation res; 8926cc3535dSKent Overstreet unsigned len; 8936cc3535dSKent Overstreet loff_t isize; 894e6ec361fSKent Overstreet int ret; 8951c6fdbd8SKent Overstreet 896*30bff594SKent Overstreet bch2_folio_reservation_init(c, inode, &res); 897d1542e03SKent Overstreet 8981c6fdbd8SKent Overstreet sb_start_pagefault(inode->v.i_sb); 8991c6fdbd8SKent Overstreet file_update_time(file); 9001c6fdbd8SKent Overstreet 9011c6fdbd8SKent Overstreet /* 9021c6fdbd8SKent Overstreet * Not strictly necessary, but helps avoid dio writes livelocking in 9031c6fdbd8SKent Overstreet * write_invalidate_inode_pages_range() - can drop this if/when we get 9041c6fdbd8SKent Overstreet * a write_invalidate_inode_pages_range() that works without dropping 9051c6fdbd8SKent Overstreet * page lock before invalidating page 9061c6fdbd8SKent Overstreet */ 907a7ecd30cSKent Overstreet bch2_pagecache_add_get(inode); 9081c6fdbd8SKent Overstreet 909*30bff594SKent Overstreet folio_lock(folio); 9106cc3535dSKent Overstreet isize = i_size_read(&inode->v); 9116cc3535dSKent Overstreet 912*30bff594SKent Overstreet if (folio->mapping != mapping || folio_pos(folio) >= isize) { 913*30bff594SKent Overstreet folio_unlock(folio); 9141c6fdbd8SKent Overstreet ret = VM_FAULT_NOPAGE; 9151c6fdbd8SKent Overstreet goto out; 9161c6fdbd8SKent Overstreet } 9171c6fdbd8SKent Overstreet 918*30bff594SKent Overstreet len = min_t(loff_t, PAGE_SIZE, isize - folio_pos(folio)); 9196cc3535dSKent Overstreet 920*30bff594SKent Overstreet if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) { 921*30bff594SKent Overstreet if (bch2_folio_set(c, inode_inum(inode), &folio, 1)) { 922*30bff594SKent Overstreet folio_unlock(folio); 923e6ec361fSKent Overstreet ret = VM_FAULT_SIGBUS; 924e6ec361fSKent Overstreet goto out; 925e6ec361fSKent Overstreet } 926e6ec361fSKent Overstreet } 927e6ec361fSKent Overstreet 928*30bff594SKent Overstreet if (bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { 929*30bff594SKent Overstreet folio_unlock(folio); 9301c6fdbd8SKent Overstreet ret = VM_FAULT_SIGBUS; 9311c6fdbd8SKent Overstreet goto out; 9321c6fdbd8SKent Overstreet } 9331c6fdbd8SKent Overstreet 934*30bff594SKent Overstreet bch2_set_folio_dirty(c, inode, folio, &res, 0, len); 935*30bff594SKent Overstreet bch2_folio_reservation_put(c, inode, &res); 9361b783a69SKent Overstreet 937*30bff594SKent Overstreet folio_wait_stable(folio); 938e6ec361fSKent Overstreet ret = VM_FAULT_LOCKED; 9391c6fdbd8SKent Overstreet out: 940a7ecd30cSKent Overstreet bch2_pagecache_add_put(inode); 9411c6fdbd8SKent Overstreet sb_end_pagefault(inode->v.i_sb); 942d1542e03SKent Overstreet 9431c6fdbd8SKent Overstreet return ret; 9441c6fdbd8SKent Overstreet } 9451c6fdbd8SKent Overstreet 9461c6fdbd8SKent Overstreet void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) 9471c6fdbd8SKent Overstreet { 9481c6fdbd8SKent Overstreet if (offset || length < folio_size(folio)) 9491c6fdbd8SKent Overstreet return; 9501c6fdbd8SKent Overstreet 951*30bff594SKent Overstreet bch2_clear_folio_bits(folio); 9521c6fdbd8SKent Overstreet } 9531c6fdbd8SKent Overstreet 9541c6fdbd8SKent Overstreet bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) 9551c6fdbd8SKent Overstreet { 956a6d90385SKent Overstreet if (folio_test_dirty(folio) || folio_test_writeback(folio)) 9571c6fdbd8SKent Overstreet return false; 9581c6fdbd8SKent Overstreet 959*30bff594SKent Overstreet bch2_clear_folio_bits(folio); 9601c6fdbd8SKent Overstreet return true; 9611c6fdbd8SKent Overstreet } 9621c6fdbd8SKent Overstreet 9631c6fdbd8SKent Overstreet /* readpage(s): */ 9641c6fdbd8SKent Overstreet 9651c6fdbd8SKent Overstreet static void bch2_readpages_end_io(struct bio *bio) 9661c6fdbd8SKent Overstreet { 967*30bff594SKent Overstreet struct folio_iter fi; 9681c6fdbd8SKent Overstreet 969*30bff594SKent Overstreet bio_for_each_folio_all(fi, bio) { 9701c6fdbd8SKent Overstreet if (!bio->bi_status) { 971*30bff594SKent Overstreet folio_mark_uptodate(fi.folio); 9721c6fdbd8SKent Overstreet } else { 973*30bff594SKent Overstreet folio_clear_uptodate(fi.folio); 974*30bff594SKent Overstreet folio_set_error(fi.folio); 9751c6fdbd8SKent Overstreet } 976*30bff594SKent Overstreet folio_unlock(fi.folio); 9771c6fdbd8SKent Overstreet } 9781c6fdbd8SKent Overstreet 9791c6fdbd8SKent Overstreet bio_put(bio); 9801c6fdbd8SKent Overstreet } 9811c6fdbd8SKent Overstreet 9821c6fdbd8SKent Overstreet struct readpages_iter { 9831c6fdbd8SKent Overstreet struct address_space *mapping; 9841c6fdbd8SKent Overstreet struct page **pages; 9851c6fdbd8SKent Overstreet unsigned nr_pages; 9861c6fdbd8SKent Overstreet unsigned idx; 9871c6fdbd8SKent Overstreet pgoff_t offset; 9881c6fdbd8SKent Overstreet }; 9891c6fdbd8SKent Overstreet 9901c6fdbd8SKent Overstreet static int readpages_iter_init(struct readpages_iter *iter, 9911c6fdbd8SKent Overstreet struct readahead_control *ractl) 9921c6fdbd8SKent Overstreet { 9931c6fdbd8SKent Overstreet unsigned i, nr_pages = readahead_count(ractl); 9941c6fdbd8SKent Overstreet 9951c6fdbd8SKent Overstreet memset(iter, 0, sizeof(*iter)); 9961c6fdbd8SKent Overstreet 9971c6fdbd8SKent Overstreet iter->mapping = ractl->mapping; 9981c6fdbd8SKent Overstreet iter->offset = readahead_index(ractl); 9991c6fdbd8SKent Overstreet iter->nr_pages = nr_pages; 10001c6fdbd8SKent Overstreet 10011c6fdbd8SKent Overstreet iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); 10021c6fdbd8SKent Overstreet if (!iter->pages) 10031c6fdbd8SKent Overstreet return -ENOMEM; 10041c6fdbd8SKent Overstreet 100589931472SKent Overstreet nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); 10061c6fdbd8SKent Overstreet for (i = 0; i < nr_pages; i++) { 1007*30bff594SKent Overstreet __bch2_folio_create(page_folio(iter->pages[i]), __GFP_NOFAIL); 10081c6fdbd8SKent Overstreet put_page(iter->pages[i]); 10091c6fdbd8SKent Overstreet } 10101c6fdbd8SKent Overstreet 10111c6fdbd8SKent Overstreet return 0; 10121c6fdbd8SKent Overstreet } 10131c6fdbd8SKent Overstreet 1014*30bff594SKent Overstreet static inline struct folio *readpage_iter_next(struct readpages_iter *iter) 10151c6fdbd8SKent Overstreet { 10161c6fdbd8SKent Overstreet if (iter->idx >= iter->nr_pages) 10171c6fdbd8SKent Overstreet return NULL; 10181c6fdbd8SKent Overstreet 10191c6fdbd8SKent Overstreet EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); 10201c6fdbd8SKent Overstreet 1021*30bff594SKent Overstreet return page_folio(iter->pages[iter->idx]); 10221c6fdbd8SKent Overstreet } 10231c6fdbd8SKent Overstreet 102435189e09SKent Overstreet static bool extent_partial_reads_expensive(struct bkey_s_c k) 102535189e09SKent Overstreet { 102635189e09SKent Overstreet struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 102735189e09SKent Overstreet struct bch_extent_crc_unpacked crc; 102835189e09SKent Overstreet const union bch_extent_entry *i; 102935189e09SKent Overstreet 103035189e09SKent Overstreet bkey_for_each_crc(k.k, ptrs, crc, i) 103135189e09SKent Overstreet if (crc.csum_type || crc.compression_type) 103235189e09SKent Overstreet return true; 103335189e09SKent Overstreet return false; 103435189e09SKent Overstreet } 103535189e09SKent Overstreet 10361c6fdbd8SKent Overstreet static void readpage_bio_extend(struct readpages_iter *iter, 103776426098SKent Overstreet struct bio *bio, 103876426098SKent Overstreet unsigned sectors_this_extent, 10391c6fdbd8SKent Overstreet bool get_more) 10401c6fdbd8SKent Overstreet { 104176426098SKent Overstreet while (bio_sectors(bio) < sectors_this_extent && 10421c6fdbd8SKent Overstreet bio->bi_vcnt < bio->bi_max_vecs) { 1043*30bff594SKent Overstreet pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; 1044*30bff594SKent Overstreet struct folio *folio = readpage_iter_next(iter); 10451c6fdbd8SKent Overstreet int ret; 10461c6fdbd8SKent Overstreet 1047*30bff594SKent Overstreet if (folio) { 1048*30bff594SKent Overstreet if (iter->offset + iter->idx != folio_offset) 10491c6fdbd8SKent Overstreet break; 10501c6fdbd8SKent Overstreet 10511c6fdbd8SKent Overstreet iter->idx++; 10521c6fdbd8SKent Overstreet } else { 10531c6fdbd8SKent Overstreet if (!get_more) 10541c6fdbd8SKent Overstreet break; 10551c6fdbd8SKent Overstreet 1056*30bff594SKent Overstreet folio = xa_load(&iter->mapping->i_pages, folio_offset); 1057*30bff594SKent Overstreet if (folio && !xa_is_value(folio)) 10581c6fdbd8SKent Overstreet break; 10591c6fdbd8SKent Overstreet 1060*30bff594SKent Overstreet folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); 1061*30bff594SKent Overstreet if (!folio) 10621c6fdbd8SKent Overstreet break; 10631c6fdbd8SKent Overstreet 1064*30bff594SKent Overstreet if (!__bch2_folio_create(folio, 0)) { 1065*30bff594SKent Overstreet folio_put(folio); 1066f57a6a5dSKent Overstreet break; 1067f57a6a5dSKent Overstreet } 10681c6fdbd8SKent Overstreet 1069*30bff594SKent Overstreet ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_NOFS); 10701c6fdbd8SKent Overstreet if (ret) { 1071*30bff594SKent Overstreet __bch2_folio_release(folio); 1072*30bff594SKent Overstreet folio_put(folio); 10731c6fdbd8SKent Overstreet break; 10741c6fdbd8SKent Overstreet } 10751c6fdbd8SKent Overstreet 1076*30bff594SKent Overstreet folio_put(folio); 10771c6fdbd8SKent Overstreet } 10781c6fdbd8SKent Overstreet 1079*30bff594SKent Overstreet BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); 10801c6fdbd8SKent Overstreet } 10811c6fdbd8SKent Overstreet } 10821c6fdbd8SKent Overstreet 10838c6d298aSKent Overstreet static void bchfs_read(struct btree_trans *trans, 10848c6d298aSKent Overstreet struct bch_read_bio *rbio, 10858c6d298aSKent Overstreet subvol_inum inum, 10861c6fdbd8SKent Overstreet struct readpages_iter *readpages_iter) 10871c6fdbd8SKent Overstreet { 10880f238367SKent Overstreet struct bch_fs *c = trans->c; 10898c6d298aSKent Overstreet struct btree_iter iter; 109007a1006aSKent Overstreet struct bkey_buf sk; 10911c6fdbd8SKent Overstreet int flags = BCH_READ_RETRY_IF_STALE| 10921c6fdbd8SKent Overstreet BCH_READ_MAY_PROMOTE; 10938c6d298aSKent Overstreet u32 snapshot; 109476426098SKent Overstreet int ret = 0; 10951c6fdbd8SKent Overstreet 10961c6fdbd8SKent Overstreet rbio->c = c; 10971c6fdbd8SKent Overstreet rbio->start_time = local_clock(); 10988c6d298aSKent Overstreet rbio->subvol = inum.subvol; 109935189e09SKent Overstreet 110007a1006aSKent Overstreet bch2_bkey_buf_init(&sk); 110176426098SKent Overstreet retry: 1102700c25b3SKent Overstreet bch2_trans_begin(trans); 11038c6d298aSKent Overstreet iter = (struct btree_iter) { NULL }; 1104700c25b3SKent Overstreet 11058c6d298aSKent Overstreet ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 11068c6d298aSKent Overstreet if (ret) 11078c6d298aSKent Overstreet goto err; 11088c6d298aSKent Overstreet 11098c6d298aSKent Overstreet bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 11108c6d298aSKent Overstreet SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), 111157cfdd8bSKent Overstreet BTREE_ITER_SLOTS); 11121c6fdbd8SKent Overstreet while (1) { 11131c6fdbd8SKent Overstreet struct bkey_s_c k; 111476426098SKent Overstreet unsigned bytes, sectors, offset_into_extent; 11155ff75ccbSKent Overstreet enum btree_id data_btree = BTREE_ID_extents; 11161c6fdbd8SKent Overstreet 11173737e0ddSKent Overstreet /* 11183737e0ddSKent Overstreet * read_extent -> io_time_reset may cause a transaction restart 11193737e0ddSKent Overstreet * without returning an error, we need to check for that here: 11203737e0ddSKent Overstreet */ 1121549d173cSKent Overstreet ret = bch2_trans_relock(trans); 1122549d173cSKent Overstreet if (ret) 11233737e0ddSKent Overstreet break; 11243737e0ddSKent Overstreet 11258c6d298aSKent Overstreet bch2_btree_iter_set_pos(&iter, 11268c6d298aSKent Overstreet POS(inum.inum, rbio->bio.bi_iter.bi_sector)); 11271c6fdbd8SKent Overstreet 11288c6d298aSKent Overstreet k = bch2_btree_iter_peek_slot(&iter); 112976426098SKent Overstreet ret = bkey_err(k); 113076426098SKent Overstreet if (ret) 113176426098SKent Overstreet break; 11321c6fdbd8SKent Overstreet 11338c6d298aSKent Overstreet offset_into_extent = iter.pos.offset - 113406ed8558SKent Overstreet bkey_start_offset(k.k); 113576426098SKent Overstreet sectors = k.k->size - offset_into_extent; 113676426098SKent Overstreet 113707a1006aSKent Overstreet bch2_bkey_buf_reassemble(&sk, c, k); 113813dcd4abSKent Overstreet 11395ff75ccbSKent Overstreet ret = bch2_read_indirect_extent(trans, &data_btree, 114022d8a33dSYuxuan Shui &offset_into_extent, &sk); 114176426098SKent Overstreet if (ret) 114276426098SKent Overstreet break; 114376426098SKent Overstreet 114413dcd4abSKent Overstreet k = bkey_i_to_s_c(sk.k); 114513dcd4abSKent Overstreet 114676426098SKent Overstreet sectors = min(sectors, k.k->size - offset_into_extent); 114776426098SKent Overstreet 114835189e09SKent Overstreet if (readpages_iter) 114935189e09SKent Overstreet readpage_bio_extend(readpages_iter, &rbio->bio, sectors, 115035189e09SKent Overstreet extent_partial_reads_expensive(k)); 11511c6fdbd8SKent Overstreet 115276426098SKent Overstreet bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; 115306ed8558SKent Overstreet swap(rbio->bio.bi_iter.bi_size, bytes); 11541c6fdbd8SKent Overstreet 115506ed8558SKent Overstreet if (rbio->bio.bi_iter.bi_size == bytes) 11561c6fdbd8SKent Overstreet flags |= BCH_READ_LAST_FRAGMENT; 11571c6fdbd8SKent Overstreet 1158b44a66a6SKent Overstreet bch2_bio_page_state_set(&rbio->bio, k); 11591c6fdbd8SKent Overstreet 11608c6d298aSKent Overstreet bch2_read_extent(trans, rbio, iter.pos, 11615ff75ccbSKent Overstreet data_btree, k, offset_into_extent, flags); 11621c6fdbd8SKent Overstreet 11631c6fdbd8SKent Overstreet if (flags & BCH_READ_LAST_FRAGMENT) 116435189e09SKent Overstreet break; 11651c6fdbd8SKent Overstreet 116606ed8558SKent Overstreet swap(rbio->bio.bi_iter.bi_size, bytes); 116706ed8558SKent Overstreet bio_advance(&rbio->bio, bytes); 1168084d42bbSKent Overstreet 1169084d42bbSKent Overstreet ret = btree_trans_too_many_iters(trans); 1170084d42bbSKent Overstreet if (ret) 1171084d42bbSKent Overstreet break; 11721c6fdbd8SKent Overstreet } 11738c6d298aSKent Overstreet err: 11748c6d298aSKent Overstreet bch2_trans_iter_exit(trans, &iter); 117576426098SKent Overstreet 1176549d173cSKent Overstreet if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 117776426098SKent Overstreet goto retry; 117876426098SKent Overstreet 117935189e09SKent Overstreet if (ret) { 11807fec8266SKent Overstreet bch_err_inum_offset_ratelimited(c, 11817fec8266SKent Overstreet iter.pos.inode, 11827fec8266SKent Overstreet iter.pos.offset << 9, 11830fefe8d8SKent Overstreet "read error %i from btree lookup", ret); 11840fefe8d8SKent Overstreet rbio->bio.bi_status = BLK_STS_IOERR; 118576426098SKent Overstreet bio_endio(&rbio->bio); 11861c6fdbd8SKent Overstreet } 11871c6fdbd8SKent Overstreet 118807a1006aSKent Overstreet bch2_bkey_buf_exit(&sk, c); 118935189e09SKent Overstreet } 119035189e09SKent Overstreet 11911c6fdbd8SKent Overstreet void bch2_readahead(struct readahead_control *ractl) 11921c6fdbd8SKent Overstreet { 11931c6fdbd8SKent Overstreet struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); 11941c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 119501ad6737SKent Overstreet struct bch_io_opts opts; 1196424eb881SKent Overstreet struct btree_trans trans; 1197*30bff594SKent Overstreet struct folio *folio; 11981c6fdbd8SKent Overstreet struct readpages_iter readpages_iter; 11991c6fdbd8SKent Overstreet int ret; 12001c6fdbd8SKent Overstreet 120101ad6737SKent Overstreet bch2_inode_opts_get(&opts, c, &inode->ei_inode); 120201ad6737SKent Overstreet 12031c6fdbd8SKent Overstreet ret = readpages_iter_init(&readpages_iter, ractl); 12041c6fdbd8SKent Overstreet BUG_ON(ret); 12051c6fdbd8SKent Overstreet 120620bceecbSKent Overstreet bch2_trans_init(&trans, c, 0, 0); 12071c6fdbd8SKent Overstreet 1208a7ecd30cSKent Overstreet bch2_pagecache_add_get(inode); 12091c6fdbd8SKent Overstreet 1210*30bff594SKent Overstreet while ((folio = readpage_iter_next(&readpages_iter))) { 12111c6fdbd8SKent Overstreet pgoff_t index = readpages_iter.offset + readpages_iter.idx; 12121c6fdbd8SKent Overstreet unsigned n = min_t(unsigned, 12131c6fdbd8SKent Overstreet readpages_iter.nr_pages - 12141c6fdbd8SKent Overstreet readpages_iter.idx, 12151c6fdbd8SKent Overstreet BIO_MAX_VECS); 12161c6fdbd8SKent Overstreet struct bch_read_bio *rbio = 12171c6fdbd8SKent Overstreet rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, 12181c6fdbd8SKent Overstreet GFP_NOFS, &c->bio_read), 12191c6fdbd8SKent Overstreet opts); 12201c6fdbd8SKent Overstreet 12211c6fdbd8SKent Overstreet readpages_iter.idx++; 12221c6fdbd8SKent Overstreet 12237279c1a2SKent Overstreet rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT; 12241c6fdbd8SKent Overstreet rbio->bio.bi_end_io = bch2_readpages_end_io; 1225*30bff594SKent Overstreet BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); 12261c6fdbd8SKent Overstreet 12278c6d298aSKent Overstreet bchfs_read(&trans, rbio, inode_inum(inode), 12280f238367SKent Overstreet &readpages_iter); 12291c6fdbd8SKent Overstreet } 12301c6fdbd8SKent Overstreet 1231a7ecd30cSKent Overstreet bch2_pagecache_add_put(inode); 1232424eb881SKent Overstreet 1233424eb881SKent Overstreet bch2_trans_exit(&trans); 12341c6fdbd8SKent Overstreet kfree(readpages_iter.pages); 12351c6fdbd8SKent Overstreet } 12361c6fdbd8SKent Overstreet 1237*30bff594SKent Overstreet static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, 1238*30bff594SKent Overstreet subvol_inum inum, struct folio *folio) 12391c6fdbd8SKent Overstreet { 1240424eb881SKent Overstreet struct btree_trans trans; 12411c6fdbd8SKent Overstreet 1242*30bff594SKent Overstreet bch2_folio_create(folio, __GFP_NOFAIL); 12431c6fdbd8SKent Overstreet 12441c6fdbd8SKent Overstreet rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; 1245*30bff594SKent Overstreet rbio->bio.bi_iter.bi_sector = folio_sector(folio); 1246*30bff594SKent Overstreet BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); 12471c6fdbd8SKent Overstreet 124820bceecbSKent Overstreet bch2_trans_init(&trans, c, 0, 0); 12498c6d298aSKent Overstreet bchfs_read(&trans, rbio, inum, NULL); 1250424eb881SKent Overstreet bch2_trans_exit(&trans); 12511c6fdbd8SKent Overstreet } 12521c6fdbd8SKent Overstreet 1253*30bff594SKent Overstreet static void bch2_read_single_folio_end_io(struct bio *bio) 12541c6fdbd8SKent Overstreet { 12551c6fdbd8SKent Overstreet complete(bio->bi_private); 12561c6fdbd8SKent Overstreet } 12571c6fdbd8SKent Overstreet 1258*30bff594SKent Overstreet static int bch2_read_single_folio(struct folio *folio, 12591c6fdbd8SKent Overstreet struct address_space *mapping) 12601c6fdbd8SKent Overstreet { 12611c6fdbd8SKent Overstreet struct bch_inode_info *inode = to_bch_ei(mapping->host); 12621c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 12631c6fdbd8SKent Overstreet struct bch_read_bio *rbio; 126401ad6737SKent Overstreet struct bch_io_opts opts; 12651c6fdbd8SKent Overstreet int ret; 12661c6fdbd8SKent Overstreet DECLARE_COMPLETION_ONSTACK(done); 12671c6fdbd8SKent Overstreet 126801ad6737SKent Overstreet bch2_inode_opts_get(&opts, c, &inode->ei_inode); 126901ad6737SKent Overstreet 12701c6fdbd8SKent Overstreet rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read), 127101ad6737SKent Overstreet opts); 12721c6fdbd8SKent Overstreet rbio->bio.bi_private = &done; 1273*30bff594SKent Overstreet rbio->bio.bi_end_io = bch2_read_single_folio_end_io; 12741c6fdbd8SKent Overstreet 1275*30bff594SKent Overstreet __bchfs_readfolio(c, rbio, inode_inum(inode), folio); 12761c6fdbd8SKent Overstreet wait_for_completion(&done); 12771c6fdbd8SKent Overstreet 12781c6fdbd8SKent Overstreet ret = blk_status_to_errno(rbio->bio.bi_status); 12791c6fdbd8SKent Overstreet bio_put(&rbio->bio); 12801c6fdbd8SKent Overstreet 12811c6fdbd8SKent Overstreet if (ret < 0) 12821c6fdbd8SKent Overstreet return ret; 12831c6fdbd8SKent Overstreet 1284*30bff594SKent Overstreet folio_mark_uptodate(folio); 12851c6fdbd8SKent Overstreet return 0; 12861c6fdbd8SKent Overstreet } 12871c6fdbd8SKent Overstreet 12881c6fdbd8SKent Overstreet int bch2_read_folio(struct file *file, struct folio *folio) 12891c6fdbd8SKent Overstreet { 12901c6fdbd8SKent Overstreet int ret; 12911c6fdbd8SKent Overstreet 1292*30bff594SKent Overstreet ret = bch2_read_single_folio(folio, folio->mapping); 12931c6fdbd8SKent Overstreet folio_unlock(folio); 12945c1ef830SKent Overstreet return bch2_err_class(ret); 12951c6fdbd8SKent Overstreet } 12961c6fdbd8SKent Overstreet 12971c6fdbd8SKent Overstreet /* writepages: */ 12981c6fdbd8SKent Overstreet 12991c6fdbd8SKent Overstreet struct bch_writepage_state { 13001c6fdbd8SKent Overstreet struct bch_writepage_io *io; 13011c6fdbd8SKent Overstreet struct bch_io_opts opts; 13021c6fdbd8SKent Overstreet }; 13031c6fdbd8SKent Overstreet 13041c6fdbd8SKent Overstreet static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, 13051c6fdbd8SKent Overstreet struct bch_inode_info *inode) 13061c6fdbd8SKent Overstreet { 130701ad6737SKent Overstreet struct bch_writepage_state ret = { 0 }; 130801ad6737SKent Overstreet 130901ad6737SKent Overstreet bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); 131001ad6737SKent Overstreet return ret; 13111c6fdbd8SKent Overstreet } 13121c6fdbd8SKent Overstreet 13139f311f21SKent Overstreet static void bch2_writepage_io_done(struct bch_write_op *op) 13141c6fdbd8SKent Overstreet { 13159f311f21SKent Overstreet struct bch_writepage_io *io = 13169f311f21SKent Overstreet container_of(op, struct bch_writepage_io, op); 13179a3df993SKent Overstreet struct bch_fs *c = io->op.c; 13189a3df993SKent Overstreet struct bio *bio = &io->op.wbio.bio; 13191c6fdbd8SKent Overstreet struct bvec_iter_all iter; 13201c6fdbd8SKent Overstreet struct bio_vec *bvec; 1321b3fce09cSKent Overstreet unsigned i; 13221c6fdbd8SKent Overstreet 13239a3df993SKent Overstreet if (io->op.error) { 132433c74e41SKent Overstreet set_bit(EI_INODE_ERROR, &io->inode->ei_flags); 132533c74e41SKent Overstreet 132675812e70SKent Overstreet bio_for_each_segment_all(bvec, bio, iter) { 13273342ac13SKent Overstreet struct bch_folio *s; 1328b3fce09cSKent Overstreet 13291c6fdbd8SKent Overstreet SetPageError(bvec->bv_page); 133075812e70SKent Overstreet mapping_set_error(bvec->bv_page->mapping, -EIO); 1331b3fce09cSKent Overstreet 1332*30bff594SKent Overstreet s = __bch2_folio(page_folio(bvec->bv_page)); 13333826ee0bSKent Overstreet spin_lock(&s->lock); 1334b3fce09cSKent Overstreet for (i = 0; i < PAGE_SECTORS; i++) 1335b3fce09cSKent Overstreet s->s[i].nr_replicas = 0; 13363826ee0bSKent Overstreet spin_unlock(&s->lock); 133775812e70SKent Overstreet } 13381c6fdbd8SKent Overstreet } 13391c6fdbd8SKent Overstreet 13404be1a412SKent Overstreet if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { 13414be1a412SKent Overstreet bio_for_each_segment_all(bvec, bio, iter) { 13423342ac13SKent Overstreet struct bch_folio *s; 13434be1a412SKent Overstreet 1344*30bff594SKent Overstreet s = __bch2_folio(page_folio(bvec->bv_page)); 13454be1a412SKent Overstreet spin_lock(&s->lock); 13464be1a412SKent Overstreet for (i = 0; i < PAGE_SECTORS; i++) 13474be1a412SKent Overstreet s->s[i].nr_replicas = 0; 13484be1a412SKent Overstreet spin_unlock(&s->lock); 13494be1a412SKent Overstreet } 13504be1a412SKent Overstreet } 13514be1a412SKent Overstreet 13521c6fdbd8SKent Overstreet /* 13531c6fdbd8SKent Overstreet * racing with fallocate can cause us to add fewer sectors than 13541c6fdbd8SKent Overstreet * expected - but we shouldn't add more sectors than expected: 13551c6fdbd8SKent Overstreet */ 1356f8494d25SKent Overstreet WARN_ON_ONCE(io->op.i_sectors_delta > 0); 13571c6fdbd8SKent Overstreet 13581c6fdbd8SKent Overstreet /* 13591c6fdbd8SKent Overstreet * (error (due to going RO) halfway through a page can screw that up 13601c6fdbd8SKent Overstreet * slightly) 13611c6fdbd8SKent Overstreet * XXX wtf? 13629a3df993SKent Overstreet BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); 13631c6fdbd8SKent Overstreet */ 13641c6fdbd8SKent Overstreet 13651c6fdbd8SKent Overstreet /* 13661c6fdbd8SKent Overstreet * PageWriteback is effectively our ref on the inode - fixup i_blocks 13671c6fdbd8SKent Overstreet * before calling end_page_writeback: 13681c6fdbd8SKent Overstreet */ 13699a3df993SKent Overstreet i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); 13701c6fdbd8SKent Overstreet 13717f5e31e1SKent Overstreet bio_for_each_segment_all(bvec, bio, iter) { 1372*30bff594SKent Overstreet struct folio *folio = page_folio(bvec->bv_page); 1373*30bff594SKent Overstreet struct bch_folio *s = __bch2_folio(folio); 13747f5e31e1SKent Overstreet 13757f5e31e1SKent Overstreet if (atomic_dec_and_test(&s->write_count)) 1376*30bff594SKent Overstreet folio_end_writeback(folio); 13777f5e31e1SKent Overstreet } 13781c6fdbd8SKent Overstreet 13799f311f21SKent Overstreet bio_put(&io->op.wbio.bio); 13801c6fdbd8SKent Overstreet } 13811c6fdbd8SKent Overstreet 13821c6fdbd8SKent Overstreet static void bch2_writepage_do_io(struct bch_writepage_state *w) 13831c6fdbd8SKent Overstreet { 13841c6fdbd8SKent Overstreet struct bch_writepage_io *io = w->io; 13851c6fdbd8SKent Overstreet 13861c6fdbd8SKent Overstreet w->io = NULL; 13879f311f21SKent Overstreet closure_call(&io->op.cl, bch2_write, NULL, NULL); 13881c6fdbd8SKent Overstreet } 13891c6fdbd8SKent Overstreet 13901c6fdbd8SKent Overstreet /* 13911c6fdbd8SKent Overstreet * Get a bch_writepage_io and add @page to it - appending to an existing one if 13921c6fdbd8SKent Overstreet * possible, else allocating a new one: 13931c6fdbd8SKent Overstreet */ 13941c6fdbd8SKent Overstreet static void bch2_writepage_io_alloc(struct bch_fs *c, 139550fe5bd6SKent Overstreet struct writeback_control *wbc, 13961c6fdbd8SKent Overstreet struct bch_writepage_state *w, 13971c6fdbd8SKent Overstreet struct bch_inode_info *inode, 13987f5e31e1SKent Overstreet u64 sector, 13991c6fdbd8SKent Overstreet unsigned nr_replicas) 14001c6fdbd8SKent Overstreet { 14011c6fdbd8SKent Overstreet struct bch_write_op *op; 14021c6fdbd8SKent Overstreet 14031c6fdbd8SKent Overstreet w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, 14041c6fdbd8SKent Overstreet REQ_OP_WRITE, 14051c6fdbd8SKent Overstreet GFP_NOFS, 14061c6fdbd8SKent Overstreet &c->writepage_bioset), 14079a3df993SKent Overstreet struct bch_writepage_io, op.wbio.bio); 14081c6fdbd8SKent Overstreet 14099a3df993SKent Overstreet w->io->inode = inode; 14109a3df993SKent Overstreet op = &w->io->op; 14119a3df993SKent Overstreet bch2_write_op_init(op, c, w->opts); 14129a3df993SKent Overstreet op->target = w->opts.foreground_target; 14131c6fdbd8SKent Overstreet op->nr_replicas = nr_replicas; 14141c6fdbd8SKent Overstreet op->res.nr_replicas = nr_replicas; 14151c6fdbd8SKent Overstreet op->write_point = writepoint_hashed(inode->ei_last_dirtied); 14168c6d298aSKent Overstreet op->subvol = inode->ei_subvol; 14177f5e31e1SKent Overstreet op->pos = POS(inode->v.i_ino, sector); 14189f311f21SKent Overstreet op->end_io = bch2_writepage_io_done; 1419a8b3a677SKent Overstreet op->devs_need_flush = &inode->ei_devs_need_flush; 14207f5e31e1SKent Overstreet op->wbio.bio.bi_iter.bi_sector = sector; 142150fe5bd6SKent Overstreet op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); 14221c6fdbd8SKent Overstreet } 14231c6fdbd8SKent Overstreet 14241c6fdbd8SKent Overstreet static int __bch2_writepage(struct folio *folio, 14251c6fdbd8SKent Overstreet struct writeback_control *wbc, 14261c6fdbd8SKent Overstreet void *data) 14271c6fdbd8SKent Overstreet { 1428*30bff594SKent Overstreet struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); 14291c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 14301c6fdbd8SKent Overstreet struct bch_writepage_state *w = data; 14313342ac13SKent Overstreet struct bch_folio *s, orig; 1432*30bff594SKent Overstreet unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; 14331c6fdbd8SKent Overstreet loff_t i_size = i_size_read(&inode->v); 14341c6fdbd8SKent Overstreet pgoff_t end_index = i_size >> PAGE_SHIFT; 1435e1036a2aSKent Overstreet int ret; 14361c6fdbd8SKent Overstreet 1437*30bff594SKent Overstreet EBUG_ON(!folio_test_uptodate(folio)); 14381c6fdbd8SKent Overstreet 1439*30bff594SKent Overstreet /* Is the folio fully inside i_size? */ 1440*30bff594SKent Overstreet if (folio->index < end_index) 14411c6fdbd8SKent Overstreet goto do_io; 14421c6fdbd8SKent Overstreet 1443*30bff594SKent Overstreet /* Is the folio fully outside i_size? (truncate in progress) */ 14441c6fdbd8SKent Overstreet offset = i_size & (PAGE_SIZE - 1); 1445*30bff594SKent Overstreet if (folio->index > end_index || !offset) { 1446*30bff594SKent Overstreet folio_unlock(folio); 14471c6fdbd8SKent Overstreet return 0; 14481c6fdbd8SKent Overstreet } 14491c6fdbd8SKent Overstreet 14501c6fdbd8SKent Overstreet /* 1451*30bff594SKent Overstreet * The folio straddles i_size. It must be zeroed out on each and every 14521c6fdbd8SKent Overstreet * writepage invocation because it may be mmapped. "A file is mapped 1453*30bff594SKent Overstreet * in multiples of the folio size. For a file that is not a multiple of 1454*30bff594SKent Overstreet * the folio size, the remaining memory is zeroed when mapped, and 14551c6fdbd8SKent Overstreet * writes to that region are not written out to the file." 14561c6fdbd8SKent Overstreet */ 1457*30bff594SKent Overstreet folio_zero_segment(folio, offset, folio_size(folio)); 14581c6fdbd8SKent Overstreet do_io: 1459*30bff594SKent Overstreet f_sectors = folio_sectors(folio); 1460*30bff594SKent Overstreet s = bch2_folio_create(folio, __GFP_NOFAIL); 1461f81b648dSKent Overstreet 1462f74a5051SKent Overstreet /* 1463f74a5051SKent Overstreet * Things get really hairy with errors during writeback: 1464f74a5051SKent Overstreet */ 1465*30bff594SKent Overstreet ret = bch2_get_folio_disk_reservation(c, inode, folio, false); 1466f74a5051SKent Overstreet BUG_ON(ret); 1467e1036a2aSKent Overstreet 14687f5e31e1SKent Overstreet /* Before unlocking the page, get copy of reservations: */ 1469f74a5051SKent Overstreet spin_lock(&s->lock); 14707f5e31e1SKent Overstreet orig = *s; 1471f74a5051SKent Overstreet spin_unlock(&s->lock); 14727f5e31e1SKent Overstreet 1473*30bff594SKent Overstreet for (i = 0; i < f_sectors; i++) { 14742ba5d38bSKent Overstreet if (s->s[i].state < SECTOR_DIRTY) 14757f5e31e1SKent Overstreet continue; 14767f5e31e1SKent Overstreet 1477f81b648dSKent Overstreet nr_replicas_this_write = 1478f57a6a5dSKent Overstreet min_t(unsigned, nr_replicas_this_write, 1479f57a6a5dSKent Overstreet s->s[i].nr_replicas + 1480f57a6a5dSKent Overstreet s->s[i].replicas_reserved); 14817f5e31e1SKent Overstreet } 14821c6fdbd8SKent Overstreet 1483*30bff594SKent Overstreet for (i = 0; i < f_sectors; i++) { 14842ba5d38bSKent Overstreet if (s->s[i].state < SECTOR_DIRTY) 14857f5e31e1SKent Overstreet continue; 14867f5e31e1SKent Overstreet 1487f57a6a5dSKent Overstreet s->s[i].nr_replicas = w->opts.compression 1488f57a6a5dSKent Overstreet ? 0 : nr_replicas_this_write; 1489e1036a2aSKent Overstreet 1490f57a6a5dSKent Overstreet s->s[i].replicas_reserved = 0; 1491f57a6a5dSKent Overstreet s->s[i].state = SECTOR_ALLOCATED; 1492f57a6a5dSKent Overstreet } 14931c6fdbd8SKent Overstreet 14947f5e31e1SKent Overstreet BUG_ON(atomic_read(&s->write_count)); 14957f5e31e1SKent Overstreet atomic_set(&s->write_count, 1); 14967f5e31e1SKent Overstreet 1497*30bff594SKent Overstreet BUG_ON(folio_test_writeback(folio)); 1498*30bff594SKent Overstreet folio_start_writeback(folio); 14997f5e31e1SKent Overstreet 1500*30bff594SKent Overstreet folio_unlock(folio); 15011c6fdbd8SKent Overstreet 15027f5e31e1SKent Overstreet offset = 0; 15037f5e31e1SKent Overstreet while (1) { 1504f74a5051SKent Overstreet unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; 15057f5e31e1SKent Overstreet u64 sector; 15067f5e31e1SKent Overstreet 1507*30bff594SKent Overstreet while (offset < f_sectors && 15082ba5d38bSKent Overstreet orig.s[offset].state < SECTOR_DIRTY) 15097f5e31e1SKent Overstreet offset++; 15107f5e31e1SKent Overstreet 1511*30bff594SKent Overstreet if (offset == f_sectors) 15127f5e31e1SKent Overstreet break; 15137f5e31e1SKent Overstreet 1514*30bff594SKent Overstreet while (offset + sectors < f_sectors && 1515f74a5051SKent Overstreet orig.s[offset + sectors].state >= SECTOR_DIRTY) { 1516f74a5051SKent Overstreet reserved_sectors += orig.s[offset + sectors].replicas_reserved; 1517f74a5051SKent Overstreet dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY; 15187f5e31e1SKent Overstreet sectors++; 15197f5e31e1SKent Overstreet } 1520f74a5051SKent Overstreet BUG_ON(!sectors); 1521f74a5051SKent Overstreet 1522*30bff594SKent Overstreet sector = folio_sector(folio) + offset; 15237f5e31e1SKent Overstreet 15241c6fdbd8SKent Overstreet if (w->io && 15259a3df993SKent Overstreet (w->io->op.res.nr_replicas != nr_replicas_this_write || 15269a3df993SKent Overstreet bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || 1527f59b3464SKent Overstreet w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= 1528f59b3464SKent Overstreet (BIO_MAX_VECS * PAGE_SIZE) || 15299a3df993SKent Overstreet bio_end_sector(&w->io->op.wbio.bio) != sector)) 15301c6fdbd8SKent Overstreet bch2_writepage_do_io(w); 15311c6fdbd8SKent Overstreet 15321c6fdbd8SKent Overstreet if (!w->io) 153350fe5bd6SKent Overstreet bch2_writepage_io_alloc(c, wbc, w, inode, sector, 1534f81b648dSKent Overstreet nr_replicas_this_write); 15351c6fdbd8SKent Overstreet 15367f5e31e1SKent Overstreet atomic_inc(&s->write_count); 15377f5e31e1SKent Overstreet 15389a3df993SKent Overstreet BUG_ON(inode != w->io->inode); 1539*30bff594SKent Overstreet BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, 15407f5e31e1SKent Overstreet sectors << 9, offset << 9)); 15411c6fdbd8SKent Overstreet 15426cc3535dSKent Overstreet /* Check for writing past i_size: */ 15438eb71e9eSKent Overstreet WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > 154480fe580cSKent Overstreet round_up(i_size, block_bytes(c)) && 15458eb71e9eSKent Overstreet !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), 15468eb71e9eSKent Overstreet "writing past i_size: %llu > %llu (unrounded %llu)\n", 15478eb71e9eSKent Overstreet bio_end_sector(&w->io->op.wbio.bio) << 9, 15488eb71e9eSKent Overstreet round_up(i_size, block_bytes(c)), 15498eb71e9eSKent Overstreet i_size); 15506cc3535dSKent Overstreet 15519a3df993SKent Overstreet w->io->op.res.sectors += reserved_sectors; 15529a3df993SKent Overstreet w->io->op.i_sectors_delta -= dirty_sectors; 15531c6fdbd8SKent Overstreet w->io->op.new_i_size = i_size; 15541c6fdbd8SKent Overstreet 15557f5e31e1SKent Overstreet offset += sectors; 15567f5e31e1SKent Overstreet } 15577f5e31e1SKent Overstreet 15587f5e31e1SKent Overstreet if (atomic_dec_and_test(&s->write_count)) 1559*30bff594SKent Overstreet folio_end_writeback(folio); 15607f5e31e1SKent Overstreet 15611c6fdbd8SKent Overstreet return 0; 15621c6fdbd8SKent Overstreet } 15631c6fdbd8SKent Overstreet 15641c6fdbd8SKent Overstreet int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) 15651c6fdbd8SKent Overstreet { 15661c6fdbd8SKent Overstreet struct bch_fs *c = mapping->host->i_sb->s_fs_info; 15671c6fdbd8SKent Overstreet struct bch_writepage_state w = 15681c6fdbd8SKent Overstreet bch_writepage_state_init(c, to_bch_ei(mapping->host)); 15691c6fdbd8SKent Overstreet struct blk_plug plug; 15701c6fdbd8SKent Overstreet int ret; 15711c6fdbd8SKent Overstreet 15721c6fdbd8SKent Overstreet blk_start_plug(&plug); 15731c6fdbd8SKent Overstreet ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); 15741c6fdbd8SKent Overstreet if (w.io) 15751c6fdbd8SKent Overstreet bch2_writepage_do_io(&w); 15761c6fdbd8SKent Overstreet blk_finish_plug(&plug); 15775c1ef830SKent Overstreet return bch2_err_class(ret); 15781c6fdbd8SKent Overstreet } 15791c6fdbd8SKent Overstreet 15801c6fdbd8SKent Overstreet /* buffered writes: */ 15811c6fdbd8SKent Overstreet 15821c6fdbd8SKent Overstreet int bch2_write_begin(struct file *file, struct address_space *mapping, 15831c6fdbd8SKent Overstreet loff_t pos, unsigned len, 15841c6fdbd8SKent Overstreet struct page **pagep, void **fsdata) 15851c6fdbd8SKent Overstreet { 15861c6fdbd8SKent Overstreet struct bch_inode_info *inode = to_bch_ei(mapping->host); 15871c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 1588*30bff594SKent Overstreet struct bch2_folio_reservation *res; 15891c6fdbd8SKent Overstreet pgoff_t index = pos >> PAGE_SHIFT; 15901c6fdbd8SKent Overstreet unsigned offset = pos & (PAGE_SIZE - 1); 1591*30bff594SKent Overstreet struct folio *folio; 15921c6fdbd8SKent Overstreet int ret = -ENOMEM; 15931c6fdbd8SKent Overstreet 1594d1542e03SKent Overstreet res = kmalloc(sizeof(*res), GFP_KERNEL); 1595d1542e03SKent Overstreet if (!res) 1596d1542e03SKent Overstreet return -ENOMEM; 1597d1542e03SKent Overstreet 1598*30bff594SKent Overstreet bch2_folio_reservation_init(c, inode, res); 1599d1542e03SKent Overstreet *fsdata = res; 16001c6fdbd8SKent Overstreet 1601a7ecd30cSKent Overstreet bch2_pagecache_add_get(inode); 16021c6fdbd8SKent Overstreet 1603*30bff594SKent Overstreet folio = __filemap_get_folio(mapping, index, 1604*30bff594SKent Overstreet FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, 1605*30bff594SKent Overstreet mapping_gfp_mask(mapping)); 1606*30bff594SKent Overstreet if (!folio) 16071c6fdbd8SKent Overstreet goto err_unlock; 16081c6fdbd8SKent Overstreet 1609*30bff594SKent Overstreet if (folio_test_uptodate(folio)) 16101c6fdbd8SKent Overstreet goto out; 16111c6fdbd8SKent Overstreet 1612*30bff594SKent Overstreet /* If we're writing entire folio, don't need to read it in first: */ 1613*30bff594SKent Overstreet if (len == folio_size(folio)) 16141c6fdbd8SKent Overstreet goto out; 16151c6fdbd8SKent Overstreet 16161c6fdbd8SKent Overstreet if (!offset && pos + len >= inode->v.i_size) { 1617*30bff594SKent Overstreet folio_zero_segment(folio, len, folio_size(folio)); 1618*30bff594SKent Overstreet flush_dcache_folio(folio); 16191c6fdbd8SKent Overstreet goto out; 16201c6fdbd8SKent Overstreet } 16211c6fdbd8SKent Overstreet 16221c6fdbd8SKent Overstreet if (index > inode->v.i_size >> PAGE_SHIFT) { 1623*30bff594SKent Overstreet folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); 1624*30bff594SKent Overstreet flush_dcache_folio(folio); 16251c6fdbd8SKent Overstreet goto out; 16261c6fdbd8SKent Overstreet } 16271c6fdbd8SKent Overstreet readpage: 1628*30bff594SKent Overstreet ret = bch2_read_single_folio(folio, mapping); 16291c6fdbd8SKent Overstreet if (ret) 16301c6fdbd8SKent Overstreet goto err; 16311c6fdbd8SKent Overstreet out: 1632*30bff594SKent Overstreet if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) { 1633*30bff594SKent Overstreet ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); 1634e6ec361fSKent Overstreet if (ret) 16353a4d3656SKent Overstreet goto err; 1636e6ec361fSKent Overstreet } 1637e6ec361fSKent Overstreet 1638*30bff594SKent Overstreet ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); 16391c6fdbd8SKent Overstreet if (ret) { 1640*30bff594SKent Overstreet if (!folio_test_uptodate(folio)) { 16411c6fdbd8SKent Overstreet /* 1642*30bff594SKent Overstreet * If the folio hasn't been read in, we won't know if we 16431c6fdbd8SKent Overstreet * actually need a reservation - we don't actually need 1644*30bff594SKent Overstreet * to read here, we just need to check if the folio is 16451c6fdbd8SKent Overstreet * fully backed by uncompressed data: 16461c6fdbd8SKent Overstreet */ 16471c6fdbd8SKent Overstreet goto readpage; 16481c6fdbd8SKent Overstreet } 16491c6fdbd8SKent Overstreet 16501c6fdbd8SKent Overstreet goto err; 16511c6fdbd8SKent Overstreet } 16521c6fdbd8SKent Overstreet 1653*30bff594SKent Overstreet *pagep = &folio->page; 16541c6fdbd8SKent Overstreet return 0; 16551c6fdbd8SKent Overstreet err: 1656*30bff594SKent Overstreet folio_unlock(folio); 1657*30bff594SKent Overstreet folio_put(folio); 16581c6fdbd8SKent Overstreet *pagep = NULL; 16591c6fdbd8SKent Overstreet err_unlock: 1660a7ecd30cSKent Overstreet bch2_pagecache_add_put(inode); 1661d1542e03SKent Overstreet kfree(res); 1662d1542e03SKent Overstreet *fsdata = NULL; 16635c1ef830SKent Overstreet return bch2_err_class(ret); 16641c6fdbd8SKent Overstreet } 16651c6fdbd8SKent Overstreet 16661c6fdbd8SKent Overstreet int bch2_write_end(struct file *file, struct address_space *mapping, 16671c6fdbd8SKent Overstreet loff_t pos, unsigned len, unsigned copied, 16681c6fdbd8SKent Overstreet struct page *page, void *fsdata) 16691c6fdbd8SKent Overstreet { 16701c6fdbd8SKent Overstreet struct bch_inode_info *inode = to_bch_ei(mapping->host); 16711c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 1672*30bff594SKent Overstreet struct bch2_folio_reservation *res = fsdata; 1673*30bff594SKent Overstreet struct folio *folio = page_folio(page); 1674d1542e03SKent Overstreet unsigned offset = pos & (PAGE_SIZE - 1); 16751c6fdbd8SKent Overstreet 16761c6fdbd8SKent Overstreet lockdep_assert_held(&inode->v.i_rwsem); 16771c6fdbd8SKent Overstreet 1678*30bff594SKent Overstreet if (unlikely(copied < len && !folio_test_uptodate(folio))) { 16791c6fdbd8SKent Overstreet /* 1680*30bff594SKent Overstreet * The folio needs to be read in, but that would destroy 16811c6fdbd8SKent Overstreet * our partial write - simplest thing is to just force 16821c6fdbd8SKent Overstreet * userspace to redo the write: 16831c6fdbd8SKent Overstreet */ 1684*30bff594SKent Overstreet folio_zero_range(folio, 0, folio_size(folio)); 1685*30bff594SKent Overstreet flush_dcache_folio(folio); 16861c6fdbd8SKent Overstreet copied = 0; 16871c6fdbd8SKent Overstreet } 16881c6fdbd8SKent Overstreet 16891c6fdbd8SKent Overstreet spin_lock(&inode->v.i_lock); 16901c6fdbd8SKent Overstreet if (pos + copied > inode->v.i_size) 16911c6fdbd8SKent Overstreet i_size_write(&inode->v, pos + copied); 16921c6fdbd8SKent Overstreet spin_unlock(&inode->v.i_lock); 16931c6fdbd8SKent Overstreet 16941c6fdbd8SKent Overstreet if (copied) { 1695*30bff594SKent Overstreet if (!folio_test_uptodate(folio)) 1696*30bff594SKent Overstreet folio_mark_uptodate(folio); 1697d1542e03SKent Overstreet 1698*30bff594SKent Overstreet bch2_set_folio_dirty(c, inode, folio, res, offset, copied); 16991c6fdbd8SKent Overstreet 17001c6fdbd8SKent Overstreet inode->ei_last_dirtied = (unsigned long) current; 17011c6fdbd8SKent Overstreet } 17021c6fdbd8SKent Overstreet 1703*30bff594SKent Overstreet folio_unlock(folio); 1704*30bff594SKent Overstreet folio_put(folio); 1705a7ecd30cSKent Overstreet bch2_pagecache_add_put(inode); 17061c6fdbd8SKent Overstreet 1707*30bff594SKent Overstreet bch2_folio_reservation_put(c, inode, res); 1708d1542e03SKent Overstreet kfree(res); 1709d1542e03SKent Overstreet 17101c6fdbd8SKent Overstreet return copied; 17111c6fdbd8SKent Overstreet } 17121c6fdbd8SKent Overstreet 17131c6fdbd8SKent Overstreet #define WRITE_BATCH_PAGES 32 17141c6fdbd8SKent Overstreet 17151c6fdbd8SKent Overstreet static int __bch2_buffered_write(struct bch_inode_info *inode, 17161c6fdbd8SKent Overstreet struct address_space *mapping, 17171c6fdbd8SKent Overstreet struct iov_iter *iter, 17181c6fdbd8SKent Overstreet loff_t pos, unsigned len) 17191c6fdbd8SKent Overstreet { 17201c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 1721*30bff594SKent Overstreet struct folio *folios[WRITE_BATCH_PAGES]; 1722*30bff594SKent Overstreet struct bch2_folio_reservation res; 17231c6fdbd8SKent Overstreet unsigned long index = pos >> PAGE_SHIFT; 17241c6fdbd8SKent Overstreet unsigned offset = pos & (PAGE_SIZE - 1); 1725*30bff594SKent Overstreet unsigned nr_folios = DIV_ROUND_UP(offset + len, PAGE_SIZE); 1726d1542e03SKent Overstreet unsigned i, reserved = 0, set_dirty = 0; 1727*30bff594SKent Overstreet unsigned copied = 0, nr_folios_copied = 0; 17281c6fdbd8SKent Overstreet int ret = 0; 17291c6fdbd8SKent Overstreet 17301c6fdbd8SKent Overstreet BUG_ON(!len); 1731*30bff594SKent Overstreet BUG_ON(nr_folios > ARRAY_SIZE(folios)); 17321c6fdbd8SKent Overstreet 1733*30bff594SKent Overstreet bch2_folio_reservation_init(c, inode, &res); 1734d1542e03SKent Overstreet 1735*30bff594SKent Overstreet for (i = 0; i < nr_folios; i++) { 1736*30bff594SKent Overstreet folios[i] = __filemap_get_folio(mapping, index + i, 1737*30bff594SKent Overstreet FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, 1738*30bff594SKent Overstreet mapping_gfp_mask(mapping)); 1739*30bff594SKent Overstreet if (!folios[i]) { 1740*30bff594SKent Overstreet nr_folios = i; 17418de819f8SKent Overstreet if (!i) { 17421c6fdbd8SKent Overstreet ret = -ENOMEM; 17431c6fdbd8SKent Overstreet goto out; 17441c6fdbd8SKent Overstreet } 17458de819f8SKent Overstreet len = min_t(unsigned, len, 1746*30bff594SKent Overstreet nr_folios * PAGE_SIZE - offset); 17478de819f8SKent Overstreet break; 17488de819f8SKent Overstreet } 17491c6fdbd8SKent Overstreet } 17501c6fdbd8SKent Overstreet 1751*30bff594SKent Overstreet if (offset && !folio_test_uptodate(folios[0])) { 1752*30bff594SKent Overstreet ret = bch2_read_single_folio(folios[0], mapping); 17531c6fdbd8SKent Overstreet if (ret) 17541c6fdbd8SKent Overstreet goto out; 17551c6fdbd8SKent Overstreet } 17561c6fdbd8SKent Overstreet 17571c6fdbd8SKent Overstreet if ((pos + len) & (PAGE_SIZE - 1) && 1758*30bff594SKent Overstreet !folio_test_uptodate(folios[nr_folios - 1])) { 1759*30bff594SKent Overstreet if ((index + nr_folios - 1) << PAGE_SHIFT >= inode->v.i_size) { 1760*30bff594SKent Overstreet folio_zero_range(folios[nr_folios - 1], 0, 1761*30bff594SKent Overstreet folio_size(folios[nr_folios - 1])); 17621c6fdbd8SKent Overstreet } else { 1763*30bff594SKent Overstreet ret = bch2_read_single_folio(folios[nr_folios - 1], mapping); 17641c6fdbd8SKent Overstreet if (ret) 17651c6fdbd8SKent Overstreet goto out; 17661c6fdbd8SKent Overstreet } 17671c6fdbd8SKent Overstreet } 17681c6fdbd8SKent Overstreet 1769d1542e03SKent Overstreet while (reserved < len) { 1770e6ec361fSKent Overstreet unsigned i = (offset + reserved) >> PAGE_SHIFT; 1771*30bff594SKent Overstreet struct folio *folio = folios[i]; 1772*30bff594SKent Overstreet unsigned folio_offset = (offset + reserved) & (PAGE_SIZE - 1); 1773*30bff594SKent Overstreet unsigned folio_len = min_t(unsigned, len - reserved, 1774*30bff594SKent Overstreet PAGE_SIZE - folio_offset); 17751c6fdbd8SKent Overstreet 1776*30bff594SKent Overstreet if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) { 17773342ac13SKent Overstreet ret = bch2_folio_set(c, inode_inum(inode), 1778*30bff594SKent Overstreet folios + i, nr_folios - i); 1779e6ec361fSKent Overstreet if (ret) 1780e6ec361fSKent Overstreet goto out; 17811c6fdbd8SKent Overstreet } 17821c6fdbd8SKent Overstreet 1783353448f3SKent Overstreet /* 1784353448f3SKent Overstreet * XXX: per POSIX and fstests generic/275, on -ENOSPC we're 1785353448f3SKent Overstreet * supposed to write as much as we have disk space for. 1786353448f3SKent Overstreet * 1787353448f3SKent Overstreet * On failure here we should still write out a partial page if 1788353448f3SKent Overstreet * we aren't completely out of disk space - we don't do that 1789353448f3SKent Overstreet * yet: 1790353448f3SKent Overstreet */ 1791*30bff594SKent Overstreet ret = bch2_folio_reservation_get(c, inode, folio, &res, 1792*30bff594SKent Overstreet folio_offset, folio_len); 1793353448f3SKent Overstreet if (unlikely(ret)) { 1794353448f3SKent Overstreet if (!reserved) 17951c6fdbd8SKent Overstreet goto out; 1796353448f3SKent Overstreet break; 1797353448f3SKent Overstreet } 1798d1542e03SKent Overstreet 1799*30bff594SKent Overstreet reserved += folio_len; 18001c6fdbd8SKent Overstreet } 18011c6fdbd8SKent Overstreet 18021c6fdbd8SKent Overstreet if (mapping_writably_mapped(mapping)) 1803*30bff594SKent Overstreet for (i = 0; i < nr_folios; i++) 1804*30bff594SKent Overstreet flush_dcache_folio(folios[i]); 18051c6fdbd8SKent Overstreet 1806353448f3SKent Overstreet while (copied < reserved) { 1807*30bff594SKent Overstreet struct folio *folio = folios[(offset + copied) >> PAGE_SHIFT]; 1808*30bff594SKent Overstreet unsigned folio_offset = (offset + copied) & (PAGE_SIZE - 1); 1809*30bff594SKent Overstreet unsigned folio_len = min_t(unsigned, reserved - copied, 1810*30bff594SKent Overstreet PAGE_SIZE - folio_offset); 1811*30bff594SKent Overstreet unsigned folio_copied = copy_page_from_iter_atomic(&folio->page, 1812*30bff594SKent Overstreet folio_offset, folio_len, iter); 1813d1542e03SKent Overstreet 1814*30bff594SKent Overstreet if (!folio_copied) 1815d1542e03SKent Overstreet break; 18161c6fdbd8SKent Overstreet 1817*30bff594SKent Overstreet if (!folio_test_uptodate(folio) && 1818*30bff594SKent Overstreet folio_copied != PAGE_SIZE && 1819*30bff594SKent Overstreet pos + copied + folio_copied < inode->v.i_size) { 1820*30bff594SKent Overstreet folio_zero_range(folio, 0, folio_size(folio)); 1821912bdf17SKent Overstreet break; 1822912bdf17SKent Overstreet } 1823912bdf17SKent Overstreet 1824*30bff594SKent Overstreet flush_dcache_folio(folio); 1825*30bff594SKent Overstreet copied += folio_copied; 1826912bdf17SKent Overstreet 1827*30bff594SKent Overstreet if (folio_copied != folio_len) 1828912bdf17SKent Overstreet break; 18291c6fdbd8SKent Overstreet } 18301c6fdbd8SKent Overstreet 18311c6fdbd8SKent Overstreet if (!copied) 18321c6fdbd8SKent Overstreet goto out; 18331c6fdbd8SKent Overstreet 1834877dfb34SKent Overstreet spin_lock(&inode->v.i_lock); 1835877dfb34SKent Overstreet if (pos + copied > inode->v.i_size) 1836877dfb34SKent Overstreet i_size_write(&inode->v, pos + copied); 1837877dfb34SKent Overstreet spin_unlock(&inode->v.i_lock); 1838877dfb34SKent Overstreet 1839d1542e03SKent Overstreet while (set_dirty < copied) { 1840*30bff594SKent Overstreet struct folio *folio = folios[(offset + set_dirty) >> PAGE_SHIFT]; 1841*30bff594SKent Overstreet unsigned folio_offset = (offset + set_dirty) & (PAGE_SIZE - 1); 1842*30bff594SKent Overstreet unsigned folio_len = min_t(unsigned, copied - set_dirty, 1843*30bff594SKent Overstreet PAGE_SIZE - folio_offset); 1844d1542e03SKent Overstreet 1845*30bff594SKent Overstreet if (!folio_test_uptodate(folio)) 1846*30bff594SKent Overstreet folio_mark_uptodate(folio); 1847d1542e03SKent Overstreet 1848*30bff594SKent Overstreet bch2_set_folio_dirty(c, inode, folio, &res, folio_offset, folio_len); 1849*30bff594SKent Overstreet folio_unlock(folio); 1850*30bff594SKent Overstreet folio_put(folio); 1851d1542e03SKent Overstreet 1852*30bff594SKent Overstreet set_dirty += folio_len; 1853d1542e03SKent Overstreet } 1854877dfb34SKent Overstreet 1855*30bff594SKent Overstreet nr_folios_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); 1856877dfb34SKent Overstreet inode->ei_last_dirtied = (unsigned long) current; 18571c6fdbd8SKent Overstreet out: 1858*30bff594SKent Overstreet for (i = nr_folios_copied; i < nr_folios; i++) { 1859*30bff594SKent Overstreet folio_unlock(folios[i]); 1860*30bff594SKent Overstreet folio_put(folios[i]); 18611c6fdbd8SKent Overstreet } 18621c6fdbd8SKent Overstreet 1863*30bff594SKent Overstreet bch2_folio_reservation_put(c, inode, &res); 18641c6fdbd8SKent Overstreet 18651c6fdbd8SKent Overstreet return copied ?: ret; 18661c6fdbd8SKent Overstreet } 18671c6fdbd8SKent Overstreet 18681c6fdbd8SKent Overstreet static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) 18691c6fdbd8SKent Overstreet { 18701c6fdbd8SKent Overstreet struct file *file = iocb->ki_filp; 18711c6fdbd8SKent Overstreet struct address_space *mapping = file->f_mapping; 18721c6fdbd8SKent Overstreet struct bch_inode_info *inode = file_bch_inode(file); 18731c6fdbd8SKent Overstreet loff_t pos = iocb->ki_pos; 18741c6fdbd8SKent Overstreet ssize_t written = 0; 18751c6fdbd8SKent Overstreet int ret = 0; 18761c6fdbd8SKent Overstreet 1877a7ecd30cSKent Overstreet bch2_pagecache_add_get(inode); 18781c6fdbd8SKent Overstreet 18791c6fdbd8SKent Overstreet do { 18801c6fdbd8SKent Overstreet unsigned offset = pos & (PAGE_SIZE - 1); 18811c6fdbd8SKent Overstreet unsigned bytes = min_t(unsigned long, iov_iter_count(iter), 18821c6fdbd8SKent Overstreet PAGE_SIZE * WRITE_BATCH_PAGES - offset); 18831c6fdbd8SKent Overstreet again: 18841c6fdbd8SKent Overstreet /* 18851c6fdbd8SKent Overstreet * Bring in the user page that we will copy from _first_. 18861c6fdbd8SKent Overstreet * Otherwise there's a nasty deadlock on copying from the 18871c6fdbd8SKent Overstreet * same page as we're writing to, without it being marked 18881c6fdbd8SKent Overstreet * up-to-date. 18891c6fdbd8SKent Overstreet * 18901c6fdbd8SKent Overstreet * Not only is this an optimisation, but it is also required 18911c6fdbd8SKent Overstreet * to check that the address is actually valid, when atomic 18921c6fdbd8SKent Overstreet * usercopies are used, below. 18931c6fdbd8SKent Overstreet */ 18941c6fdbd8SKent Overstreet if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { 18951c6fdbd8SKent Overstreet bytes = min_t(unsigned long, iov_iter_count(iter), 18961c6fdbd8SKent Overstreet PAGE_SIZE - offset); 18971c6fdbd8SKent Overstreet 18981c6fdbd8SKent Overstreet if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { 18991c6fdbd8SKent Overstreet ret = -EFAULT; 19001c6fdbd8SKent Overstreet break; 19011c6fdbd8SKent Overstreet } 19021c6fdbd8SKent Overstreet } 19031c6fdbd8SKent Overstreet 19041c6fdbd8SKent Overstreet if (unlikely(fatal_signal_pending(current))) { 19051c6fdbd8SKent Overstreet ret = -EINTR; 19061c6fdbd8SKent Overstreet break; 19071c6fdbd8SKent Overstreet } 19081c6fdbd8SKent Overstreet 19091c6fdbd8SKent Overstreet ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); 19101c6fdbd8SKent Overstreet if (unlikely(ret < 0)) 19111c6fdbd8SKent Overstreet break; 19121c6fdbd8SKent Overstreet 19131c6fdbd8SKent Overstreet cond_resched(); 19141c6fdbd8SKent Overstreet 19151c6fdbd8SKent Overstreet if (unlikely(ret == 0)) { 19161c6fdbd8SKent Overstreet /* 19171c6fdbd8SKent Overstreet * If we were unable to copy any data at all, we must 19181c6fdbd8SKent Overstreet * fall back to a single segment length write. 19191c6fdbd8SKent Overstreet * 19201c6fdbd8SKent Overstreet * If we didn't fallback here, we could livelock 19211c6fdbd8SKent Overstreet * because not all segments in the iov can be copied at 19221c6fdbd8SKent Overstreet * once without a pagefault. 19231c6fdbd8SKent Overstreet */ 19241c6fdbd8SKent Overstreet bytes = min_t(unsigned long, PAGE_SIZE - offset, 19251c6fdbd8SKent Overstreet iov_iter_single_seg_count(iter)); 19261c6fdbd8SKent Overstreet goto again; 19271c6fdbd8SKent Overstreet } 19281c6fdbd8SKent Overstreet pos += ret; 19291c6fdbd8SKent Overstreet written += ret; 1930912bdf17SKent Overstreet ret = 0; 19311c6fdbd8SKent Overstreet 19321c6fdbd8SKent Overstreet balance_dirty_pages_ratelimited(mapping); 19331c6fdbd8SKent Overstreet } while (iov_iter_count(iter)); 19341c6fdbd8SKent Overstreet 1935a7ecd30cSKent Overstreet bch2_pagecache_add_put(inode); 19361c6fdbd8SKent Overstreet 19371c6fdbd8SKent Overstreet return written ? written : ret; 19381c6fdbd8SKent Overstreet } 19391c6fdbd8SKent Overstreet 19401c6fdbd8SKent Overstreet /* O_DIRECT reads */ 19411c6fdbd8SKent Overstreet 1942b4725cc1SKent Overstreet static void bio_check_or_release(struct bio *bio, bool check_dirty) 1943b4725cc1SKent Overstreet { 1944b4725cc1SKent Overstreet if (check_dirty) { 1945b4725cc1SKent Overstreet bio_check_pages_dirty(bio); 1946b4725cc1SKent Overstreet } else { 1947b4725cc1SKent Overstreet bio_release_pages(bio, false); 1948b4725cc1SKent Overstreet bio_put(bio); 1949b4725cc1SKent Overstreet } 1950b4725cc1SKent Overstreet } 1951b4725cc1SKent Overstreet 19521c6fdbd8SKent Overstreet static void bch2_dio_read_complete(struct closure *cl) 19531c6fdbd8SKent Overstreet { 19541c6fdbd8SKent Overstreet struct dio_read *dio = container_of(cl, struct dio_read, cl); 19551c6fdbd8SKent Overstreet 19561c6fdbd8SKent Overstreet dio->req->ki_complete(dio->req, dio->ret); 1957b4725cc1SKent Overstreet bio_check_or_release(&dio->rbio.bio, dio->should_dirty); 19581c6fdbd8SKent Overstreet } 19591c6fdbd8SKent Overstreet 19601c6fdbd8SKent Overstreet static void bch2_direct_IO_read_endio(struct bio *bio) 19611c6fdbd8SKent Overstreet { 19621c6fdbd8SKent Overstreet struct dio_read *dio = bio->bi_private; 19631c6fdbd8SKent Overstreet 19641c6fdbd8SKent Overstreet if (bio->bi_status) 19651c6fdbd8SKent Overstreet dio->ret = blk_status_to_errno(bio->bi_status); 19661c6fdbd8SKent Overstreet 19671c6fdbd8SKent Overstreet closure_put(&dio->cl); 19681c6fdbd8SKent Overstreet } 19691c6fdbd8SKent Overstreet 19701c6fdbd8SKent Overstreet static void bch2_direct_IO_read_split_endio(struct bio *bio) 19711c6fdbd8SKent Overstreet { 1972b4725cc1SKent Overstreet struct dio_read *dio = bio->bi_private; 1973b4725cc1SKent Overstreet bool should_dirty = dio->should_dirty; 1974b4725cc1SKent Overstreet 19751c6fdbd8SKent Overstreet bch2_direct_IO_read_endio(bio); 1976b4725cc1SKent Overstreet bio_check_or_release(bio, should_dirty); 19771c6fdbd8SKent Overstreet } 19781c6fdbd8SKent Overstreet 19791c6fdbd8SKent Overstreet static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) 19801c6fdbd8SKent Overstreet { 19811c6fdbd8SKent Overstreet struct file *file = req->ki_filp; 19821c6fdbd8SKent Overstreet struct bch_inode_info *inode = file_bch_inode(file); 19831c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 198401ad6737SKent Overstreet struct bch_io_opts opts; 19851c6fdbd8SKent Overstreet struct dio_read *dio; 19861c6fdbd8SKent Overstreet struct bio *bio; 19871c6fdbd8SKent Overstreet loff_t offset = req->ki_pos; 19881c6fdbd8SKent Overstreet bool sync = is_sync_kiocb(req); 19891c6fdbd8SKent Overstreet size_t shorten; 19901c6fdbd8SKent Overstreet ssize_t ret; 19911c6fdbd8SKent Overstreet 199201ad6737SKent Overstreet bch2_inode_opts_get(&opts, c, &inode->ei_inode); 199301ad6737SKent Overstreet 19941c6fdbd8SKent Overstreet if ((offset|iter->count) & (block_bytes(c) - 1)) 19951c6fdbd8SKent Overstreet return -EINVAL; 19961c6fdbd8SKent Overstreet 19971c6fdbd8SKent Overstreet ret = min_t(loff_t, iter->count, 19981c6fdbd8SKent Overstreet max_t(loff_t, 0, i_size_read(&inode->v) - offset)); 19991c6fdbd8SKent Overstreet 20001c6fdbd8SKent Overstreet if (!ret) 20011c6fdbd8SKent Overstreet return ret; 20021c6fdbd8SKent Overstreet 20031c6fdbd8SKent Overstreet shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); 20041c6fdbd8SKent Overstreet iter->count -= shorten; 20051c6fdbd8SKent Overstreet 20061c6fdbd8SKent Overstreet bio = bio_alloc_bioset(NULL, 20074d126dc8SKent Overstreet bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), 20081c6fdbd8SKent Overstreet REQ_OP_READ, 20091c6fdbd8SKent Overstreet GFP_KERNEL, 20101c6fdbd8SKent Overstreet &c->dio_read_bioset); 20111c6fdbd8SKent Overstreet 20121c6fdbd8SKent Overstreet bio->bi_end_io = bch2_direct_IO_read_endio; 20131c6fdbd8SKent Overstreet 20141c6fdbd8SKent Overstreet dio = container_of(bio, struct dio_read, rbio.bio); 20151c6fdbd8SKent Overstreet closure_init(&dio->cl, NULL); 20161c6fdbd8SKent Overstreet 20171c6fdbd8SKent Overstreet /* 20181c6fdbd8SKent Overstreet * this is a _really_ horrible hack just to avoid an atomic sub at the 20191c6fdbd8SKent Overstreet * end: 20201c6fdbd8SKent Overstreet */ 20211c6fdbd8SKent Overstreet if (!sync) { 20221c6fdbd8SKent Overstreet set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); 20231c6fdbd8SKent Overstreet atomic_set(&dio->cl.remaining, 20241c6fdbd8SKent Overstreet CLOSURE_REMAINING_INITIALIZER - 20251c6fdbd8SKent Overstreet CLOSURE_RUNNING + 20261c6fdbd8SKent Overstreet CLOSURE_DESTRUCTOR); 20271c6fdbd8SKent Overstreet } else { 20281c6fdbd8SKent Overstreet atomic_set(&dio->cl.remaining, 20291c6fdbd8SKent Overstreet CLOSURE_REMAINING_INITIALIZER + 1); 20301c6fdbd8SKent Overstreet } 20311c6fdbd8SKent Overstreet 20321c6fdbd8SKent Overstreet dio->req = req; 20331c6fdbd8SKent Overstreet dio->ret = ret; 2034b4725cc1SKent Overstreet /* 2035b4725cc1SKent Overstreet * This is one of the sketchier things I've encountered: we have to skip 2036b4725cc1SKent Overstreet * the dirtying of requests that are internal from the kernel (i.e. from 2037b4725cc1SKent Overstreet * loopback), because we'll deadlock on page_lock. 2038b4725cc1SKent Overstreet */ 2039b4725cc1SKent Overstreet dio->should_dirty = iter_is_iovec(iter); 20401c6fdbd8SKent Overstreet 20411c6fdbd8SKent Overstreet goto start; 20421c6fdbd8SKent Overstreet while (iter->count) { 20431c6fdbd8SKent Overstreet bio = bio_alloc_bioset(NULL, 20444d126dc8SKent Overstreet bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), 20451c6fdbd8SKent Overstreet REQ_OP_READ, 20461c6fdbd8SKent Overstreet GFP_KERNEL, 20471c6fdbd8SKent Overstreet &c->bio_read); 20481c6fdbd8SKent Overstreet bio->bi_end_io = bch2_direct_IO_read_split_endio; 20491c6fdbd8SKent Overstreet start: 20501c6fdbd8SKent Overstreet bio->bi_opf = REQ_OP_READ|REQ_SYNC; 20511c6fdbd8SKent Overstreet bio->bi_iter.bi_sector = offset >> 9; 20521c6fdbd8SKent Overstreet bio->bi_private = dio; 20531c6fdbd8SKent Overstreet 20541c6fdbd8SKent Overstreet ret = bio_iov_iter_get_pages(bio, iter); 20551c6fdbd8SKent Overstreet if (ret < 0) { 20561c6fdbd8SKent Overstreet /* XXX: fault inject this path */ 20571c6fdbd8SKent Overstreet bio->bi_status = BLK_STS_RESOURCE; 20581c6fdbd8SKent Overstreet bio_endio(bio); 20591c6fdbd8SKent Overstreet break; 20601c6fdbd8SKent Overstreet } 20611c6fdbd8SKent Overstreet 20621c6fdbd8SKent Overstreet offset += bio->bi_iter.bi_size; 2063b4725cc1SKent Overstreet 2064b4725cc1SKent Overstreet if (dio->should_dirty) 20651c6fdbd8SKent Overstreet bio_set_pages_dirty(bio); 20661c6fdbd8SKent Overstreet 20671c6fdbd8SKent Overstreet if (iter->count) 20681c6fdbd8SKent Overstreet closure_get(&dio->cl); 20691c6fdbd8SKent Overstreet 20708c6d298aSKent Overstreet bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); 20711c6fdbd8SKent Overstreet } 20721c6fdbd8SKent Overstreet 20731c6fdbd8SKent Overstreet iter->count += shorten; 20741c6fdbd8SKent Overstreet 20751c6fdbd8SKent Overstreet if (sync) { 20761c6fdbd8SKent Overstreet closure_sync(&dio->cl); 20771c6fdbd8SKent Overstreet closure_debug_destroy(&dio->cl); 20781c6fdbd8SKent Overstreet ret = dio->ret; 2079b4725cc1SKent Overstreet bio_check_or_release(&dio->rbio.bio, dio->should_dirty); 20801c6fdbd8SKent Overstreet return ret; 20811c6fdbd8SKent Overstreet } else { 20821c6fdbd8SKent Overstreet return -EIOCBQUEUED; 20831c6fdbd8SKent Overstreet } 20841c6fdbd8SKent Overstreet } 20851c6fdbd8SKent Overstreet 20861c6fdbd8SKent Overstreet ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) 20871c6fdbd8SKent Overstreet { 20881c6fdbd8SKent Overstreet struct file *file = iocb->ki_filp; 20891c6fdbd8SKent Overstreet struct bch_inode_info *inode = file_bch_inode(file); 20901c6fdbd8SKent Overstreet struct address_space *mapping = file->f_mapping; 20911c6fdbd8SKent Overstreet size_t count = iov_iter_count(iter); 20921c6fdbd8SKent Overstreet ssize_t ret; 20931c6fdbd8SKent Overstreet 20941c6fdbd8SKent Overstreet if (!count) 20951c6fdbd8SKent Overstreet return 0; /* skip atime */ 20961c6fdbd8SKent Overstreet 20971c6fdbd8SKent Overstreet if (iocb->ki_flags & IOCB_DIRECT) { 20981c6fdbd8SKent Overstreet struct blk_plug plug; 20991c6fdbd8SKent Overstreet 2100a023127aSKent Overstreet if (unlikely(mapping->nrpages)) { 21011c6fdbd8SKent Overstreet ret = filemap_write_and_wait_range(mapping, 21021c6fdbd8SKent Overstreet iocb->ki_pos, 21031c6fdbd8SKent Overstreet iocb->ki_pos + count - 1); 21041c6fdbd8SKent Overstreet if (ret < 0) 21055c1ef830SKent Overstreet goto out; 2106a023127aSKent Overstreet } 21071c6fdbd8SKent Overstreet 21081c6fdbd8SKent Overstreet file_accessed(file); 21091c6fdbd8SKent Overstreet 21101c6fdbd8SKent Overstreet blk_start_plug(&plug); 21111c6fdbd8SKent Overstreet ret = bch2_direct_IO_read(iocb, iter); 21121c6fdbd8SKent Overstreet blk_finish_plug(&plug); 21131c6fdbd8SKent Overstreet 21141c6fdbd8SKent Overstreet if (ret >= 0) 21151c6fdbd8SKent Overstreet iocb->ki_pos += ret; 21161c6fdbd8SKent Overstreet } else { 2117a7ecd30cSKent Overstreet bch2_pagecache_add_get(inode); 21181c6fdbd8SKent Overstreet ret = generic_file_read_iter(iocb, iter); 2119a7ecd30cSKent Overstreet bch2_pagecache_add_put(inode); 21201c6fdbd8SKent Overstreet } 21215c1ef830SKent Overstreet out: 21225c1ef830SKent Overstreet return bch2_err_class(ret); 21231c6fdbd8SKent Overstreet } 21241c6fdbd8SKent Overstreet 21251c6fdbd8SKent Overstreet /* O_DIRECT writes */ 21261c6fdbd8SKent Overstreet 21276fed42bbSKent Overstreet static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, 21286fed42bbSKent Overstreet u64 offset, u64 size, 21296fed42bbSKent Overstreet unsigned nr_replicas, bool compressed) 21306fed42bbSKent Overstreet { 21316fed42bbSKent Overstreet struct btree_trans trans; 21326fed42bbSKent Overstreet struct btree_iter iter; 21336fed42bbSKent Overstreet struct bkey_s_c k; 21346fed42bbSKent Overstreet u64 end = offset + size; 21356fed42bbSKent Overstreet u32 snapshot; 21366fed42bbSKent Overstreet bool ret = true; 21376fed42bbSKent Overstreet int err; 21386fed42bbSKent Overstreet 21396fed42bbSKent Overstreet bch2_trans_init(&trans, c, 0, 0); 21406fed42bbSKent Overstreet retry: 21416fed42bbSKent Overstreet bch2_trans_begin(&trans); 21426fed42bbSKent Overstreet 21436fed42bbSKent Overstreet err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); 21446fed42bbSKent Overstreet if (err) 21456fed42bbSKent Overstreet goto err; 21466fed42bbSKent Overstreet 2147e5fa91d7SKent Overstreet for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, 21486fed42bbSKent Overstreet SPOS(inum.inum, offset, snapshot), 21496fed42bbSKent Overstreet BTREE_ITER_SLOTS, k, err) { 2150e88a75ebSKent Overstreet if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) 21516fed42bbSKent Overstreet break; 21526fed42bbSKent Overstreet 21538c6d298aSKent Overstreet if (k.k->p.snapshot != snapshot || 21548c6d298aSKent Overstreet nr_replicas > bch2_bkey_replicas(c, k) || 21556fed42bbSKent Overstreet (!compressed && bch2_bkey_sectors_compressed(k))) { 21566fed42bbSKent Overstreet ret = false; 21576fed42bbSKent Overstreet break; 21586fed42bbSKent Overstreet } 21596fed42bbSKent Overstreet } 21606fed42bbSKent Overstreet 21616fed42bbSKent Overstreet offset = iter.pos.offset; 21626fed42bbSKent Overstreet bch2_trans_iter_exit(&trans, &iter); 21636fed42bbSKent Overstreet err: 2164549d173cSKent Overstreet if (bch2_err_matches(err, BCH_ERR_transaction_restart)) 21656fed42bbSKent Overstreet goto retry; 21666fed42bbSKent Overstreet bch2_trans_exit(&trans); 21676fed42bbSKent Overstreet 21686fed42bbSKent Overstreet return err ? false : ret; 21696fed42bbSKent Overstreet } 21706fed42bbSKent Overstreet 2171182c7bbfSKent Overstreet static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) 2172182c7bbfSKent Overstreet { 2173182c7bbfSKent Overstreet struct bch_fs *c = dio->op.c; 2174182c7bbfSKent Overstreet struct bch_inode_info *inode = dio->inode; 2175182c7bbfSKent Overstreet struct bio *bio = &dio->op.wbio.bio; 2176182c7bbfSKent Overstreet 2177182c7bbfSKent Overstreet return bch2_check_range_allocated(c, inode_inum(inode), 2178182c7bbfSKent Overstreet dio->op.pos.offset, bio_sectors(bio), 2179182c7bbfSKent Overstreet dio->op.opts.data_replicas, 2180182c7bbfSKent Overstreet dio->op.opts.compression != 0); 2181182c7bbfSKent Overstreet } 2182182c7bbfSKent Overstreet 2183a1ee777bSKent Overstreet static void bch2_dio_write_loop_async(struct bch_write_op *); 2184a1ee777bSKent Overstreet static __always_inline long bch2_dio_write_done(struct dio_write *dio); 2185a1ee777bSKent Overstreet 21861c6fdbd8SKent Overstreet /* 21871c6fdbd8SKent Overstreet * We're going to return -EIOCBQUEUED, but we haven't finished consuming the 21881c6fdbd8SKent Overstreet * iov_iter yet, so we need to stash a copy of the iovec: it might be on the 21891c6fdbd8SKent Overstreet * caller's stack, we're not guaranteed that it will live for the duration of 21901c6fdbd8SKent Overstreet * the IO: 21911c6fdbd8SKent Overstreet */ 21921c6fdbd8SKent Overstreet static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) 21931c6fdbd8SKent Overstreet { 21941c6fdbd8SKent Overstreet struct iovec *iov = dio->inline_vecs; 21951c6fdbd8SKent Overstreet 21961c6fdbd8SKent Overstreet /* 21971c6fdbd8SKent Overstreet * iov_iter has a single embedded iovec - nothing to do: 21981c6fdbd8SKent Overstreet */ 21991c6fdbd8SKent Overstreet if (iter_is_ubuf(&dio->iter)) 22001c6fdbd8SKent Overstreet return 0; 22011c6fdbd8SKent Overstreet 22021c6fdbd8SKent Overstreet /* 22031c6fdbd8SKent Overstreet * We don't currently handle non-iovec iov_iters here - return an error, 22041c6fdbd8SKent Overstreet * and we'll fall back to doing the IO synchronously: 22051c6fdbd8SKent Overstreet */ 22061c6fdbd8SKent Overstreet if (!iter_is_iovec(&dio->iter)) 22071c6fdbd8SKent Overstreet return -1; 22081c6fdbd8SKent Overstreet 22091c6fdbd8SKent Overstreet if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { 22101c6fdbd8SKent Overstreet iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), 22111c6fdbd8SKent Overstreet GFP_KERNEL); 22121c6fdbd8SKent Overstreet if (unlikely(!iov)) 22131c6fdbd8SKent Overstreet return -ENOMEM; 22141c6fdbd8SKent Overstreet 22151c6fdbd8SKent Overstreet dio->free_iov = true; 22161c6fdbd8SKent Overstreet } 22171c6fdbd8SKent Overstreet 22181c6fdbd8SKent Overstreet memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); 22191c6fdbd8SKent Overstreet dio->iter.__iov = iov; 22201c6fdbd8SKent Overstreet return 0; 22211c6fdbd8SKent Overstreet } 22221c6fdbd8SKent Overstreet 2223a1ee777bSKent Overstreet static void bch2_dio_write_flush_done(struct closure *cl) 2224a1ee777bSKent Overstreet { 2225a1ee777bSKent Overstreet struct dio_write *dio = container_of(cl, struct dio_write, op.cl); 2226a1ee777bSKent Overstreet struct bch_fs *c = dio->op.c; 2227a1ee777bSKent Overstreet 2228a1ee777bSKent Overstreet closure_debug_destroy(cl); 2229a1ee777bSKent Overstreet 2230a1ee777bSKent Overstreet dio->op.error = bch2_journal_error(&c->journal); 2231a1ee777bSKent Overstreet 2232a1ee777bSKent Overstreet bch2_dio_write_done(dio); 2233a1ee777bSKent Overstreet } 2234a1ee777bSKent Overstreet 2235a1ee777bSKent Overstreet static noinline void bch2_dio_write_flush(struct dio_write *dio) 2236a1ee777bSKent Overstreet { 2237a1ee777bSKent Overstreet struct bch_fs *c = dio->op.c; 2238a1ee777bSKent Overstreet struct bch_inode_unpacked inode; 2239a1ee777bSKent Overstreet int ret; 2240a1ee777bSKent Overstreet 2241a1ee777bSKent Overstreet dio->flush = 0; 2242a1ee777bSKent Overstreet 2243a1ee777bSKent Overstreet closure_init(&dio->op.cl, NULL); 2244a1ee777bSKent Overstreet 2245a1ee777bSKent Overstreet if (!dio->op.error) { 2246a1ee777bSKent Overstreet ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); 2247a8b3a677SKent Overstreet if (ret) { 2248a1ee777bSKent Overstreet dio->op.error = ret; 2249a8b3a677SKent Overstreet } else { 2250a1ee777bSKent Overstreet bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); 2251a8b3a677SKent Overstreet bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); 2252a8b3a677SKent Overstreet } 2253a1ee777bSKent Overstreet } 2254a1ee777bSKent Overstreet 2255a1ee777bSKent Overstreet if (dio->sync) { 2256a1ee777bSKent Overstreet closure_sync(&dio->op.cl); 2257a1ee777bSKent Overstreet closure_debug_destroy(&dio->op.cl); 2258a1ee777bSKent Overstreet } else { 2259a1ee777bSKent Overstreet continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); 2260a1ee777bSKent Overstreet } 2261a1ee777bSKent Overstreet } 2262042a1f26SKent Overstreet 2263182c7bbfSKent Overstreet static __always_inline long bch2_dio_write_done(struct dio_write *dio) 2264182c7bbfSKent Overstreet { 2265182c7bbfSKent Overstreet struct kiocb *req = dio->req; 2266182c7bbfSKent Overstreet struct bch_inode_info *inode = dio->inode; 2267182c7bbfSKent Overstreet bool sync = dio->sync; 2268a1ee777bSKent Overstreet long ret; 2269a1ee777bSKent Overstreet 2270a1ee777bSKent Overstreet if (unlikely(dio->flush)) { 2271a1ee777bSKent Overstreet bch2_dio_write_flush(dio); 2272a1ee777bSKent Overstreet if (!sync) 2273a1ee777bSKent Overstreet return -EIOCBQUEUED; 2274a1ee777bSKent Overstreet } 2275182c7bbfSKent Overstreet 2276a7ecd30cSKent Overstreet bch2_pagecache_block_put(inode); 2277182c7bbfSKent Overstreet 2278182c7bbfSKent Overstreet if (dio->free_iov) 2279182c7bbfSKent Overstreet kfree(dio->iter.__iov); 2280a1ee777bSKent Overstreet 2281a1ee777bSKent Overstreet ret = dio->op.error ?: ((long) dio->written << 9); 2282182c7bbfSKent Overstreet bio_put(&dio->op.wbio.bio); 2283182c7bbfSKent Overstreet 2284182c7bbfSKent Overstreet /* inode->i_dio_count is our ref on inode and thus bch_fs */ 2285182c7bbfSKent Overstreet inode_dio_end(&inode->v); 2286182c7bbfSKent Overstreet 2287182c7bbfSKent Overstreet if (ret < 0) 2288182c7bbfSKent Overstreet ret = bch2_err_class(ret); 2289182c7bbfSKent Overstreet 2290182c7bbfSKent Overstreet if (!sync) { 2291182c7bbfSKent Overstreet req->ki_complete(req, ret); 2292182c7bbfSKent Overstreet ret = -EIOCBQUEUED; 2293182c7bbfSKent Overstreet } 2294182c7bbfSKent Overstreet return ret; 2295182c7bbfSKent Overstreet } 2296182c7bbfSKent Overstreet 2297182c7bbfSKent Overstreet static __always_inline void bch2_dio_write_end(struct dio_write *dio) 2298182c7bbfSKent Overstreet { 2299182c7bbfSKent Overstreet struct bch_fs *c = dio->op.c; 2300182c7bbfSKent Overstreet struct kiocb *req = dio->req; 2301182c7bbfSKent Overstreet struct bch_inode_info *inode = dio->inode; 2302182c7bbfSKent Overstreet struct bio *bio = &dio->op.wbio.bio; 2303182c7bbfSKent Overstreet 2304182c7bbfSKent Overstreet req->ki_pos += (u64) dio->op.written << 9; 2305182c7bbfSKent Overstreet dio->written += dio->op.written; 2306182c7bbfSKent Overstreet 23076b1b186aSKent Overstreet if (dio->extending) { 2308182c7bbfSKent Overstreet spin_lock(&inode->v.i_lock); 2309182c7bbfSKent Overstreet if (req->ki_pos > inode->v.i_size) 2310182c7bbfSKent Overstreet i_size_write(&inode->v, req->ki_pos); 2311182c7bbfSKent Overstreet spin_unlock(&inode->v.i_lock); 23126b1b186aSKent Overstreet } 23136b1b186aSKent Overstreet 23146b1b186aSKent Overstreet if (dio->op.i_sectors_delta || dio->quota_res.sectors) { 23156b1b186aSKent Overstreet mutex_lock(&inode->ei_quota_lock); 23166b1b186aSKent Overstreet __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); 23176b1b186aSKent Overstreet __bch2_quota_reservation_put(c, inode, &dio->quota_res); 23186b1b186aSKent Overstreet mutex_unlock(&inode->ei_quota_lock); 23196b1b186aSKent Overstreet } 2320182c7bbfSKent Overstreet 2321182c7bbfSKent Overstreet bio_release_pages(bio, false); 2322182c7bbfSKent Overstreet 2323182c7bbfSKent Overstreet if (unlikely(dio->op.error)) 2324182c7bbfSKent Overstreet set_bit(EI_INODE_ERROR, &inode->ei_flags); 2325182c7bbfSKent Overstreet } 2326182c7bbfSKent Overstreet 23274d868d18SKent Overstreet static __always_inline long bch2_dio_write_loop(struct dio_write *dio) 23281c6fdbd8SKent Overstreet { 2329182c7bbfSKent Overstreet struct bch_fs *c = dio->op.c; 23301c6fdbd8SKent Overstreet struct kiocb *req = dio->req; 2331182c7bbfSKent Overstreet struct address_space *mapping = dio->mapping; 2332182c7bbfSKent Overstreet struct bch_inode_info *inode = dio->inode; 233301ad6737SKent Overstreet struct bch_io_opts opts; 23349a3df993SKent Overstreet struct bio *bio = &dio->op.wbio.bio; 2335eb8e6e9cSKent Overstreet unsigned unaligned, iter_count; 2336eb8e6e9cSKent Overstreet bool sync = dio->sync, dropped_locks; 23371c6fdbd8SKent Overstreet long ret; 23381c6fdbd8SKent Overstreet 233901ad6737SKent Overstreet bch2_inode_opts_get(&opts, c, &inode->ei_inode); 234001ad6737SKent Overstreet 23411c6fdbd8SKent Overstreet while (1) { 2342eb8e6e9cSKent Overstreet iter_count = dio->iter.count; 2343eb8e6e9cSKent Overstreet 2344182c7bbfSKent Overstreet EBUG_ON(current->faults_disabled_mapping); 23451c6fdbd8SKent Overstreet current->faults_disabled_mapping = mapping; 23461c6fdbd8SKent Overstreet 23471c6fdbd8SKent Overstreet ret = bio_iov_iter_get_pages(bio, &dio->iter); 23481c6fdbd8SKent Overstreet 2349eb8e6e9cSKent Overstreet dropped_locks = fdm_dropped_locks(); 2350eb8e6e9cSKent Overstreet 23511c6fdbd8SKent Overstreet current->faults_disabled_mapping = NULL; 23521c6fdbd8SKent Overstreet 2353eb8e6e9cSKent Overstreet /* 2354eb8e6e9cSKent Overstreet * If the fault handler returned an error but also signalled 2355eb8e6e9cSKent Overstreet * that it dropped & retook ei_pagecache_lock, we just need to 2356eb8e6e9cSKent Overstreet * re-shoot down the page cache and retry: 2357eb8e6e9cSKent Overstreet */ 2358eb8e6e9cSKent Overstreet if (dropped_locks && ret) 2359eb8e6e9cSKent Overstreet ret = 0; 2360eb8e6e9cSKent Overstreet 23611c6fdbd8SKent Overstreet if (unlikely(ret < 0)) 23621c6fdbd8SKent Overstreet goto err; 23631c6fdbd8SKent Overstreet 2364eb8e6e9cSKent Overstreet if (unlikely(dropped_locks)) { 2365eb8e6e9cSKent Overstreet ret = write_invalidate_inode_pages_range(mapping, 2366eb8e6e9cSKent Overstreet req->ki_pos, 2367eb8e6e9cSKent Overstreet req->ki_pos + iter_count - 1); 2368eb8e6e9cSKent Overstreet if (unlikely(ret)) 2369eb8e6e9cSKent Overstreet goto err; 2370eb8e6e9cSKent Overstreet 2371eb8e6e9cSKent Overstreet if (!bio->bi_iter.bi_size) 2372eb8e6e9cSKent Overstreet continue; 2373eb8e6e9cSKent Overstreet } 2374eb8e6e9cSKent Overstreet 23750a426c32SKent Overstreet unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); 23760a426c32SKent Overstreet bio->bi_iter.bi_size -= unaligned; 23770a426c32SKent Overstreet iov_iter_revert(&dio->iter, unaligned); 23780a426c32SKent Overstreet 23790a426c32SKent Overstreet if (!bio->bi_iter.bi_size) { 23800a426c32SKent Overstreet /* 23810a426c32SKent Overstreet * bio_iov_iter_get_pages was only able to get < 23820a426c32SKent Overstreet * blocksize worth of pages: 23830a426c32SKent Overstreet */ 23840a426c32SKent Overstreet ret = -EFAULT; 23850a426c32SKent Overstreet goto err; 23860a426c32SKent Overstreet } 23870a426c32SKent Overstreet 238801ad6737SKent Overstreet bch2_write_op_init(&dio->op, c, opts); 2389182c7bbfSKent Overstreet dio->op.end_io = sync 2390182c7bbfSKent Overstreet ? NULL 2391182c7bbfSKent Overstreet : bch2_dio_write_loop_async; 2392042a1f26SKent Overstreet dio->op.target = dio->op.opts.foreground_target; 2393042a1f26SKent Overstreet dio->op.write_point = writepoint_hashed((unsigned long) current); 2394042a1f26SKent Overstreet dio->op.nr_replicas = dio->op.opts.data_replicas; 23958c6d298aSKent Overstreet dio->op.subvol = inode->ei_subvol; 2396042a1f26SKent Overstreet dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); 2397a8b3a677SKent Overstreet dio->op.devs_need_flush = &inode->ei_devs_need_flush; 2398042a1f26SKent Overstreet 23991df3e199SKent Overstreet if (sync) 24001df3e199SKent Overstreet dio->op.flags |= BCH_WRITE_SYNC; 2401a6336910SKent Overstreet dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; 2402042a1f26SKent Overstreet 24036b1b186aSKent Overstreet ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, 24046b1b186aSKent Overstreet bio_sectors(bio), true); 24056b1b186aSKent Overstreet if (unlikely(ret)) 24066b1b186aSKent Overstreet goto err; 24076b1b186aSKent Overstreet 2408042a1f26SKent Overstreet ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), 2409042a1f26SKent Overstreet dio->op.opts.data_replicas, 0); 2410042a1f26SKent Overstreet if (unlikely(ret) && 2411182c7bbfSKent Overstreet !bch2_dio_write_check_allocated(dio)) 2412042a1f26SKent Overstreet goto err; 24131c6fdbd8SKent Overstreet 24141c6fdbd8SKent Overstreet task_io_account_write(bio->bi_iter.bi_size); 24151c6fdbd8SKent Overstreet 2416182c7bbfSKent Overstreet if (unlikely(dio->iter.count) && 2417182c7bbfSKent Overstreet !dio->sync && 2418182c7bbfSKent Overstreet !dio->loop && 2419182c7bbfSKent Overstreet bch2_dio_write_copy_iov(dio)) 2420286d8ad0SKent Overstreet dio->sync = sync = true; 2421182c7bbfSKent Overstreet 24221c6fdbd8SKent Overstreet dio->loop = true; 2423f8f30863SKent Overstreet closure_call(&dio->op.cl, bch2_write, NULL, NULL); 24241c6fdbd8SKent Overstreet 2425182c7bbfSKent Overstreet if (!sync) 24261c6fdbd8SKent Overstreet return -EIOCBQUEUED; 24279a3df993SKent Overstreet 2428182c7bbfSKent Overstreet bch2_dio_write_end(dio); 24299a3df993SKent Overstreet 2430182c7bbfSKent Overstreet if (likely(!dio->iter.count) || dio->op.error) 24311c6fdbd8SKent Overstreet break; 2432f8f30863SKent Overstreet 24331c6fdbd8SKent Overstreet bio_reset(bio, NULL, REQ_OP_WRITE); 24341c6fdbd8SKent Overstreet } 2435182c7bbfSKent Overstreet out: 2436182c7bbfSKent Overstreet return bch2_dio_write_done(dio); 24371c6fdbd8SKent Overstreet err: 2438182c7bbfSKent Overstreet dio->op.error = ret; 24391c6fdbd8SKent Overstreet 24405468f119SKent Overstreet bio_release_pages(bio, false); 24416b1b186aSKent Overstreet 24426b1b186aSKent Overstreet bch2_quota_reservation_put(c, inode, &dio->quota_res); 2443182c7bbfSKent Overstreet goto out; 24441c6fdbd8SKent Overstreet } 24451c6fdbd8SKent Overstreet 24464d868d18SKent Overstreet static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) 24471c6fdbd8SKent Overstreet { 2448182c7bbfSKent Overstreet struct mm_struct *mm = dio->mm; 24491c6fdbd8SKent Overstreet 2450182c7bbfSKent Overstreet bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); 2451182c7bbfSKent Overstreet 2452182c7bbfSKent Overstreet if (mm) 2453182c7bbfSKent Overstreet kthread_use_mm(mm); 24541c6fdbd8SKent Overstreet bch2_dio_write_loop(dio); 2455182c7bbfSKent Overstreet if (mm) 2456182c7bbfSKent Overstreet kthread_unuse_mm(mm); 24571c6fdbd8SKent Overstreet } 24581c6fdbd8SKent Overstreet 24594d868d18SKent Overstreet static void bch2_dio_write_loop_async(struct bch_write_op *op) 24604d868d18SKent Overstreet { 24614d868d18SKent Overstreet struct dio_write *dio = container_of(op, struct dio_write, op); 24624d868d18SKent Overstreet 24634d868d18SKent Overstreet bch2_dio_write_end(dio); 24644d868d18SKent Overstreet 24654d868d18SKent Overstreet if (likely(!dio->iter.count) || dio->op.error) 24664d868d18SKent Overstreet bch2_dio_write_done(dio); 24674d868d18SKent Overstreet else 24684d868d18SKent Overstreet bch2_dio_write_continue(dio); 24694d868d18SKent Overstreet } 24704d868d18SKent Overstreet 24711c6fdbd8SKent Overstreet static noinline 24721c6fdbd8SKent Overstreet ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) 24731c6fdbd8SKent Overstreet { 24741c6fdbd8SKent Overstreet struct file *file = req->ki_filp; 247554847d25SKent Overstreet struct address_space *mapping = file->f_mapping; 24761c6fdbd8SKent Overstreet struct bch_inode_info *inode = file_bch_inode(file); 24771c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 24781c6fdbd8SKent Overstreet struct dio_write *dio; 24791c6fdbd8SKent Overstreet struct bio *bio; 24807edcfbfeSKent Overstreet bool locked = true, extending; 24811c6fdbd8SKent Overstreet ssize_t ret; 24821c6fdbd8SKent Overstreet 24837edcfbfeSKent Overstreet prefetch(&c->opts); 24847edcfbfeSKent Overstreet prefetch((void *) &c->opts + 64); 24857edcfbfeSKent Overstreet prefetch(&inode->ei_inode); 24867edcfbfeSKent Overstreet prefetch((void *) &inode->ei_inode + 64); 24871c6fdbd8SKent Overstreet 24887edcfbfeSKent Overstreet inode_lock(&inode->v); 24897edcfbfeSKent Overstreet 24907edcfbfeSKent Overstreet ret = generic_write_checks(req, iter); 24917edcfbfeSKent Overstreet if (unlikely(ret <= 0)) 24927edcfbfeSKent Overstreet goto err; 24937edcfbfeSKent Overstreet 24947edcfbfeSKent Overstreet ret = file_remove_privs(file); 24957edcfbfeSKent Overstreet if (unlikely(ret)) 24967edcfbfeSKent Overstreet goto err; 24977edcfbfeSKent Overstreet 24987edcfbfeSKent Overstreet ret = file_update_time(file); 24997edcfbfeSKent Overstreet if (unlikely(ret)) 25007edcfbfeSKent Overstreet goto err; 25011c6fdbd8SKent Overstreet 2502919dbbd1SKent Overstreet if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) 25037edcfbfeSKent Overstreet goto err; 25047edcfbfeSKent Overstreet 25057edcfbfeSKent Overstreet inode_dio_begin(&inode->v); 2506a7ecd30cSKent Overstreet bch2_pagecache_block_get(inode); 25077edcfbfeSKent Overstreet 25087edcfbfeSKent Overstreet extending = req->ki_pos + iter->count > inode->v.i_size; 25097edcfbfeSKent Overstreet if (!extending) { 25107edcfbfeSKent Overstreet inode_unlock(&inode->v); 25117edcfbfeSKent Overstreet locked = false; 25127edcfbfeSKent Overstreet } 25131c6fdbd8SKent Overstreet 25141c6fdbd8SKent Overstreet bio = bio_alloc_bioset(NULL, 25154d126dc8SKent Overstreet bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), 25161c6fdbd8SKent Overstreet REQ_OP_WRITE, 25171c6fdbd8SKent Overstreet GFP_KERNEL, 25181c6fdbd8SKent Overstreet &c->dio_write_bioset); 25199a3df993SKent Overstreet dio = container_of(bio, struct dio_write, op.wbio.bio); 25201c6fdbd8SKent Overstreet dio->req = req; 2521182c7bbfSKent Overstreet dio->mapping = mapping; 2522182c7bbfSKent Overstreet dio->inode = inode; 2523ed484030SKent Overstreet dio->mm = current->mm; 25241c6fdbd8SKent Overstreet dio->loop = false; 25256b1b186aSKent Overstreet dio->extending = extending; 25267edcfbfeSKent Overstreet dio->sync = is_sync_kiocb(req) || extending; 2527a1ee777bSKent Overstreet dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; 25281c6fdbd8SKent Overstreet dio->free_iov = false; 25291c6fdbd8SKent Overstreet dio->quota_res.sectors = 0; 2530042a1f26SKent Overstreet dio->written = 0; 25311c6fdbd8SKent Overstreet dio->iter = *iter; 2532182c7bbfSKent Overstreet dio->op.c = c; 25339a3df993SKent Overstreet 2534a023127aSKent Overstreet if (unlikely(mapping->nrpages)) { 253554847d25SKent Overstreet ret = write_invalidate_inode_pages_range(mapping, 253654847d25SKent Overstreet req->ki_pos, 253754847d25SKent Overstreet req->ki_pos + iter->count - 1); 253854847d25SKent Overstreet if (unlikely(ret)) 253954847d25SKent Overstreet goto err_put_bio; 2540a023127aSKent Overstreet } 254154847d25SKent Overstreet 25427edcfbfeSKent Overstreet ret = bch2_dio_write_loop(dio); 25431c6fdbd8SKent Overstreet err: 25447edcfbfeSKent Overstreet if (locked) 25457edcfbfeSKent Overstreet inode_unlock(&inode->v); 25467edcfbfeSKent Overstreet return ret; 25477edcfbfeSKent Overstreet err_put_bio: 2548a7ecd30cSKent Overstreet bch2_pagecache_block_put(inode); 25491c6fdbd8SKent Overstreet bio_put(bio); 25507edcfbfeSKent Overstreet inode_dio_end(&inode->v); 25517edcfbfeSKent Overstreet goto err; 25521c6fdbd8SKent Overstreet } 25531c6fdbd8SKent Overstreet 25547edcfbfeSKent Overstreet ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) 25551c6fdbd8SKent Overstreet { 25561c6fdbd8SKent Overstreet struct file *file = iocb->ki_filp; 25577edcfbfeSKent Overstreet struct bch_inode_info *inode = file_bch_inode(file); 25581c6fdbd8SKent Overstreet ssize_t ret; 25591c6fdbd8SKent Overstreet 25605c1ef830SKent Overstreet if (iocb->ki_flags & IOCB_DIRECT) { 25615c1ef830SKent Overstreet ret = bch2_direct_write(iocb, from); 25625c1ef830SKent Overstreet goto out; 25635c1ef830SKent Overstreet } 25641c6fdbd8SKent Overstreet 25657edcfbfeSKent Overstreet inode_lock(&inode->v); 25667edcfbfeSKent Overstreet 25677edcfbfeSKent Overstreet ret = generic_write_checks(iocb, from); 25687edcfbfeSKent Overstreet if (ret <= 0) 25697edcfbfeSKent Overstreet goto unlock; 25707edcfbfeSKent Overstreet 25711c6fdbd8SKent Overstreet ret = file_remove_privs(file); 25721c6fdbd8SKent Overstreet if (ret) 25737edcfbfeSKent Overstreet goto unlock; 25741c6fdbd8SKent Overstreet 25751c6fdbd8SKent Overstreet ret = file_update_time(file); 25761c6fdbd8SKent Overstreet if (ret) 25777edcfbfeSKent Overstreet goto unlock; 25781c6fdbd8SKent Overstreet 25797edcfbfeSKent Overstreet ret = bch2_buffered_write(iocb, from); 25801c6fdbd8SKent Overstreet if (likely(ret > 0)) 25811c6fdbd8SKent Overstreet iocb->ki_pos += ret; 25827edcfbfeSKent Overstreet unlock: 25831c6fdbd8SKent Overstreet inode_unlock(&inode->v); 25841c6fdbd8SKent Overstreet 25857edcfbfeSKent Overstreet if (ret > 0) 25861c6fdbd8SKent Overstreet ret = generic_write_sync(iocb, ret); 25875c1ef830SKent Overstreet out: 25885c1ef830SKent Overstreet return bch2_err_class(ret); 25891c6fdbd8SKent Overstreet } 25901c6fdbd8SKent Overstreet 25911c6fdbd8SKent Overstreet /* fsync: */ 25921c6fdbd8SKent Overstreet 259368a2054dSKent Overstreet /* 259468a2054dSKent Overstreet * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an 259568a2054dSKent Overstreet * insert trigger: look up the btree inode instead 259668a2054dSKent Overstreet */ 2597a8b3a677SKent Overstreet static int bch2_flush_inode(struct bch_fs *c, 2598a8b3a677SKent Overstreet struct bch_inode_info *inode) 259968a2054dSKent Overstreet { 2600a8b3a677SKent Overstreet struct bch_inode_unpacked u; 260168a2054dSKent Overstreet int ret; 260268a2054dSKent Overstreet 260368a2054dSKent Overstreet if (c->opts.journal_flush_disabled) 260468a2054dSKent Overstreet return 0; 260568a2054dSKent Overstreet 2606a8b3a677SKent Overstreet ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u); 260768a2054dSKent Overstreet if (ret) 260868a2054dSKent Overstreet return ret; 260968a2054dSKent Overstreet 2610a8b3a677SKent Overstreet return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: 2611a8b3a677SKent Overstreet bch2_inode_flush_nocow_writes(c, inode); 261268a2054dSKent Overstreet } 261368a2054dSKent Overstreet 26141c6fdbd8SKent Overstreet int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) 26151c6fdbd8SKent Overstreet { 26161c6fdbd8SKent Overstreet struct bch_inode_info *inode = file_bch_inode(file); 26171c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 261868a2054dSKent Overstreet int ret, ret2, ret3; 26191c6fdbd8SKent Overstreet 26201c6fdbd8SKent Overstreet ret = file_write_and_wait_range(file, start, end); 262168a2054dSKent Overstreet ret2 = sync_inode_metadata(&inode->v, 1); 2622a8b3a677SKent Overstreet ret3 = bch2_flush_inode(c, inode); 26231c6fdbd8SKent Overstreet 26245c1ef830SKent Overstreet return bch2_err_class(ret ?: ret2 ?: ret3); 26251c6fdbd8SKent Overstreet } 26261c6fdbd8SKent Overstreet 26271c6fdbd8SKent Overstreet /* truncate: */ 26281c6fdbd8SKent Overstreet 26296fed42bbSKent Overstreet static inline int range_has_data(struct bch_fs *c, u32 subvol, 26301c6fdbd8SKent Overstreet struct bpos start, 26311c6fdbd8SKent Overstreet struct bpos end) 26321c6fdbd8SKent Overstreet { 2633424eb881SKent Overstreet struct btree_trans trans; 263467e0dd8fSKent Overstreet struct btree_iter iter; 26351c6fdbd8SKent Overstreet struct bkey_s_c k; 26361c6fdbd8SKent Overstreet int ret = 0; 26371c6fdbd8SKent Overstreet 263820bceecbSKent Overstreet bch2_trans_init(&trans, c, 0, 0); 26396fed42bbSKent Overstreet retry: 26406fed42bbSKent Overstreet bch2_trans_begin(&trans); 26416fed42bbSKent Overstreet 26426fed42bbSKent Overstreet ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot); 26436fed42bbSKent Overstreet if (ret) 26446fed42bbSKent Overstreet goto err; 2645424eb881SKent Overstreet 2646c72f687aSKent Overstreet for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret) 26471c6fdbd8SKent Overstreet if (bkey_extent_is_data(k.k)) { 26481c6fdbd8SKent Overstreet ret = 1; 26491c6fdbd8SKent Overstreet break; 26501c6fdbd8SKent Overstreet } 26516fed42bbSKent Overstreet start = iter.pos; 265267e0dd8fSKent Overstreet bch2_trans_iter_exit(&trans, &iter); 26536fed42bbSKent Overstreet err: 2654549d173cSKent Overstreet if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 26556fed42bbSKent Overstreet goto retry; 26561c6fdbd8SKent Overstreet 26579a796fdbSKent Overstreet bch2_trans_exit(&trans); 26589a796fdbSKent Overstreet return ret; 26591c6fdbd8SKent Overstreet } 26601c6fdbd8SKent Overstreet 26611c6fdbd8SKent Overstreet static int __bch2_truncate_page(struct bch_inode_info *inode, 26621c6fdbd8SKent Overstreet pgoff_t index, loff_t start, loff_t end) 26631c6fdbd8SKent Overstreet { 26641c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 26651c6fdbd8SKent Overstreet struct address_space *mapping = inode->v.i_mapping; 26663342ac13SKent Overstreet struct bch_folio *s; 26671c6fdbd8SKent Overstreet unsigned start_offset = start & (PAGE_SIZE - 1); 26681c6fdbd8SKent Overstreet unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; 2669a99b1cafSKent Overstreet unsigned i; 2670*30bff594SKent Overstreet struct folio *folio; 2671b19d307dSKent Overstreet s64 i_sectors_delta = 0; 26721c6fdbd8SKent Overstreet int ret = 0; 26731c6fdbd8SKent Overstreet 26741c6fdbd8SKent Overstreet /* Page boundary? Nothing to do */ 26751c6fdbd8SKent Overstreet if (!((index == start >> PAGE_SHIFT && start_offset) || 26761c6fdbd8SKent Overstreet (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) 26771c6fdbd8SKent Overstreet return 0; 26781c6fdbd8SKent Overstreet 26791c6fdbd8SKent Overstreet /* Above i_size? */ 26801c6fdbd8SKent Overstreet if (index << PAGE_SHIFT >= inode->v.i_size) 26811c6fdbd8SKent Overstreet return 0; 26821c6fdbd8SKent Overstreet 2683*30bff594SKent Overstreet folio = filemap_lock_folio(mapping, index); 2684*30bff594SKent Overstreet if (!folio) { 26851c6fdbd8SKent Overstreet /* 26861c6fdbd8SKent Overstreet * XXX: we're doing two index lookups when we end up reading the 2687*30bff594SKent Overstreet * folio 26881c6fdbd8SKent Overstreet */ 26896fed42bbSKent Overstreet ret = range_has_data(c, inode->ei_subvol, 2690c72f687aSKent Overstreet POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), 2691c72f687aSKent Overstreet POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); 26921c6fdbd8SKent Overstreet if (ret <= 0) 26931c6fdbd8SKent Overstreet return ret; 26941c6fdbd8SKent Overstreet 2695*30bff594SKent Overstreet folio = __filemap_get_folio(mapping, index, 2696*30bff594SKent Overstreet FGP_LOCK|FGP_CREAT, GFP_KERNEL); 2697*30bff594SKent Overstreet if (unlikely(!folio)) { 26981c6fdbd8SKent Overstreet ret = -ENOMEM; 26991c6fdbd8SKent Overstreet goto out; 27001c6fdbd8SKent Overstreet } 27011c6fdbd8SKent Overstreet } 27021c6fdbd8SKent Overstreet 2703*30bff594SKent Overstreet s = bch2_folio_create(folio, 0); 2704a99b1cafSKent Overstreet if (!s) { 2705a99b1cafSKent Overstreet ret = -ENOMEM; 2706a99b1cafSKent Overstreet goto unlock; 2707a99b1cafSKent Overstreet } 2708a99b1cafSKent Overstreet 2709*30bff594SKent Overstreet if (!folio_test_uptodate(folio)) { 2710*30bff594SKent Overstreet ret = bch2_read_single_folio(folio, mapping); 27111c6fdbd8SKent Overstreet if (ret) 27121c6fdbd8SKent Overstreet goto unlock; 27131c6fdbd8SKent Overstreet } 27141c6fdbd8SKent Overstreet 2715c437e153SKent Overstreet BUG_ON(!s->uptodate); 2716c437e153SKent Overstreet 2717a99b1cafSKent Overstreet if (index != start >> PAGE_SHIFT) 2718a99b1cafSKent Overstreet start_offset = 0; 2719a99b1cafSKent Overstreet if (index != end >> PAGE_SHIFT) 2720a99b1cafSKent Overstreet end_offset = PAGE_SIZE; 2721a99b1cafSKent Overstreet 2722a99b1cafSKent Overstreet for (i = round_up(start_offset, block_bytes(c)) >> 9; 2723a99b1cafSKent Overstreet i < round_down(end_offset, block_bytes(c)) >> 9; 2724a99b1cafSKent Overstreet i++) { 2725a99b1cafSKent Overstreet s->s[i].nr_replicas = 0; 2726b19d307dSKent Overstreet if (s->s[i].state == SECTOR_DIRTY) 2727b19d307dSKent Overstreet i_sectors_delta--; 2728a99b1cafSKent Overstreet s->s[i].state = SECTOR_UNALLOCATED; 2729a99b1cafSKent Overstreet } 2730a99b1cafSKent Overstreet 2731b19d307dSKent Overstreet i_sectors_acct(c, inode, NULL, i_sectors_delta); 2732b19d307dSKent Overstreet 273374163da7SKent Overstreet /* 2734*30bff594SKent Overstreet * Caller needs to know whether this folio will be written out by 273574163da7SKent Overstreet * writeback - doing an i_size update if necessary - or whether it will 273674163da7SKent Overstreet * be responsible for the i_size update: 273774163da7SKent Overstreet */ 273874163da7SKent Overstreet ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT), 273974163da7SKent Overstreet PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY; 274074163da7SKent Overstreet 2741*30bff594SKent Overstreet folio_zero_segment(folio, start_offset, end_offset); 2742a99b1cafSKent Overstreet 27431c6fdbd8SKent Overstreet /* 27441c6fdbd8SKent Overstreet * Bit of a hack - we don't want truncate to fail due to -ENOSPC. 27451c6fdbd8SKent Overstreet * 2746*30bff594SKent Overstreet * XXX: because we aren't currently tracking whether the folio has actual 27471c6fdbd8SKent Overstreet * data in it (vs. just 0s, or only partially written) this wrong. ick. 27481c6fdbd8SKent Overstreet */ 2749*30bff594SKent Overstreet BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false)); 27501c6fdbd8SKent Overstreet 27519ba2eb25SKent Overstreet /* 27529ba2eb25SKent Overstreet * This removes any writeable userspace mappings; we need to force 27539ba2eb25SKent Overstreet * .page_mkwrite to be called again before any mmapped writes, to 27549ba2eb25SKent Overstreet * redirty the full page: 27559ba2eb25SKent Overstreet */ 2756*30bff594SKent Overstreet folio_mkclean(folio); 2757*30bff594SKent Overstreet filemap_dirty_folio(mapping, folio); 27581c6fdbd8SKent Overstreet unlock: 2759*30bff594SKent Overstreet folio_unlock(folio); 2760*30bff594SKent Overstreet folio_put(folio); 27611c6fdbd8SKent Overstreet out: 27621c6fdbd8SKent Overstreet return ret; 27631c6fdbd8SKent Overstreet } 27641c6fdbd8SKent Overstreet 27651c6fdbd8SKent Overstreet static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) 27661c6fdbd8SKent Overstreet { 27671c6fdbd8SKent Overstreet return __bch2_truncate_page(inode, from >> PAGE_SHIFT, 2768a99b1cafSKent Overstreet from, round_up(from, PAGE_SIZE)); 27691c6fdbd8SKent Overstreet } 27701c6fdbd8SKent Overstreet 277174163da7SKent Overstreet static int bch2_truncate_pages(struct bch_inode_info *inode, 277274163da7SKent Overstreet loff_t start, loff_t end) 277374163da7SKent Overstreet { 277474163da7SKent Overstreet int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT, 277574163da7SKent Overstreet start, end); 277674163da7SKent Overstreet 277774163da7SKent Overstreet if (ret >= 0 && 277874163da7SKent Overstreet start >> PAGE_SHIFT != end >> PAGE_SHIFT) 277974163da7SKent Overstreet ret = __bch2_truncate_page(inode, 278074163da7SKent Overstreet end >> PAGE_SHIFT, 278174163da7SKent Overstreet start, end); 278274163da7SKent Overstreet return ret; 278374163da7SKent Overstreet } 278474163da7SKent Overstreet 278568a507a2SKent Overstreet static int bch2_extend(struct mnt_idmap *idmap, 278668a507a2SKent Overstreet struct bch_inode_info *inode, 2787e0541a93SKent Overstreet struct bch_inode_unpacked *inode_u, 2788e0541a93SKent Overstreet struct iattr *iattr) 27891c6fdbd8SKent Overstreet { 27901c6fdbd8SKent Overstreet struct address_space *mapping = inode->v.i_mapping; 27911c6fdbd8SKent Overstreet int ret; 27921c6fdbd8SKent Overstreet 2793e0541a93SKent Overstreet /* 2794e0541a93SKent Overstreet * sync appends: 27952925fc49SKent Overstreet * 27962925fc49SKent Overstreet * this has to be done _before_ extending i_size: 2797e0541a93SKent Overstreet */ 2798e0541a93SKent Overstreet ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); 27991c6fdbd8SKent Overstreet if (ret) 28001c6fdbd8SKent Overstreet return ret; 28011c6fdbd8SKent Overstreet 28021c6fdbd8SKent Overstreet truncate_setsize(&inode->v, iattr->ia_size); 28031c6fdbd8SKent Overstreet 280468a507a2SKent Overstreet return bch2_setattr_nonsize(idmap, inode, iattr); 28051c6fdbd8SKent Overstreet } 28061c6fdbd8SKent Overstreet 280754e2264eSKent Overstreet static int bch2_truncate_finish_fn(struct bch_inode_info *inode, 280854e2264eSKent Overstreet struct bch_inode_unpacked *bi, 280954e2264eSKent Overstreet void *p) 281054e2264eSKent Overstreet { 281154e2264eSKent Overstreet bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; 281254e2264eSKent Overstreet return 0; 281354e2264eSKent Overstreet } 281454e2264eSKent Overstreet 281554e2264eSKent Overstreet static int bch2_truncate_start_fn(struct bch_inode_info *inode, 281654e2264eSKent Overstreet struct bch_inode_unpacked *bi, void *p) 281754e2264eSKent Overstreet { 281854e2264eSKent Overstreet u64 *new_i_size = p; 281954e2264eSKent Overstreet 282054e2264eSKent Overstreet bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; 282154e2264eSKent Overstreet bi->bi_size = *new_i_size; 282254e2264eSKent Overstreet return 0; 282354e2264eSKent Overstreet } 282454e2264eSKent Overstreet 282568a507a2SKent Overstreet int bch2_truncate(struct mnt_idmap *idmap, 282668a507a2SKent Overstreet struct bch_inode_info *inode, struct iattr *iattr) 28271c6fdbd8SKent Overstreet { 28281c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 28291c6fdbd8SKent Overstreet struct address_space *mapping = inode->v.i_mapping; 2830e0541a93SKent Overstreet struct bch_inode_unpacked inode_u; 283154e2264eSKent Overstreet u64 new_i_size = iattr->ia_size; 28322e87eae1SKent Overstreet s64 i_sectors_delta = 0; 28331c6fdbd8SKent Overstreet int ret = 0; 28341c6fdbd8SKent Overstreet 283568a507a2SKent Overstreet /* 283678d66ab1SDan Robertson * If the truncate call with change the size of the file, the 283778d66ab1SDan Robertson * cmtimes should be updated. If the size will not change, we 283878d66ab1SDan Robertson * do not need to update the cmtimes. 283968a507a2SKent Overstreet */ 284078d66ab1SDan Robertson if (iattr->ia_size != inode->v.i_size) { 284168a507a2SKent Overstreet if (!(iattr->ia_valid & ATTR_MTIME)) 284268a507a2SKent Overstreet ktime_get_coarse_real_ts64(&iattr->ia_mtime); 284368a507a2SKent Overstreet if (!(iattr->ia_valid & ATTR_CTIME)) 284468a507a2SKent Overstreet ktime_get_coarse_real_ts64(&iattr->ia_ctime); 284568a507a2SKent Overstreet iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; 284678d66ab1SDan Robertson } 284768a507a2SKent Overstreet 28481c6fdbd8SKent Overstreet inode_dio_wait(&inode->v); 2849a7ecd30cSKent Overstreet bch2_pagecache_block_get(inode); 28501c6fdbd8SKent Overstreet 28516fed42bbSKent Overstreet ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); 2852e0541a93SKent Overstreet if (ret) 2853e0541a93SKent Overstreet goto err; 28541c6fdbd8SKent Overstreet 2855c45d473dSKent Overstreet /* 2856c45d473dSKent Overstreet * check this before next assertion; on filesystem error our normal 2857c45d473dSKent Overstreet * invariants are a bit broken (truncate has to truncate the page cache 2858c45d473dSKent Overstreet * before the inode). 2859c45d473dSKent Overstreet */ 2860c45d473dSKent Overstreet ret = bch2_journal_error(&c->journal); 2861c45d473dSKent Overstreet if (ret) 2862c45d473dSKent Overstreet goto err; 2863c45d473dSKent Overstreet 28648eb71e9eSKent Overstreet WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && 28658eb71e9eSKent Overstreet inode->v.i_size < inode_u.bi_size, 28668eb71e9eSKent Overstreet "truncate spotted in mem i_size < btree i_size: %llu < %llu\n", 28678eb71e9eSKent Overstreet (u64) inode->v.i_size, inode_u.bi_size); 2868e0541a93SKent Overstreet 2869e0541a93SKent Overstreet if (iattr->ia_size > inode->v.i_size) { 287068a507a2SKent Overstreet ret = bch2_extend(idmap, inode, &inode_u, iattr); 287154e2264eSKent Overstreet goto err; 28721c6fdbd8SKent Overstreet } 28731c6fdbd8SKent Overstreet 287468a507a2SKent Overstreet iattr->ia_valid &= ~ATTR_SIZE; 287568a507a2SKent Overstreet 28761c6fdbd8SKent Overstreet ret = bch2_truncate_page(inode, iattr->ia_size); 287774163da7SKent Overstreet if (unlikely(ret < 0)) 287854e2264eSKent Overstreet goto err; 28791c6fdbd8SKent Overstreet 28806cc3535dSKent Overstreet /* 28816cc3535dSKent Overstreet * When extending, we're going to write the new i_size to disk 28826cc3535dSKent Overstreet * immediately so we need to flush anything above the current on disk 28836cc3535dSKent Overstreet * i_size first: 28846cc3535dSKent Overstreet * 28856cc3535dSKent Overstreet * Also, when extending we need to flush the page that i_size currently 28866cc3535dSKent Overstreet * straddles - if it's mapped to userspace, we need to ensure that 28876cc3535dSKent Overstreet * userspace has to redirty it and call .mkwrite -> set_page_dirty 28886cc3535dSKent Overstreet * again to allocate the part of the page that was extended. 28896cc3535dSKent Overstreet */ 2890e0541a93SKent Overstreet if (iattr->ia_size > inode_u.bi_size) 28911c6fdbd8SKent Overstreet ret = filemap_write_and_wait_range(mapping, 2892e0541a93SKent Overstreet inode_u.bi_size, 28931c6fdbd8SKent Overstreet iattr->ia_size - 1); 28941c6fdbd8SKent Overstreet else if (iattr->ia_size & (PAGE_SIZE - 1)) 28951c6fdbd8SKent Overstreet ret = filemap_write_and_wait_range(mapping, 28961c6fdbd8SKent Overstreet round_down(iattr->ia_size, PAGE_SIZE), 28971c6fdbd8SKent Overstreet iattr->ia_size - 1); 28981c6fdbd8SKent Overstreet if (ret) 289954e2264eSKent Overstreet goto err; 29001c6fdbd8SKent Overstreet 290154e2264eSKent Overstreet mutex_lock(&inode->ei_update_lock); 290254e2264eSKent Overstreet ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, 290354e2264eSKent Overstreet &new_i_size, 0); 290454e2264eSKent Overstreet mutex_unlock(&inode->ei_update_lock); 29051c6fdbd8SKent Overstreet 29061c6fdbd8SKent Overstreet if (unlikely(ret)) 290754e2264eSKent Overstreet goto err; 29081c6fdbd8SKent Overstreet 29091c6fdbd8SKent Overstreet truncate_setsize(&inode->v, iattr->ia_size); 29101c6fdbd8SKent Overstreet 29118c6d298aSKent Overstreet ret = bch2_fpunch(c, inode_inum(inode), 2912a99b1cafSKent Overstreet round_up(iattr->ia_size, block_bytes(c)) >> 9, 291368a2054dSKent Overstreet U64_MAX, &i_sectors_delta); 29142e87eae1SKent Overstreet i_sectors_acct(c, inode, NULL, i_sectors_delta); 29152e87eae1SKent Overstreet 2916b33bf1bcSKent Overstreet bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && 2917b33bf1bcSKent Overstreet !bch2_journal_error(&c->journal), c, 2918b33bf1bcSKent Overstreet "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", 2919b33bf1bcSKent Overstreet inode->v.i_ino, (u64) inode->v.i_blocks, 2920b33bf1bcSKent Overstreet inode->ei_inode.bi_sectors); 29211c6fdbd8SKent Overstreet if (unlikely(ret)) 292254e2264eSKent Overstreet goto err; 29231c6fdbd8SKent Overstreet 292454e2264eSKent Overstreet mutex_lock(&inode->ei_update_lock); 292568a507a2SKent Overstreet ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0); 292654e2264eSKent Overstreet mutex_unlock(&inode->ei_update_lock); 292768a507a2SKent Overstreet 292868a507a2SKent Overstreet ret = bch2_setattr_nonsize(idmap, inode, iattr); 292954e2264eSKent Overstreet err: 2930a7ecd30cSKent Overstreet bch2_pagecache_block_put(inode); 29315c1ef830SKent Overstreet return bch2_err_class(ret); 29321c6fdbd8SKent Overstreet } 29331c6fdbd8SKent Overstreet 29341c6fdbd8SKent Overstreet /* fallocate: */ 29351c6fdbd8SKent Overstreet 2936050197b1SKent Overstreet static int inode_update_times_fn(struct bch_inode_info *inode, 2937050197b1SKent Overstreet struct bch_inode_unpacked *bi, void *p) 2938050197b1SKent Overstreet { 2939050197b1SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 2940050197b1SKent Overstreet 2941050197b1SKent Overstreet bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); 2942050197b1SKent Overstreet return 0; 2943050197b1SKent Overstreet } 2944050197b1SKent Overstreet 29452e87eae1SKent Overstreet static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) 29461c6fdbd8SKent Overstreet { 29471c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 294874163da7SKent Overstreet u64 end = offset + len; 294974163da7SKent Overstreet u64 block_start = round_up(offset, block_bytes(c)); 295074163da7SKent Overstreet u64 block_end = round_down(end, block_bytes(c)); 295174163da7SKent Overstreet bool truncated_last_page; 29521c6fdbd8SKent Overstreet int ret = 0; 29531c6fdbd8SKent Overstreet 295474163da7SKent Overstreet ret = bch2_truncate_pages(inode, offset, end); 295574163da7SKent Overstreet if (unlikely(ret < 0)) 29561c6fdbd8SKent Overstreet goto err; 29571c6fdbd8SKent Overstreet 295874163da7SKent Overstreet truncated_last_page = ret; 29591c6fdbd8SKent Overstreet 296074163da7SKent Overstreet truncate_pagecache_range(&inode->v, offset, end - 1); 29611c6fdbd8SKent Overstreet 296274163da7SKent Overstreet if (block_start < block_end) { 29632e87eae1SKent Overstreet s64 i_sectors_delta = 0; 29642e87eae1SKent Overstreet 29658c6d298aSKent Overstreet ret = bch2_fpunch(c, inode_inum(inode), 296674163da7SKent Overstreet block_start >> 9, block_end >> 9, 29672e87eae1SKent Overstreet &i_sectors_delta); 29682e87eae1SKent Overstreet i_sectors_acct(c, inode, NULL, i_sectors_delta); 29692e87eae1SKent Overstreet } 2970050197b1SKent Overstreet 2971050197b1SKent Overstreet mutex_lock(&inode->ei_update_lock); 297274163da7SKent Overstreet if (end >= inode->v.i_size && !truncated_last_page) { 297374163da7SKent Overstreet ret = bch2_write_inode_size(c, inode, inode->v.i_size, 297474163da7SKent Overstreet ATTR_MTIME|ATTR_CTIME); 297574163da7SKent Overstreet } else { 2976050197b1SKent Overstreet ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, 297774163da7SKent Overstreet ATTR_MTIME|ATTR_CTIME); 297874163da7SKent Overstreet } 2979050197b1SKent Overstreet mutex_unlock(&inode->ei_update_lock); 29801c6fdbd8SKent Overstreet err: 29811c6fdbd8SKent Overstreet return ret; 29821c6fdbd8SKent Overstreet } 29831c6fdbd8SKent Overstreet 29842e87eae1SKent Overstreet static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, 29855f786787SKent Overstreet loff_t offset, loff_t len, 29865f786787SKent Overstreet bool insert) 29871c6fdbd8SKent Overstreet { 29881c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 29891c6fdbd8SKent Overstreet struct address_space *mapping = inode->v.i_mapping; 299007a1006aSKent Overstreet struct bkey_buf copy; 2991d69f41d6SKent Overstreet struct btree_trans trans; 299267e0dd8fSKent Overstreet struct btree_iter src, dst, del; 29935f786787SKent Overstreet loff_t shift, new_size; 29945f786787SKent Overstreet u64 src_start; 299550dc0f69SKent Overstreet int ret = 0; 29961c6fdbd8SKent Overstreet 29971c6fdbd8SKent Overstreet if ((offset | len) & (block_bytes(c) - 1)) 29981c6fdbd8SKent Overstreet return -EINVAL; 29991c6fdbd8SKent Overstreet 30005f786787SKent Overstreet if (insert) { 30015f786787SKent Overstreet if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) 300274163da7SKent Overstreet return -EFBIG; 30035f786787SKent Overstreet 30045f786787SKent Overstreet if (offset >= inode->v.i_size) 300574163da7SKent Overstreet return -EINVAL; 30065f786787SKent Overstreet 30075f786787SKent Overstreet src_start = U64_MAX; 30085f786787SKent Overstreet shift = len; 30095f786787SKent Overstreet } else { 30101c6fdbd8SKent Overstreet if (offset + len >= inode->v.i_size) 301174163da7SKent Overstreet return -EINVAL; 30121c6fdbd8SKent Overstreet 30135f786787SKent Overstreet src_start = offset + len; 30145f786787SKent Overstreet shift = -len; 30155f786787SKent Overstreet } 30161c6fdbd8SKent Overstreet 30175f786787SKent Overstreet new_size = inode->v.i_size + shift; 30181c6fdbd8SKent Overstreet 30191c6fdbd8SKent Overstreet ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); 30201c6fdbd8SKent Overstreet if (ret) 302174163da7SKent Overstreet return ret; 30221c6fdbd8SKent Overstreet 30235f786787SKent Overstreet if (insert) { 30245f786787SKent Overstreet i_size_write(&inode->v, new_size); 30255f786787SKent Overstreet mutex_lock(&inode->ei_update_lock); 30265f786787SKent Overstreet ret = bch2_write_inode_size(c, inode, new_size, 30275f786787SKent Overstreet ATTR_MTIME|ATTR_CTIME); 30285f786787SKent Overstreet mutex_unlock(&inode->ei_update_lock); 30295f786787SKent Overstreet } else { 30302e87eae1SKent Overstreet s64 i_sectors_delta = 0; 30312e87eae1SKent Overstreet 30328c6d298aSKent Overstreet ret = bch2_fpunch(c, inode_inum(inode), 30332e87eae1SKent Overstreet offset >> 9, (offset + len) >> 9, 30342e87eae1SKent Overstreet &i_sectors_delta); 30352e87eae1SKent Overstreet i_sectors_acct(c, inode, NULL, i_sectors_delta); 30362e87eae1SKent Overstreet 303763095894SKent Overstreet if (ret) 303874163da7SKent Overstreet return ret; 30395f786787SKent Overstreet } 30408ef231bdSKent Overstreet 304150dc0f69SKent Overstreet bch2_bkey_buf_init(©); 3042f7beb4caSKent Overstreet bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); 304367e0dd8fSKent Overstreet bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, 30445f786787SKent Overstreet POS(inode->v.i_ino, src_start >> 9), 304563095894SKent Overstreet BTREE_ITER_INTENT); 304667e0dd8fSKent Overstreet bch2_trans_copy_iter(&dst, &src); 304767e0dd8fSKent Overstreet bch2_trans_copy_iter(&del, &src); 30485f786787SKent Overstreet 3049549d173cSKent Overstreet while (ret == 0 || 3050549d173cSKent Overstreet bch2_err_matches(ret, BCH_ERR_transaction_restart)) { 305163095894SKent Overstreet struct disk_reservation disk_res = 305263095894SKent Overstreet bch2_disk_reservation_init(c, 0); 305363095894SKent Overstreet struct bkey_i delete; 305463095894SKent Overstreet struct bkey_s_c k; 305563095894SKent Overstreet struct bpos next_pos; 30565f786787SKent Overstreet struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); 30575f786787SKent Overstreet struct bpos atomic_end; 30582d594dfbSKent Overstreet unsigned trigger_flags = 0; 30596fed42bbSKent Overstreet u32 snapshot; 30606fed42bbSKent Overstreet 30616fed42bbSKent Overstreet bch2_trans_begin(&trans); 30626fed42bbSKent Overstreet 30636fed42bbSKent Overstreet ret = bch2_subvolume_get_snapshot(&trans, 30646fed42bbSKent Overstreet inode->ei_subvol, &snapshot); 30656fed42bbSKent Overstreet if (ret) 30666fed42bbSKent Overstreet continue; 30676fed42bbSKent Overstreet 30686fed42bbSKent Overstreet bch2_btree_iter_set_snapshot(&src, snapshot); 30696fed42bbSKent Overstreet bch2_btree_iter_set_snapshot(&dst, snapshot); 30706fed42bbSKent Overstreet bch2_btree_iter_set_snapshot(&del, snapshot); 307163095894SKent Overstreet 3072700c25b3SKent Overstreet bch2_trans_begin(&trans); 3073700c25b3SKent Overstreet 30745f786787SKent Overstreet k = insert 307567e0dd8fSKent Overstreet ? bch2_btree_iter_peek_prev(&src) 3076c72f687aSKent Overstreet : bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX)); 307763095894SKent Overstreet if ((ret = bkey_err(k))) 307850dc0f69SKent Overstreet continue; 307963095894SKent Overstreet 308063095894SKent Overstreet if (!k.k || k.k->p.inode != inode->v.i_ino) 308163095894SKent Overstreet break; 308263095894SKent Overstreet 30835f786787SKent Overstreet if (insert && 3084e88a75ebSKent Overstreet bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9))) 30855f786787SKent Overstreet break; 30865f786787SKent Overstreet reassemble: 308707a1006aSKent Overstreet bch2_bkey_buf_reassemble(©, c, k); 30885f786787SKent Overstreet 30895f786787SKent Overstreet if (insert && 3090e88a75ebSKent Overstreet bkey_lt(bkey_start_pos(k.k), move_pos)) 309135189e09SKent Overstreet bch2_cut_front(move_pos, copy.k); 30925f786787SKent Overstreet 309335189e09SKent Overstreet copy.k->k.p.offset += shift >> 9; 309467e0dd8fSKent Overstreet bch2_btree_iter_set_pos(&dst, bkey_start_pos(©.k->k)); 30951c6fdbd8SKent Overstreet 309667e0dd8fSKent Overstreet ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); 30973c7f3b7aSKent Overstreet if (ret) 309850dc0f69SKent Overstreet continue; 3099e2d9912cSKent Overstreet 3100e88a75ebSKent Overstreet if (!bkey_eq(atomic_end, copy.k->k.p)) { 31015f786787SKent Overstreet if (insert) { 31025f786787SKent Overstreet move_pos = atomic_end; 31035f786787SKent Overstreet move_pos.offset -= shift >> 9; 31045f786787SKent Overstreet goto reassemble; 31055f786787SKent Overstreet } else { 3106085ab693SKent Overstreet bch2_cut_back(atomic_end, copy.k); 31075f786787SKent Overstreet } 31085f786787SKent Overstreet } 31095f786787SKent Overstreet 311063095894SKent Overstreet bkey_init(&delete.k); 3111283eda57SKent Overstreet delete.k.p = copy.k->k.p; 3112283eda57SKent Overstreet delete.k.size = copy.k->k.size; 3113283eda57SKent Overstreet delete.k.p.offset -= shift >> 9; 311467e0dd8fSKent Overstreet bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); 31151c6fdbd8SKent Overstreet 31165f786787SKent Overstreet next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; 311763095894SKent Overstreet 31187c4ca54aSKent Overstreet if (copy.k->k.size != k.k->size) { 311963095894SKent Overstreet /* We might end up splitting compressed extents: */ 312063095894SKent Overstreet unsigned nr_ptrs = 31214de77495SKent Overstreet bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); 312263095894SKent Overstreet 312363095894SKent Overstreet ret = bch2_disk_reservation_get(c, &disk_res, 312435189e09SKent Overstreet copy.k->k.size, nr_ptrs, 31251c6fdbd8SKent Overstreet BCH_DISK_RESERVATION_NOFAIL); 31261c6fdbd8SKent Overstreet BUG_ON(ret); 312763095894SKent Overstreet } 31281c6fdbd8SKent Overstreet 312967e0dd8fSKent Overstreet ret = bch2_btree_iter_traverse(&del) ?: 313067e0dd8fSKent Overstreet bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: 313167e0dd8fSKent Overstreet bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: 313268a2054dSKent Overstreet bch2_trans_commit(&trans, &disk_res, NULL, 31332d594dfbSKent Overstreet BTREE_INSERT_NOFAIL); 31341c6fdbd8SKent Overstreet bch2_disk_reservation_put(c, &disk_res); 313550dc0f69SKent Overstreet 313663095894SKent Overstreet if (!ret) 313767e0dd8fSKent Overstreet bch2_btree_iter_set_pos(&src, next_pos); 313850dc0f69SKent Overstreet } 313967e0dd8fSKent Overstreet bch2_trans_iter_exit(&trans, &del); 314067e0dd8fSKent Overstreet bch2_trans_iter_exit(&trans, &dst); 314167e0dd8fSKent Overstreet bch2_trans_iter_exit(&trans, &src); 314250dc0f69SKent Overstreet bch2_trans_exit(&trans); 314350dc0f69SKent Overstreet bch2_bkey_buf_exit(©, c); 314463095894SKent Overstreet 31458ef231bdSKent Overstreet if (ret) 314674163da7SKent Overstreet return ret; 31471c6fdbd8SKent Overstreet 314874163da7SKent Overstreet mutex_lock(&inode->ei_update_lock); 31495f786787SKent Overstreet if (!insert) { 31508ef231bdSKent Overstreet i_size_write(&inode->v, new_size); 31518ef231bdSKent Overstreet ret = bch2_write_inode_size(c, inode, new_size, 31528ef231bdSKent Overstreet ATTR_MTIME|ATTR_CTIME); 315374163da7SKent Overstreet } else { 315474163da7SKent Overstreet /* We need an inode update to update bi_journal_seq for fsync: */ 315574163da7SKent Overstreet ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, 315674163da7SKent Overstreet ATTR_MTIME|ATTR_CTIME); 31575f786787SKent Overstreet } 315874163da7SKent Overstreet mutex_unlock(&inode->ei_update_lock); 31591c6fdbd8SKent Overstreet return ret; 31601c6fdbd8SKent Overstreet } 31611c6fdbd8SKent Overstreet 3162694015c2SKent Overstreet static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, 3163694015c2SKent Overstreet u64 start_sector, u64 end_sector) 31641c6fdbd8SKent Overstreet { 31651c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 3166190fa7afSKent Overstreet struct btree_trans trans; 316767e0dd8fSKent Overstreet struct btree_iter iter; 3168694015c2SKent Overstreet struct bpos end_pos = POS(inode->v.i_ino, end_sector); 316901ad6737SKent Overstreet struct bch_io_opts opts; 3170694015c2SKent Overstreet int ret = 0; 31711c6fdbd8SKent Overstreet 317201ad6737SKent Overstreet bch2_inode_opts_get(&opts, c, &inode->ei_inode); 3173f7beb4caSKent Overstreet bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); 31741c6fdbd8SKent Overstreet 317567e0dd8fSKent Overstreet bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, 3176694015c2SKent Overstreet POS(inode->v.i_ino, start_sector), 3177190fa7afSKent Overstreet BTREE_ITER_SLOTS|BTREE_ITER_INTENT); 31781c6fdbd8SKent Overstreet 3179e88a75ebSKent Overstreet while (!ret && bkey_lt(iter.pos, end_pos)) { 31802e87eae1SKent Overstreet s64 i_sectors_delta = 0; 3181190fa7afSKent Overstreet struct quota_res quota_res = { 0 }; 31821c6fdbd8SKent Overstreet struct bkey_s_c k; 3183694015c2SKent Overstreet unsigned sectors; 31846fed42bbSKent Overstreet u32 snapshot; 31851c6fdbd8SKent Overstreet 3186163e885aSKent Overstreet bch2_trans_begin(&trans); 3187a8abd3a7SKent Overstreet 31886fed42bbSKent Overstreet ret = bch2_subvolume_get_snapshot(&trans, 31896fed42bbSKent Overstreet inode->ei_subvol, &snapshot); 31906fed42bbSKent Overstreet if (ret) 31916fed42bbSKent Overstreet goto bkey_err; 31926fed42bbSKent Overstreet 31936fed42bbSKent Overstreet bch2_btree_iter_set_snapshot(&iter, snapshot); 31946fed42bbSKent Overstreet 319567e0dd8fSKent Overstreet k = bch2_btree_iter_peek_slot(&iter); 31960f238367SKent Overstreet if ((ret = bkey_err(k))) 31970f238367SKent Overstreet goto bkey_err; 31981c6fdbd8SKent Overstreet 31991c6fdbd8SKent Overstreet /* already reserved */ 320079203111SKent Overstreet if (bkey_extent_is_reservation(k) && 320179203111SKent Overstreet bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { 320267e0dd8fSKent Overstreet bch2_btree_iter_advance(&iter); 32031c6fdbd8SKent Overstreet continue; 32041c6fdbd8SKent Overstreet } 32051c6fdbd8SKent Overstreet 3206190fa7afSKent Overstreet if (bkey_extent_is_data(k.k) && 3207190fa7afSKent Overstreet !(mode & FALLOC_FL_ZERO_RANGE)) { 320867e0dd8fSKent Overstreet bch2_btree_iter_advance(&iter); 32091c6fdbd8SKent Overstreet continue; 32101c6fdbd8SKent Overstreet } 32111c6fdbd8SKent Overstreet 3212a8b3a677SKent Overstreet /* 3213a8b3a677SKent Overstreet * XXX: for nocow mode, we should promote shared extents to 3214a8b3a677SKent Overstreet * unshared here 3215a8b3a677SKent Overstreet */ 3216a8b3a677SKent Overstreet 321770de7a47SKent Overstreet sectors = bpos_min(k.k->p, end_pos).offset - iter.pos.offset; 32181c6fdbd8SKent Overstreet 32191c6fdbd8SKent Overstreet if (!bkey_extent_is_allocation(k.k)) { 32201c6fdbd8SKent Overstreet ret = bch2_quota_reservation_add(c, inode, 3221190fa7afSKent Overstreet "a_res, 32221c6fdbd8SKent Overstreet sectors, true); 32231c6fdbd8SKent Overstreet if (unlikely(ret)) 32240f238367SKent Overstreet goto bkey_err; 32251c6fdbd8SKent Overstreet } 32261c6fdbd8SKent Overstreet 322770de7a47SKent Overstreet ret = bch2_extent_fallocate(&trans, inode_inum(inode), &iter, 322870de7a47SKent Overstreet sectors, opts, &i_sectors_delta, 322970de7a47SKent Overstreet writepoint_hashed((unsigned long) current)); 32308810386fSKent Overstreet if (ret) 32318810386fSKent Overstreet goto bkey_err; 323270de7a47SKent Overstreet 32332e87eae1SKent Overstreet i_sectors_acct(c, inode, "a_res, i_sectors_delta); 32340f238367SKent Overstreet bkey_err: 3235190fa7afSKent Overstreet bch2_quota_reservation_put(c, inode, "a_res); 3236549d173cSKent Overstreet if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 32371c6fdbd8SKent Overstreet ret = 0; 323850dc0f69SKent Overstreet } 323974163da7SKent Overstreet 3240dcfc593fSKent Overstreet bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ 3241dcfc593fSKent Overstreet mark_pagecache_reserved(inode, start_sector, iter.pos.offset); 3242dcfc593fSKent Overstreet 3243098ef98dSKent Overstreet if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { 324474163da7SKent Overstreet struct quota_res quota_res = { 0 }; 324574163da7SKent Overstreet s64 i_sectors_delta = 0; 324674163da7SKent Overstreet 324774163da7SKent Overstreet bch2_fpunch_at(&trans, &iter, inode_inum(inode), 324874163da7SKent Overstreet end_sector, &i_sectors_delta); 324974163da7SKent Overstreet i_sectors_acct(c, inode, "a_res, i_sectors_delta); 325074163da7SKent Overstreet bch2_quota_reservation_put(c, inode, "a_res); 325174163da7SKent Overstreet } 325274163da7SKent Overstreet 325367e0dd8fSKent Overstreet bch2_trans_iter_exit(&trans, &iter); 3254694015c2SKent Overstreet bch2_trans_exit(&trans); 3255694015c2SKent Overstreet return ret; 3256694015c2SKent Overstreet } 325750dc0f69SKent Overstreet 3258694015c2SKent Overstreet static long bchfs_fallocate(struct bch_inode_info *inode, int mode, 3259694015c2SKent Overstreet loff_t offset, loff_t len) 3260694015c2SKent Overstreet { 3261694015c2SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 326274163da7SKent Overstreet u64 end = offset + len; 326374163da7SKent Overstreet u64 block_start = round_down(offset, block_bytes(c)); 326474163da7SKent Overstreet u64 block_end = round_up(end, block_bytes(c)); 326574163da7SKent Overstreet bool truncated_last_page = false; 326674163da7SKent Overstreet int ret, ret2 = 0; 3267694015c2SKent Overstreet 3268694015c2SKent Overstreet if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { 3269694015c2SKent Overstreet ret = inode_newsize_ok(&inode->v, end); 3270694015c2SKent Overstreet if (ret) 327174163da7SKent Overstreet return ret; 3272694015c2SKent Overstreet } 3273694015c2SKent Overstreet 3274694015c2SKent Overstreet if (mode & FALLOC_FL_ZERO_RANGE) { 327574163da7SKent Overstreet ret = bch2_truncate_pages(inode, offset, end); 327674163da7SKent Overstreet if (unlikely(ret < 0)) 327774163da7SKent Overstreet return ret; 3278694015c2SKent Overstreet 327974163da7SKent Overstreet truncated_last_page = ret; 3280694015c2SKent Overstreet 3281694015c2SKent Overstreet truncate_pagecache_range(&inode->v, offset, end - 1); 328274163da7SKent Overstreet 328374163da7SKent Overstreet block_start = round_up(offset, block_bytes(c)); 328474163da7SKent Overstreet block_end = round_down(end, block_bytes(c)); 3285694015c2SKent Overstreet } 3286694015c2SKent Overstreet 3287694015c2SKent Overstreet ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); 3288e0541a93SKent Overstreet 3289e0541a93SKent Overstreet /* 329074163da7SKent Overstreet * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, 329174163da7SKent Overstreet * so that the VFS cache i_size is consistent with the btree i_size: 3292e0541a93SKent Overstreet */ 329374163da7SKent Overstreet if (ret && 3294098ef98dSKent Overstreet !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) 329574163da7SKent Overstreet return ret; 32961c6fdbd8SKent Overstreet 329774163da7SKent Overstreet if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) 3298e0541a93SKent Overstreet end = inode->v.i_size; 329974163da7SKent Overstreet 330074163da7SKent Overstreet if (end >= inode->v.i_size && 330174163da7SKent Overstreet (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || 330274163da7SKent Overstreet !(mode & FALLOC_FL_KEEP_SIZE))) { 330374163da7SKent Overstreet spin_lock(&inode->v.i_lock); 3304e0541a93SKent Overstreet i_size_write(&inode->v, end); 330574163da7SKent Overstreet spin_unlock(&inode->v.i_lock); 3306e0541a93SKent Overstreet 33071c6fdbd8SKent Overstreet mutex_lock(&inode->ei_update_lock); 330874163da7SKent Overstreet ret2 = bch2_write_inode_size(c, inode, end, 0); 33091c6fdbd8SKent Overstreet mutex_unlock(&inode->ei_update_lock); 33101c6fdbd8SKent Overstreet } 331174163da7SKent Overstreet 331274163da7SKent Overstreet return ret ?: ret2; 33131c6fdbd8SKent Overstreet } 33141c6fdbd8SKent Overstreet 33151c6fdbd8SKent Overstreet long bch2_fallocate_dispatch(struct file *file, int mode, 33161c6fdbd8SKent Overstreet loff_t offset, loff_t len) 33171c6fdbd8SKent Overstreet { 33181c6fdbd8SKent Overstreet struct bch_inode_info *inode = file_bch_inode(file); 33192a9101a9SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 33202a9101a9SKent Overstreet long ret; 33212a9101a9SKent Overstreet 3322d94189adSKent Overstreet if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) 33232a9101a9SKent Overstreet return -EROFS; 33241c6fdbd8SKent Overstreet 332574163da7SKent Overstreet inode_lock(&inode->v); 332674163da7SKent Overstreet inode_dio_wait(&inode->v); 3327a7ecd30cSKent Overstreet bch2_pagecache_block_get(inode); 332874163da7SKent Overstreet 332907bfcc0bSKent Overstreet ret = file_modified(file); 333007bfcc0bSKent Overstreet if (ret) 333107bfcc0bSKent Overstreet goto err; 333207bfcc0bSKent Overstreet 33331c6fdbd8SKent Overstreet if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) 33342a9101a9SKent Overstreet ret = bchfs_fallocate(inode, mode, offset, len); 33352a9101a9SKent Overstreet else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) 33362a9101a9SKent Overstreet ret = bchfs_fpunch(inode, offset, len); 33372a9101a9SKent Overstreet else if (mode == FALLOC_FL_INSERT_RANGE) 33382a9101a9SKent Overstreet ret = bchfs_fcollapse_finsert(inode, offset, len, true); 33392a9101a9SKent Overstreet else if (mode == FALLOC_FL_COLLAPSE_RANGE) 33402a9101a9SKent Overstreet ret = bchfs_fcollapse_finsert(inode, offset, len, false); 33412a9101a9SKent Overstreet else 33422a9101a9SKent Overstreet ret = -EOPNOTSUPP; 334307bfcc0bSKent Overstreet err: 3344a7ecd30cSKent Overstreet bch2_pagecache_block_put(inode); 334574163da7SKent Overstreet inode_unlock(&inode->v); 3346d94189adSKent Overstreet bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); 33471c6fdbd8SKent Overstreet 33485c1ef830SKent Overstreet return bch2_err_class(ret); 33491c6fdbd8SKent Overstreet } 33501c6fdbd8SKent Overstreet 3351c72f687aSKent Overstreet /* 3352c72f687aSKent Overstreet * Take a quota reservation for unallocated blocks in a given file range 3353c72f687aSKent Overstreet * Does not check pagecache 3354c72f687aSKent Overstreet */ 3355e8540e56SKent Overstreet static int quota_reserve_range(struct bch_inode_info *inode, 3356e8540e56SKent Overstreet struct quota_res *res, 3357e8540e56SKent Overstreet u64 start, u64 end) 3358e8540e56SKent Overstreet { 3359e8540e56SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 3360e8540e56SKent Overstreet struct btree_trans trans; 3361e8540e56SKent Overstreet struct btree_iter iter; 3362e8540e56SKent Overstreet struct bkey_s_c k; 3363e8540e56SKent Overstreet u32 snapshot; 3364e8540e56SKent Overstreet u64 sectors = end - start; 3365e8540e56SKent Overstreet u64 pos = start; 3366e8540e56SKent Overstreet int ret; 3367e8540e56SKent Overstreet 3368e8540e56SKent Overstreet bch2_trans_init(&trans, c, 0, 0); 3369e8540e56SKent Overstreet retry: 3370e8540e56SKent Overstreet bch2_trans_begin(&trans); 3371e8540e56SKent Overstreet 3372e8540e56SKent Overstreet ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); 3373e8540e56SKent Overstreet if (ret) 3374e8540e56SKent Overstreet goto err; 3375e8540e56SKent Overstreet 3376e8540e56SKent Overstreet bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, 3377e8540e56SKent Overstreet SPOS(inode->v.i_ino, pos, snapshot), 0); 3378e8540e56SKent Overstreet 3379e8540e56SKent Overstreet while (!(ret = btree_trans_too_many_iters(&trans)) && 3380e8540e56SKent Overstreet (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && 3381e8540e56SKent Overstreet !(ret = bkey_err(k))) { 3382e8540e56SKent Overstreet if (bkey_extent_is_allocation(k.k)) { 3383e8540e56SKent Overstreet u64 s = min(end, k.k->p.offset) - 3384e8540e56SKent Overstreet max(start, bkey_start_offset(k.k)); 3385e8540e56SKent Overstreet BUG_ON(s > sectors); 3386e8540e56SKent Overstreet sectors -= s; 3387e8540e56SKent Overstreet } 3388e8540e56SKent Overstreet bch2_btree_iter_advance(&iter); 3389e8540e56SKent Overstreet } 3390e8540e56SKent Overstreet pos = iter.pos.offset; 3391e8540e56SKent Overstreet bch2_trans_iter_exit(&trans, &iter); 3392e8540e56SKent Overstreet err: 3393e8540e56SKent Overstreet if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 3394e8540e56SKent Overstreet goto retry; 3395e8540e56SKent Overstreet 3396e8540e56SKent Overstreet bch2_trans_exit(&trans); 3397e8540e56SKent Overstreet 3398e8540e56SKent Overstreet if (ret) 3399e8540e56SKent Overstreet return ret; 3400e8540e56SKent Overstreet 3401e8540e56SKent Overstreet return bch2_quota_reservation_add(c, inode, res, sectors, true); 3402e8540e56SKent Overstreet } 3403e8540e56SKent Overstreet 340476426098SKent Overstreet loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, 340576426098SKent Overstreet struct file *file_dst, loff_t pos_dst, 340676426098SKent Overstreet loff_t len, unsigned remap_flags) 340776426098SKent Overstreet { 340876426098SKent Overstreet struct bch_inode_info *src = file_bch_inode(file_src); 340976426098SKent Overstreet struct bch_inode_info *dst = file_bch_inode(file_dst); 341076426098SKent Overstreet struct bch_fs *c = src->v.i_sb->s_fs_info; 3411e8540e56SKent Overstreet struct quota_res quota_res = { 0 }; 34122e87eae1SKent Overstreet s64 i_sectors_delta = 0; 3413677fc056SKent Overstreet u64 aligned_len; 341476426098SKent Overstreet loff_t ret = 0; 341576426098SKent Overstreet 341676426098SKent Overstreet if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) 341776426098SKent Overstreet return -EINVAL; 341876426098SKent Overstreet 341976426098SKent Overstreet if (remap_flags & REMAP_FILE_DEDUP) 342076426098SKent Overstreet return -EOPNOTSUPP; 342176426098SKent Overstreet 342276426098SKent Overstreet if ((pos_src & (block_bytes(c) - 1)) || 342376426098SKent Overstreet (pos_dst & (block_bytes(c) - 1))) 342476426098SKent Overstreet return -EINVAL; 342576426098SKent Overstreet 342676426098SKent Overstreet if (src == dst && 342776426098SKent Overstreet abs(pos_src - pos_dst) < len) 342876426098SKent Overstreet return -EINVAL; 342976426098SKent Overstreet 343076426098SKent Overstreet bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); 343176426098SKent Overstreet 343276426098SKent Overstreet inode_dio_wait(&src->v); 343376426098SKent Overstreet inode_dio_wait(&dst->v); 343476426098SKent Overstreet 343576426098SKent Overstreet ret = generic_remap_file_range_prep(file_src, pos_src, 343676426098SKent Overstreet file_dst, pos_dst, 343776426098SKent Overstreet &len, remap_flags); 343876426098SKent Overstreet if (ret < 0 || len == 0) 34392e87eae1SKent Overstreet goto err; 344076426098SKent Overstreet 3441677fc056SKent Overstreet aligned_len = round_up((u64) len, block_bytes(c)); 344276426098SKent Overstreet 344376426098SKent Overstreet ret = write_invalidate_inode_pages_range(dst->v.i_mapping, 3444677fc056SKent Overstreet pos_dst, pos_dst + len - 1); 344576426098SKent Overstreet if (ret) 34462e87eae1SKent Overstreet goto err; 344776426098SKent Overstreet 3448e8540e56SKent Overstreet ret = quota_reserve_range(dst, "a_res, pos_dst >> 9, 3449e8540e56SKent Overstreet (pos_dst + aligned_len) >> 9); 3450e8540e56SKent Overstreet if (ret) 3451e8540e56SKent Overstreet goto err; 3452e8540e56SKent Overstreet 3453e8540e56SKent Overstreet file_update_time(file_dst); 3454e8540e56SKent Overstreet 3455dcfc593fSKent Overstreet mark_pagecache_unallocated(src, pos_src >> 9, 3456dcfc593fSKent Overstreet (pos_src + aligned_len) >> 9); 345776426098SKent Overstreet 34582e87eae1SKent Overstreet ret = bch2_remap_range(c, 34596fed42bbSKent Overstreet inode_inum(dst), pos_dst >> 9, 34606fed42bbSKent Overstreet inode_inum(src), pos_src >> 9, 346176426098SKent Overstreet aligned_len >> 9, 34622e87eae1SKent Overstreet pos_dst + len, &i_sectors_delta); 34632e87eae1SKent Overstreet if (ret < 0) 34642e87eae1SKent Overstreet goto err; 346576426098SKent Overstreet 34662e87eae1SKent Overstreet /* 34672e87eae1SKent Overstreet * due to alignment, we might have remapped slightly more than requsted 34682e87eae1SKent Overstreet */ 3469677fc056SKent Overstreet ret = min((u64) ret << 9, (u64) len); 34702e87eae1SKent Overstreet 3471e8540e56SKent Overstreet i_sectors_acct(c, dst, "a_res, i_sectors_delta); 34722e87eae1SKent Overstreet 34732e87eae1SKent Overstreet spin_lock(&dst->v.i_lock); 3474677fc056SKent Overstreet if (pos_dst + ret > dst->v.i_size) 3475677fc056SKent Overstreet i_size_write(&dst->v, pos_dst + ret); 34762e87eae1SKent Overstreet spin_unlock(&dst->v.i_lock); 3477e7084c9cSKent Overstreet 347868a2054dSKent Overstreet if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || 347968a2054dSKent Overstreet IS_SYNC(file_inode(file_dst))) 3480a8b3a677SKent Overstreet ret = bch2_flush_inode(c, dst); 34812e87eae1SKent Overstreet err: 3482e8540e56SKent Overstreet bch2_quota_reservation_put(c, dst, "a_res); 348376426098SKent Overstreet bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); 348476426098SKent Overstreet 34855c1ef830SKent Overstreet return bch2_err_class(ret); 348676426098SKent Overstreet } 348776426098SKent Overstreet 34881c6fdbd8SKent Overstreet /* fseek: */ 34891c6fdbd8SKent Overstreet 3490543ef2ebSKent Overstreet static int folio_data_offset(struct folio *folio, unsigned offset) 34911c6fdbd8SKent Overstreet { 3492*30bff594SKent Overstreet struct bch_folio *s = bch2_folio(folio); 3493f57a6a5dSKent Overstreet unsigned i; 3494f81b648dSKent Overstreet 3495543ef2ebSKent Overstreet if (s) 3496543ef2ebSKent Overstreet for (i = offset >> 9; i < PAGE_SECTORS; i++) 3497f57a6a5dSKent Overstreet if (s->s[i].state >= SECTOR_DIRTY) 3498543ef2ebSKent Overstreet return i << 9; 3499f57a6a5dSKent Overstreet 3500543ef2ebSKent Overstreet return -1; 35011c6fdbd8SKent Overstreet } 35021c6fdbd8SKent Overstreet 3503543ef2ebSKent Overstreet static loff_t bch2_seek_pagecache_data(struct inode *vinode, 35041c6fdbd8SKent Overstreet loff_t start_offset, 35051c6fdbd8SKent Overstreet loff_t end_offset) 35061c6fdbd8SKent Overstreet { 35071c6fdbd8SKent Overstreet struct folio_batch fbatch; 35081c6fdbd8SKent Overstreet pgoff_t start_index = start_offset >> PAGE_SHIFT; 35091c6fdbd8SKent Overstreet pgoff_t end_index = end_offset >> PAGE_SHIFT; 35101c6fdbd8SKent Overstreet pgoff_t index = start_index; 35111c6fdbd8SKent Overstreet unsigned i; 3512543ef2ebSKent Overstreet loff_t ret; 3513543ef2ebSKent Overstreet int offset; 35141c6fdbd8SKent Overstreet 35151c6fdbd8SKent Overstreet folio_batch_init(&fbatch); 35161c6fdbd8SKent Overstreet 35171c6fdbd8SKent Overstreet while (filemap_get_folios(vinode->i_mapping, 35181c6fdbd8SKent Overstreet &index, end_index, &fbatch)) { 35191c6fdbd8SKent Overstreet for (i = 0; i < folio_batch_count(&fbatch); i++) { 35201c6fdbd8SKent Overstreet struct folio *folio = fbatch.folios[i]; 35211c6fdbd8SKent Overstreet 35221c6fdbd8SKent Overstreet folio_lock(folio); 3523543ef2ebSKent Overstreet offset = folio_data_offset(folio, 3524543ef2ebSKent Overstreet folio->index == start_index 3525543ef2ebSKent Overstreet ? start_offset & (PAGE_SIZE - 1) 3526543ef2ebSKent Overstreet : 0); 3527543ef2ebSKent Overstreet if (offset >= 0) { 3528543ef2ebSKent Overstreet ret = clamp(((loff_t) folio->index << PAGE_SHIFT) + 3529543ef2ebSKent Overstreet offset, 3530543ef2ebSKent Overstreet start_offset, end_offset); 35311c6fdbd8SKent Overstreet folio_unlock(folio); 35321c6fdbd8SKent Overstreet folio_batch_release(&fbatch); 3533543ef2ebSKent Overstreet return ret; 35341c6fdbd8SKent Overstreet } 35351c6fdbd8SKent Overstreet folio_unlock(folio); 35361c6fdbd8SKent Overstreet } 35371c6fdbd8SKent Overstreet folio_batch_release(&fbatch); 35381c6fdbd8SKent Overstreet cond_resched(); 35391c6fdbd8SKent Overstreet } 35401c6fdbd8SKent Overstreet 35411c6fdbd8SKent Overstreet return end_offset; 35421c6fdbd8SKent Overstreet } 35431c6fdbd8SKent Overstreet 35441c6fdbd8SKent Overstreet static loff_t bch2_seek_data(struct file *file, u64 offset) 35451c6fdbd8SKent Overstreet { 35461c6fdbd8SKent Overstreet struct bch_inode_info *inode = file_bch_inode(file); 35471c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 3548424eb881SKent Overstreet struct btree_trans trans; 354967e0dd8fSKent Overstreet struct btree_iter iter; 35501c6fdbd8SKent Overstreet struct bkey_s_c k; 35516fed42bbSKent Overstreet subvol_inum inum = inode_inum(inode); 35521c6fdbd8SKent Overstreet u64 isize, next_data = MAX_LFS_FILESIZE; 35536fed42bbSKent Overstreet u32 snapshot; 35541c6fdbd8SKent Overstreet int ret; 35551c6fdbd8SKent Overstreet 35561c6fdbd8SKent Overstreet isize = i_size_read(&inode->v); 35571c6fdbd8SKent Overstreet if (offset >= isize) 35581c6fdbd8SKent Overstreet return -ENXIO; 35591c6fdbd8SKent Overstreet 356020bceecbSKent Overstreet bch2_trans_init(&trans, c, 0, 0); 35616fed42bbSKent Overstreet retry: 35626fed42bbSKent Overstreet bch2_trans_begin(&trans); 35636fed42bbSKent Overstreet 35646fed42bbSKent Overstreet ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); 35656fed42bbSKent Overstreet if (ret) 35666fed42bbSKent Overstreet goto err; 3567424eb881SKent Overstreet 3568c72f687aSKent Overstreet for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, 3569c72f687aSKent Overstreet SPOS(inode->v.i_ino, offset >> 9, snapshot), 3570c72f687aSKent Overstreet POS(inode->v.i_ino, U64_MAX), 3571c72f687aSKent Overstreet 0, k, ret) { 3572c72f687aSKent Overstreet if (bkey_extent_is_data(k.k)) { 35731c6fdbd8SKent Overstreet next_data = max(offset, bkey_start_offset(k.k) << 9); 35741c6fdbd8SKent Overstreet break; 35751c6fdbd8SKent Overstreet } else if (k.k->p.offset >> 9 > isize) 35761c6fdbd8SKent Overstreet break; 35771c6fdbd8SKent Overstreet } 357867e0dd8fSKent Overstreet bch2_trans_iter_exit(&trans, &iter); 35796fed42bbSKent Overstreet err: 3580549d173cSKent Overstreet if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 35816fed42bbSKent Overstreet goto retry; 35821c6fdbd8SKent Overstreet 35839a796fdbSKent Overstreet bch2_trans_exit(&trans); 35841c6fdbd8SKent Overstreet if (ret) 35851c6fdbd8SKent Overstreet return ret; 35861c6fdbd8SKent Overstreet 35871c6fdbd8SKent Overstreet if (next_data > offset) 3588543ef2ebSKent Overstreet next_data = bch2_seek_pagecache_data(&inode->v, 35891c6fdbd8SKent Overstreet offset, next_data); 35901c6fdbd8SKent Overstreet 3591e10d3094SKent Overstreet if (next_data >= isize) 35921c6fdbd8SKent Overstreet return -ENXIO; 35931c6fdbd8SKent Overstreet 35941c6fdbd8SKent Overstreet return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); 35951c6fdbd8SKent Overstreet } 35961c6fdbd8SKent Overstreet 3597*30bff594SKent Overstreet static int __folio_hole_offset(struct folio *folio, unsigned offset) 35981c6fdbd8SKent Overstreet { 3599*30bff594SKent Overstreet struct bch_folio *s = bch2_folio(folio); 3600543ef2ebSKent Overstreet unsigned i; 3601543ef2ebSKent Overstreet 3602543ef2ebSKent Overstreet if (!s) 3603543ef2ebSKent Overstreet return 0; 3604543ef2ebSKent Overstreet 3605543ef2ebSKent Overstreet for (i = offset >> 9; i < PAGE_SECTORS; i++) 3606543ef2ebSKent Overstreet if (s->s[i].state < SECTOR_DIRTY) 3607543ef2ebSKent Overstreet return i << 9; 3608543ef2ebSKent Overstreet 3609543ef2ebSKent Overstreet return -1; 3610543ef2ebSKent Overstreet } 3611543ef2ebSKent Overstreet 3612*30bff594SKent Overstreet static loff_t folio_hole_offset(struct address_space *mapping, loff_t offset) 3613543ef2ebSKent Overstreet { 3614543ef2ebSKent Overstreet pgoff_t index = offset >> PAGE_SHIFT; 3615*30bff594SKent Overstreet struct folio *folio; 3616*30bff594SKent Overstreet int folio_offset; 3617543ef2ebSKent Overstreet loff_t ret = -1; 36181c6fdbd8SKent Overstreet 3619*30bff594SKent Overstreet folio = filemap_lock_folio(mapping, index); 3620*30bff594SKent Overstreet if (!folio) 3621543ef2ebSKent Overstreet return offset; 36221c6fdbd8SKent Overstreet 3623*30bff594SKent Overstreet folio_offset = __folio_hole_offset(folio, offset & (folio_size(folio) - 1)); 3624*30bff594SKent Overstreet if (folio_offset >= 0) 3625*30bff594SKent Overstreet ret = folio_pos(folio) + folio_offset; 3626*30bff594SKent Overstreet folio_unlock(folio); 36271c6fdbd8SKent Overstreet 36281c6fdbd8SKent Overstreet return ret; 36291c6fdbd8SKent Overstreet } 36301c6fdbd8SKent Overstreet 3631543ef2ebSKent Overstreet static loff_t bch2_seek_pagecache_hole(struct inode *vinode, 36321c6fdbd8SKent Overstreet loff_t start_offset, 36331c6fdbd8SKent Overstreet loff_t end_offset) 36341c6fdbd8SKent Overstreet { 36351c6fdbd8SKent Overstreet struct address_space *mapping = vinode->i_mapping; 3636543ef2ebSKent Overstreet loff_t offset = start_offset, hole; 36371c6fdbd8SKent Overstreet 3638543ef2ebSKent Overstreet while (offset < end_offset) { 3639*30bff594SKent Overstreet hole = folio_hole_offset(mapping, offset); 3640543ef2ebSKent Overstreet if (hole >= 0 && hole <= end_offset) 3641543ef2ebSKent Overstreet return max(start_offset, hole); 3642543ef2ebSKent Overstreet 3643543ef2ebSKent Overstreet offset += PAGE_SIZE; 3644543ef2ebSKent Overstreet offset &= PAGE_MASK; 3645543ef2ebSKent Overstreet } 36461c6fdbd8SKent Overstreet 36471c6fdbd8SKent Overstreet return end_offset; 36481c6fdbd8SKent Overstreet } 36491c6fdbd8SKent Overstreet 36501c6fdbd8SKent Overstreet static loff_t bch2_seek_hole(struct file *file, u64 offset) 36511c6fdbd8SKent Overstreet { 36521c6fdbd8SKent Overstreet struct bch_inode_info *inode = file_bch_inode(file); 36531c6fdbd8SKent Overstreet struct bch_fs *c = inode->v.i_sb->s_fs_info; 3654424eb881SKent Overstreet struct btree_trans trans; 365567e0dd8fSKent Overstreet struct btree_iter iter; 36561c6fdbd8SKent Overstreet struct bkey_s_c k; 36576fed42bbSKent Overstreet subvol_inum inum = inode_inum(inode); 36581c6fdbd8SKent Overstreet u64 isize, next_hole = MAX_LFS_FILESIZE; 36596fed42bbSKent Overstreet u32 snapshot; 36601c6fdbd8SKent Overstreet int ret; 36611c6fdbd8SKent Overstreet 36621c6fdbd8SKent Overstreet isize = i_size_read(&inode->v); 36631c6fdbd8SKent Overstreet if (offset >= isize) 36641c6fdbd8SKent Overstreet return -ENXIO; 36651c6fdbd8SKent Overstreet 366620bceecbSKent Overstreet bch2_trans_init(&trans, c, 0, 0); 36676fed42bbSKent Overstreet retry: 36686fed42bbSKent Overstreet bch2_trans_begin(&trans); 36696fed42bbSKent Overstreet 36706fed42bbSKent Overstreet ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); 36716fed42bbSKent Overstreet if (ret) 36726fed42bbSKent Overstreet goto err; 3673424eb881SKent Overstreet 3674e5fa91d7SKent Overstreet for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, 36756fed42bbSKent Overstreet SPOS(inode->v.i_ino, offset >> 9, snapshot), 367694f651e2SKent Overstreet BTREE_ITER_SLOTS, k, ret) { 36771c6fdbd8SKent Overstreet if (k.k->p.inode != inode->v.i_ino) { 3678543ef2ebSKent Overstreet next_hole = bch2_seek_pagecache_hole(&inode->v, 36791c6fdbd8SKent Overstreet offset, MAX_LFS_FILESIZE); 36801c6fdbd8SKent Overstreet break; 36811c6fdbd8SKent Overstreet } else if (!bkey_extent_is_data(k.k)) { 3682543ef2ebSKent Overstreet next_hole = bch2_seek_pagecache_hole(&inode->v, 36831c6fdbd8SKent Overstreet max(offset, bkey_start_offset(k.k) << 9), 36841c6fdbd8SKent Overstreet k.k->p.offset << 9); 36851c6fdbd8SKent Overstreet 36861c6fdbd8SKent Overstreet if (next_hole < k.k->p.offset << 9) 36871c6fdbd8SKent Overstreet break; 36881c6fdbd8SKent Overstreet } else { 36891c6fdbd8SKent Overstreet offset = max(offset, bkey_start_offset(k.k) << 9); 36901c6fdbd8SKent Overstreet } 36911c6fdbd8SKent Overstreet } 369267e0dd8fSKent Overstreet bch2_trans_iter_exit(&trans, &iter); 36936fed42bbSKent Overstreet err: 3694549d173cSKent Overstreet if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 36956fed42bbSKent Overstreet goto retry; 36961c6fdbd8SKent Overstreet 36979a796fdbSKent Overstreet bch2_trans_exit(&trans); 36981c6fdbd8SKent Overstreet if (ret) 36991c6fdbd8SKent Overstreet return ret; 37001c6fdbd8SKent Overstreet 37011c6fdbd8SKent Overstreet if (next_hole > isize) 37021c6fdbd8SKent Overstreet next_hole = isize; 37031c6fdbd8SKent Overstreet 37041c6fdbd8SKent Overstreet return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); 37051c6fdbd8SKent Overstreet } 37061c6fdbd8SKent Overstreet 37071c6fdbd8SKent Overstreet loff_t bch2_llseek(struct file *file, loff_t offset, int whence) 37081c6fdbd8SKent Overstreet { 37095c1ef830SKent Overstreet loff_t ret; 37105c1ef830SKent Overstreet 37111c6fdbd8SKent Overstreet switch (whence) { 37121c6fdbd8SKent Overstreet case SEEK_SET: 37131c6fdbd8SKent Overstreet case SEEK_CUR: 37141c6fdbd8SKent Overstreet case SEEK_END: 37155c1ef830SKent Overstreet ret = generic_file_llseek(file, offset, whence); 37165c1ef830SKent Overstreet break; 37171c6fdbd8SKent Overstreet case SEEK_DATA: 37185c1ef830SKent Overstreet ret = bch2_seek_data(file, offset); 37195c1ef830SKent Overstreet break; 37201c6fdbd8SKent Overstreet case SEEK_HOLE: 37215c1ef830SKent Overstreet ret = bch2_seek_hole(file, offset); 37225c1ef830SKent Overstreet break; 37235c1ef830SKent Overstreet default: 37245c1ef830SKent Overstreet ret = -EINVAL; 37255c1ef830SKent Overstreet break; 37261c6fdbd8SKent Overstreet } 37271c6fdbd8SKent Overstreet 37285c1ef830SKent Overstreet return bch2_err_class(ret); 37291c6fdbd8SKent Overstreet } 37301c6fdbd8SKent Overstreet 37311c6fdbd8SKent Overstreet void bch2_fs_fsio_exit(struct bch_fs *c) 37321c6fdbd8SKent Overstreet { 3733a8b3a677SKent Overstreet bioset_exit(&c->nocow_flush_bioset); 37341c6fdbd8SKent Overstreet bioset_exit(&c->dio_write_bioset); 37351c6fdbd8SKent Overstreet bioset_exit(&c->dio_read_bioset); 37361c6fdbd8SKent Overstreet bioset_exit(&c->writepage_bioset); 37371c6fdbd8SKent Overstreet } 37381c6fdbd8SKent Overstreet 37391c6fdbd8SKent Overstreet int bch2_fs_fsio_init(struct bch_fs *c) 37401c6fdbd8SKent Overstreet { 37411c6fdbd8SKent Overstreet int ret = 0; 37421c6fdbd8SKent Overstreet 37431c6fdbd8SKent Overstreet pr_verbose_init(c->opts, ""); 37441c6fdbd8SKent Overstreet 37451c6fdbd8SKent Overstreet if (bioset_init(&c->writepage_bioset, 37469a3df993SKent Overstreet 4, offsetof(struct bch_writepage_io, op.wbio.bio), 374765d48e35SKent Overstreet BIOSET_NEED_BVECS)) 374865d48e35SKent Overstreet return -BCH_ERR_ENOMEM_writepage_bioset_init; 374965d48e35SKent Overstreet 375065d48e35SKent Overstreet if (bioset_init(&c->dio_read_bioset, 37511c6fdbd8SKent Overstreet 4, offsetof(struct dio_read, rbio.bio), 375265d48e35SKent Overstreet BIOSET_NEED_BVECS)) 375365d48e35SKent Overstreet return -BCH_ERR_ENOMEM_dio_read_bioset_init; 375465d48e35SKent Overstreet 375565d48e35SKent Overstreet if (bioset_init(&c->dio_write_bioset, 37569a3df993SKent Overstreet 4, offsetof(struct dio_write, op.wbio.bio), 375765d48e35SKent Overstreet BIOSET_NEED_BVECS)) 375865d48e35SKent Overstreet return -BCH_ERR_ENOMEM_dio_write_bioset_init; 375965d48e35SKent Overstreet 376065d48e35SKent Overstreet if (bioset_init(&c->nocow_flush_bioset, 3761a8b3a677SKent Overstreet 1, offsetof(struct nocow_flush, bio), 0)) 376265d48e35SKent Overstreet return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; 37631c6fdbd8SKent Overstreet 37641c6fdbd8SKent Overstreet pr_verbose_init(c->opts, "ret %i", ret); 37651c6fdbd8SKent Overstreet return ret; 37661c6fdbd8SKent Overstreet } 37671c6fdbd8SKent Overstreet 37681c6fdbd8SKent Overstreet #endif /* NO_BCACHEFS_FS */ 3769