1 // SPDX-License-Identifier: GPL-2.0 2 #ifndef NO_BCACHEFS_FS 3 4 #include "bcachefs.h" 5 #include "alloc_foreground.h" 6 #include "bkey_buf.h" 7 #include "btree_update.h" 8 #include "buckets.h" 9 #include "clock.h" 10 #include "error.h" 11 #include "extents.h" 12 #include "extent_update.h" 13 #include "fs.h" 14 #include "fs-io.h" 15 #include "fs-io-buffered.h" 16 #include "fs-io-pagecache.h" 17 #include "fsck.h" 18 #include "inode.h" 19 #include "journal.h" 20 #include "io_misc.h" 21 #include "keylist.h" 22 #include "quota.h" 23 #include "reflink.h" 24 #include "trace.h" 25 26 #include <linux/aio.h> 27 #include <linux/backing-dev.h> 28 #include <linux/falloc.h> 29 #include <linux/migrate.h> 30 #include <linux/mmu_context.h> 31 #include <linux/pagevec.h> 32 #include <linux/rmap.h> 33 #include <linux/sched/signal.h> 34 #include <linux/task_io_accounting_ops.h> 35 #include <linux/uio.h> 36 37 #include <trace/events/writeback.h> 38 39 struct nocow_flush { 40 struct closure *cl; 41 struct bch_dev *ca; 42 struct bio bio; 43 }; 44 45 static void nocow_flush_endio(struct bio *_bio) 46 { 47 48 struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); 49 50 closure_put(bio->cl); 51 percpu_ref_put(&bio->ca->io_ref); 52 bio_put(&bio->bio); 53 } 54 55 void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, 56 struct bch_inode_info *inode, 57 struct closure *cl) 58 { 59 struct nocow_flush *bio; 60 struct bch_dev *ca; 61 struct bch_devs_mask devs; 62 unsigned dev; 63 64 dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); 65 if (dev == BCH_SB_MEMBERS_MAX) 66 return; 67 68 devs = inode->ei_devs_need_flush; 69 memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); 70 71 for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { 72 rcu_read_lock(); 73 ca = rcu_dereference(c->devs[dev]); 74 if (ca && !percpu_ref_tryget(&ca->io_ref)) 75 ca = NULL; 76 rcu_read_unlock(); 77 78 if (!ca) 79 continue; 80 81 bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, 82 REQ_OP_WRITE|REQ_PREFLUSH, 83 GFP_KERNEL, 84 &c->nocow_flush_bioset), 85 struct nocow_flush, bio); 86 bio->cl = cl; 87 bio->ca = ca; 88 bio->bio.bi_end_io = nocow_flush_endio; 89 closure_bio_submit(&bio->bio, cl); 90 } 91 } 92 93 static int bch2_inode_flush_nocow_writes(struct bch_fs *c, 94 struct bch_inode_info *inode) 95 { 96 struct closure cl; 97 98 closure_init_stack(&cl); 99 bch2_inode_flush_nocow_writes_async(c, inode, &cl); 100 closure_sync(&cl); 101 102 return 0; 103 } 104 105 /* i_size updates: */ 106 107 struct inode_new_size { 108 loff_t new_size; 109 u64 now; 110 unsigned fields; 111 }; 112 113 static int inode_set_size(struct btree_trans *trans, 114 struct bch_inode_info *inode, 115 struct bch_inode_unpacked *bi, 116 void *p) 117 { 118 struct inode_new_size *s = p; 119 120 bi->bi_size = s->new_size; 121 if (s->fields & ATTR_ATIME) 122 bi->bi_atime = s->now; 123 if (s->fields & ATTR_MTIME) 124 bi->bi_mtime = s->now; 125 if (s->fields & ATTR_CTIME) 126 bi->bi_ctime = s->now; 127 128 return 0; 129 } 130 131 int __must_check bch2_write_inode_size(struct bch_fs *c, 132 struct bch_inode_info *inode, 133 loff_t new_size, unsigned fields) 134 { 135 struct inode_new_size s = { 136 .new_size = new_size, 137 .now = bch2_current_time(c), 138 .fields = fields, 139 }; 140 141 return bch2_write_inode(c, inode, inode_set_size, &s, fields); 142 } 143 144 void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, 145 struct quota_res *quota_res, s64 sectors) 146 { 147 bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, 148 "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", 149 inode->v.i_ino, (u64) inode->v.i_blocks, sectors, 150 inode->ei_inode.bi_sectors); 151 inode->v.i_blocks += sectors; 152 153 #ifdef CONFIG_BCACHEFS_QUOTA 154 if (quota_res && 155 !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && 156 sectors > 0) { 157 BUG_ON(sectors > quota_res->sectors); 158 BUG_ON(sectors > inode->ei_quota_reserved); 159 160 quota_res->sectors -= sectors; 161 inode->ei_quota_reserved -= sectors; 162 } else { 163 bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); 164 } 165 #endif 166 } 167 168 /* fsync: */ 169 170 /* 171 * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an 172 * insert trigger: look up the btree inode instead 173 */ 174 static int bch2_flush_inode(struct bch_fs *c, 175 struct bch_inode_info *inode) 176 { 177 if (c->opts.journal_flush_disabled) 178 return 0; 179 180 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) 181 return -EROFS; 182 183 struct bch_inode_unpacked u; 184 int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?: 185 bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: 186 bch2_inode_flush_nocow_writes(c, inode); 187 bch2_write_ref_put(c, BCH_WRITE_REF_fsync); 188 return ret; 189 } 190 191 int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) 192 { 193 struct bch_inode_info *inode = file_bch_inode(file); 194 struct bch_fs *c = inode->v.i_sb->s_fs_info; 195 int ret, err; 196 197 trace_bch2_fsync(file, datasync); 198 199 ret = file_write_and_wait_range(file, start, end); 200 if (ret) 201 goto out; 202 ret = sync_inode_metadata(&inode->v, 1); 203 if (ret) 204 goto out; 205 ret = bch2_flush_inode(c, inode); 206 out: 207 ret = bch2_err_class(ret); 208 if (ret == -EROFS) 209 ret = -EIO; 210 211 err = file_check_and_advance_wb_err(file); 212 if (!ret) 213 ret = err; 214 215 return ret; 216 } 217 218 /* truncate: */ 219 220 static inline int range_has_data(struct bch_fs *c, u32 subvol, 221 struct bpos start, 222 struct bpos end) 223 { 224 struct btree_trans *trans = bch2_trans_get(c); 225 struct btree_iter iter; 226 struct bkey_s_c k; 227 int ret = 0; 228 retry: 229 bch2_trans_begin(trans); 230 231 ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot); 232 if (ret) 233 goto err; 234 235 for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret) 236 if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { 237 ret = 1; 238 break; 239 } 240 start = iter.pos; 241 bch2_trans_iter_exit(trans, &iter); 242 err: 243 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 244 goto retry; 245 246 bch2_trans_put(trans); 247 return ret; 248 } 249 250 static int __bch2_truncate_folio(struct bch_inode_info *inode, 251 pgoff_t index, loff_t start, loff_t end) 252 { 253 struct bch_fs *c = inode->v.i_sb->s_fs_info; 254 struct address_space *mapping = inode->v.i_mapping; 255 struct bch_folio *s; 256 unsigned start_offset; 257 unsigned end_offset; 258 unsigned i; 259 struct folio *folio; 260 s64 i_sectors_delta = 0; 261 int ret = 0; 262 u64 end_pos; 263 264 folio = filemap_lock_folio(mapping, index); 265 if (IS_ERR_OR_NULL(folio)) { 266 /* 267 * XXX: we're doing two index lookups when we end up reading the 268 * folio 269 */ 270 ret = range_has_data(c, inode->ei_subvol, 271 POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), 272 POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); 273 if (ret <= 0) 274 return ret; 275 276 folio = __filemap_get_folio(mapping, index, 277 FGP_LOCK|FGP_CREAT, GFP_KERNEL); 278 if (IS_ERR_OR_NULL(folio)) { 279 ret = -ENOMEM; 280 goto out; 281 } 282 } 283 284 BUG_ON(start >= folio_end_pos(folio)); 285 BUG_ON(end <= folio_pos(folio)); 286 287 start_offset = max(start, folio_pos(folio)) - folio_pos(folio); 288 end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio); 289 290 /* Folio boundary? Nothing to do */ 291 if (start_offset == 0 && 292 end_offset == folio_size(folio)) { 293 ret = 0; 294 goto unlock; 295 } 296 297 s = bch2_folio_create(folio, 0); 298 if (!s) { 299 ret = -ENOMEM; 300 goto unlock; 301 } 302 303 if (!folio_test_uptodate(folio)) { 304 ret = bch2_read_single_folio(folio, mapping); 305 if (ret) 306 goto unlock; 307 } 308 309 ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); 310 if (ret) 311 goto unlock; 312 313 for (i = round_up(start_offset, block_bytes(c)) >> 9; 314 i < round_down(end_offset, block_bytes(c)) >> 9; 315 i++) { 316 s->s[i].nr_replicas = 0; 317 318 i_sectors_delta -= s->s[i].state == SECTOR_dirty; 319 bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); 320 } 321 322 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 323 324 /* 325 * Caller needs to know whether this folio will be written out by 326 * writeback - doing an i_size update if necessary - or whether it will 327 * be responsible for the i_size update. 328 * 329 * Note that we shouldn't ever see a folio beyond EOF, but check and 330 * warn if so. This has been observed by failure to clean up folios 331 * after a short write and there's still a chance reclaim will fix 332 * things up. 333 */ 334 WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size); 335 end_pos = folio_end_pos(folio); 336 if (inode->v.i_size > folio_pos(folio)) 337 end_pos = min_t(u64, inode->v.i_size, end_pos); 338 ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty; 339 340 folio_zero_segment(folio, start_offset, end_offset); 341 342 /* 343 * Bit of a hack - we don't want truncate to fail due to -ENOSPC. 344 * 345 * XXX: because we aren't currently tracking whether the folio has actual 346 * data in it (vs. just 0s, or only partially written) this wrong. ick. 347 */ 348 BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false)); 349 350 /* 351 * This removes any writeable userspace mappings; we need to force 352 * .page_mkwrite to be called again before any mmapped writes, to 353 * redirty the full page: 354 */ 355 folio_mkclean(folio); 356 filemap_dirty_folio(mapping, folio); 357 unlock: 358 folio_unlock(folio); 359 folio_put(folio); 360 out: 361 return ret; 362 } 363 364 static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from) 365 { 366 return __bch2_truncate_folio(inode, from >> PAGE_SHIFT, 367 from, ANYSINT_MAX(loff_t)); 368 } 369 370 static int bch2_truncate_folios(struct bch_inode_info *inode, 371 loff_t start, loff_t end) 372 { 373 int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT, 374 start, end); 375 376 if (ret >= 0 && 377 start >> PAGE_SHIFT != end >> PAGE_SHIFT) 378 ret = __bch2_truncate_folio(inode, 379 (end - 1) >> PAGE_SHIFT, 380 start, end); 381 return ret; 382 } 383 384 static int bch2_extend(struct mnt_idmap *idmap, 385 struct bch_inode_info *inode, 386 struct bch_inode_unpacked *inode_u, 387 struct iattr *iattr) 388 { 389 struct address_space *mapping = inode->v.i_mapping; 390 int ret; 391 392 /* 393 * sync appends: 394 * 395 * this has to be done _before_ extending i_size: 396 */ 397 ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); 398 if (ret) 399 return ret; 400 401 truncate_setsize(&inode->v, iattr->ia_size); 402 403 return bch2_setattr_nonsize(idmap, inode, iattr); 404 } 405 406 int bchfs_truncate(struct mnt_idmap *idmap, 407 struct bch_inode_info *inode, struct iattr *iattr) 408 { 409 struct bch_fs *c = inode->v.i_sb->s_fs_info; 410 struct address_space *mapping = inode->v.i_mapping; 411 struct bch_inode_unpacked inode_u; 412 s64 i_sectors_delta = 0; 413 int ret = 0; 414 415 /* 416 * If the truncate call with change the size of the file, the 417 * cmtimes should be updated. If the size will not change, we 418 * do not need to update the cmtimes. 419 */ 420 if (iattr->ia_size != inode->v.i_size) { 421 if (!(iattr->ia_valid & ATTR_MTIME)) 422 ktime_get_coarse_real_ts64(&iattr->ia_mtime); 423 if (!(iattr->ia_valid & ATTR_CTIME)) 424 ktime_get_coarse_real_ts64(&iattr->ia_ctime); 425 iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; 426 } 427 428 inode_dio_wait(&inode->v); 429 bch2_pagecache_block_get(inode); 430 431 ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); 432 if (ret) 433 goto err; 434 435 /* 436 * check this before next assertion; on filesystem error our normal 437 * invariants are a bit broken (truncate has to truncate the page cache 438 * before the inode). 439 */ 440 ret = bch2_journal_error(&c->journal); 441 if (ret) 442 goto err; 443 444 WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && 445 inode->v.i_size < inode_u.bi_size, 446 "truncate spotted in mem i_size < btree i_size: %llu < %llu\n", 447 (u64) inode->v.i_size, inode_u.bi_size); 448 449 if (iattr->ia_size > inode->v.i_size) { 450 ret = bch2_extend(idmap, inode, &inode_u, iattr); 451 goto err; 452 } 453 454 iattr->ia_valid &= ~ATTR_SIZE; 455 456 ret = bch2_truncate_folio(inode, iattr->ia_size); 457 if (unlikely(ret < 0)) 458 goto err; 459 460 truncate_setsize(&inode->v, iattr->ia_size); 461 462 /* 463 * When extending, we're going to write the new i_size to disk 464 * immediately so we need to flush anything above the current on disk 465 * i_size first: 466 * 467 * Also, when extending we need to flush the page that i_size currently 468 * straddles - if it's mapped to userspace, we need to ensure that 469 * userspace has to redirty it and call .mkwrite -> set_page_dirty 470 * again to allocate the part of the page that was extended. 471 */ 472 if (iattr->ia_size > inode_u.bi_size) 473 ret = filemap_write_and_wait_range(mapping, 474 inode_u.bi_size, 475 iattr->ia_size - 1); 476 else if (iattr->ia_size & (PAGE_SIZE - 1)) 477 ret = filemap_write_and_wait_range(mapping, 478 round_down(iattr->ia_size, PAGE_SIZE), 479 iattr->ia_size - 1); 480 if (ret) 481 goto err; 482 483 ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta); 484 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 485 486 if (unlikely(ret)) { 487 /* 488 * If we error here, VFS caches are now inconsistent with btree 489 */ 490 set_bit(EI_INODE_ERROR, &inode->ei_flags); 491 goto err; 492 } 493 494 bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && 495 !bch2_journal_error(&c->journal), c, 496 "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", 497 inode->v.i_ino, (u64) inode->v.i_blocks, 498 inode->ei_inode.bi_sectors); 499 500 ret = bch2_setattr_nonsize(idmap, inode, iattr); 501 err: 502 bch2_pagecache_block_put(inode); 503 return bch2_err_class(ret); 504 } 505 506 /* fallocate: */ 507 508 static int inode_update_times_fn(struct btree_trans *trans, 509 struct bch_inode_info *inode, 510 struct bch_inode_unpacked *bi, void *p) 511 { 512 struct bch_fs *c = inode->v.i_sb->s_fs_info; 513 514 bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); 515 return 0; 516 } 517 518 static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) 519 { 520 struct bch_fs *c = inode->v.i_sb->s_fs_info; 521 u64 end = offset + len; 522 u64 block_start = round_up(offset, block_bytes(c)); 523 u64 block_end = round_down(end, block_bytes(c)); 524 bool truncated_last_page; 525 int ret = 0; 526 527 ret = bch2_truncate_folios(inode, offset, end); 528 if (unlikely(ret < 0)) 529 goto err; 530 531 truncated_last_page = ret; 532 533 truncate_pagecache_range(&inode->v, offset, end - 1); 534 535 if (block_start < block_end) { 536 s64 i_sectors_delta = 0; 537 538 ret = bch2_fpunch(c, inode_inum(inode), 539 block_start >> 9, block_end >> 9, 540 &i_sectors_delta); 541 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 542 } 543 544 mutex_lock(&inode->ei_update_lock); 545 if (end >= inode->v.i_size && !truncated_last_page) { 546 ret = bch2_write_inode_size(c, inode, inode->v.i_size, 547 ATTR_MTIME|ATTR_CTIME); 548 } else { 549 ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, 550 ATTR_MTIME|ATTR_CTIME); 551 } 552 mutex_unlock(&inode->ei_update_lock); 553 err: 554 return ret; 555 } 556 557 static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode, 558 loff_t offset, loff_t len, 559 bool insert) 560 { 561 struct bch_fs *c = inode->v.i_sb->s_fs_info; 562 struct address_space *mapping = inode->v.i_mapping; 563 s64 i_sectors_delta = 0; 564 int ret = 0; 565 566 if ((offset | len) & (block_bytes(c) - 1)) 567 return -EINVAL; 568 569 if (insert) { 570 if (offset >= inode->v.i_size) 571 return -EINVAL; 572 } else { 573 if (offset + len >= inode->v.i_size) 574 return -EINVAL; 575 } 576 577 ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); 578 if (ret) 579 return ret; 580 581 if (insert) 582 i_size_write(&inode->v, inode->v.i_size + len); 583 584 ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9, 585 insert, &i_sectors_delta); 586 if (!ret && !insert) 587 i_size_write(&inode->v, inode->v.i_size - len); 588 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 589 590 return ret; 591 } 592 593 static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, 594 u64 start_sector, u64 end_sector) 595 { 596 struct bch_fs *c = inode->v.i_sb->s_fs_info; 597 struct btree_trans *trans = bch2_trans_get(c); 598 struct btree_iter iter; 599 struct bpos end_pos = POS(inode->v.i_ino, end_sector); 600 struct bch_io_opts opts; 601 int ret = 0; 602 603 bch2_inode_opts_get(&opts, c, &inode->ei_inode); 604 605 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 606 POS(inode->v.i_ino, start_sector), 607 BTREE_ITER_slots|BTREE_ITER_intent); 608 609 while (!ret && bkey_lt(iter.pos, end_pos)) { 610 s64 i_sectors_delta = 0; 611 struct quota_res quota_res = { 0 }; 612 struct bkey_s_c k; 613 unsigned sectors; 614 bool is_allocation; 615 u64 hole_start, hole_end; 616 u32 snapshot; 617 618 bch2_trans_begin(trans); 619 620 ret = bch2_subvolume_get_snapshot(trans, 621 inode->ei_subvol, &snapshot); 622 if (ret) 623 goto bkey_err; 624 625 bch2_btree_iter_set_snapshot(&iter, snapshot); 626 627 k = bch2_btree_iter_peek_slot(&iter); 628 if ((ret = bkey_err(k))) 629 goto bkey_err; 630 631 hole_start = iter.pos.offset; 632 hole_end = bpos_min(k.k->p, end_pos).offset; 633 is_allocation = bkey_extent_is_allocation(k.k); 634 635 /* already reserved */ 636 if (bkey_extent_is_reservation(k) && 637 bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { 638 bch2_btree_iter_advance(&iter); 639 continue; 640 } 641 642 if (bkey_extent_is_data(k.k) && 643 !(mode & FALLOC_FL_ZERO_RANGE)) { 644 bch2_btree_iter_advance(&iter); 645 continue; 646 } 647 648 if (!(mode & FALLOC_FL_ZERO_RANGE)) { 649 /* 650 * Lock ordering - can't be holding btree locks while 651 * blocking on a folio lock: 652 */ 653 if (bch2_clamp_data_hole(&inode->v, 654 &hole_start, 655 &hole_end, 656 opts.data_replicas, true)) 657 ret = drop_locks_do(trans, 658 (bch2_clamp_data_hole(&inode->v, 659 &hole_start, 660 &hole_end, 661 opts.data_replicas, false), 0)); 662 bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); 663 664 if (ret) 665 goto bkey_err; 666 667 if (hole_start == hole_end) 668 continue; 669 } 670 671 sectors = hole_end - hole_start; 672 673 if (!is_allocation) { 674 ret = bch2_quota_reservation_add(c, inode, 675 "a_res, sectors, true); 676 if (unlikely(ret)) 677 goto bkey_err; 678 } 679 680 ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter, 681 sectors, opts, &i_sectors_delta, 682 writepoint_hashed((unsigned long) current)); 683 if (ret) 684 goto bkey_err; 685 686 bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); 687 688 if (bch2_mark_pagecache_reserved(inode, &hole_start, 689 iter.pos.offset, true)) 690 drop_locks_do(trans, 691 bch2_mark_pagecache_reserved(inode, &hole_start, 692 iter.pos.offset, false)); 693 bkey_err: 694 bch2_quota_reservation_put(c, inode, "a_res); 695 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 696 ret = 0; 697 } 698 699 if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { 700 struct quota_res quota_res = { 0 }; 701 s64 i_sectors_delta = 0; 702 703 bch2_fpunch_at(trans, &iter, inode_inum(inode), 704 end_sector, &i_sectors_delta); 705 bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); 706 bch2_quota_reservation_put(c, inode, "a_res); 707 } 708 709 bch2_trans_iter_exit(trans, &iter); 710 bch2_trans_put(trans); 711 return ret; 712 } 713 714 static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode, 715 loff_t offset, loff_t len) 716 { 717 struct bch_fs *c = inode->v.i_sb->s_fs_info; 718 u64 end = offset + len; 719 u64 block_start = round_down(offset, block_bytes(c)); 720 u64 block_end = round_up(end, block_bytes(c)); 721 bool truncated_last_page = false; 722 int ret, ret2 = 0; 723 724 if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { 725 ret = inode_newsize_ok(&inode->v, end); 726 if (ret) 727 return ret; 728 } 729 730 if (mode & FALLOC_FL_ZERO_RANGE) { 731 ret = bch2_truncate_folios(inode, offset, end); 732 if (unlikely(ret < 0)) 733 return ret; 734 735 truncated_last_page = ret; 736 737 truncate_pagecache_range(&inode->v, offset, end - 1); 738 739 block_start = round_up(offset, block_bytes(c)); 740 block_end = round_down(end, block_bytes(c)); 741 } 742 743 ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); 744 745 /* 746 * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, 747 * so that the VFS cache i_size is consistent with the btree i_size: 748 */ 749 if (ret && 750 !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) 751 return ret; 752 753 if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) 754 end = inode->v.i_size; 755 756 if (end >= inode->v.i_size && 757 (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || 758 !(mode & FALLOC_FL_KEEP_SIZE))) { 759 spin_lock(&inode->v.i_lock); 760 i_size_write(&inode->v, end); 761 spin_unlock(&inode->v.i_lock); 762 763 mutex_lock(&inode->ei_update_lock); 764 ret2 = bch2_write_inode_size(c, inode, end, 0); 765 mutex_unlock(&inode->ei_update_lock); 766 } 767 768 return ret ?: ret2; 769 } 770 771 long bch2_fallocate_dispatch(struct file *file, int mode, 772 loff_t offset, loff_t len) 773 { 774 struct bch_inode_info *inode = file_bch_inode(file); 775 struct bch_fs *c = inode->v.i_sb->s_fs_info; 776 long ret; 777 778 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) 779 return -EROFS; 780 781 inode_lock(&inode->v); 782 inode_dio_wait(&inode->v); 783 bch2_pagecache_block_get(inode); 784 785 ret = file_modified(file); 786 if (ret) 787 goto err; 788 789 if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) 790 ret = bchfs_fallocate(inode, mode, offset, len); 791 else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) 792 ret = bchfs_fpunch(inode, offset, len); 793 else if (mode == FALLOC_FL_INSERT_RANGE) 794 ret = bchfs_fcollapse_finsert(inode, offset, len, true); 795 else if (mode == FALLOC_FL_COLLAPSE_RANGE) 796 ret = bchfs_fcollapse_finsert(inode, offset, len, false); 797 else 798 ret = -EOPNOTSUPP; 799 err: 800 bch2_pagecache_block_put(inode); 801 inode_unlock(&inode->v); 802 bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); 803 804 return bch2_err_class(ret); 805 } 806 807 /* 808 * Take a quota reservation for unallocated blocks in a given file range 809 * Does not check pagecache 810 */ 811 static int quota_reserve_range(struct bch_inode_info *inode, 812 struct quota_res *res, 813 u64 start, u64 end) 814 { 815 struct bch_fs *c = inode->v.i_sb->s_fs_info; 816 struct btree_trans *trans = bch2_trans_get(c); 817 struct btree_iter iter; 818 struct bkey_s_c k; 819 u32 snapshot; 820 u64 sectors = end - start; 821 u64 pos = start; 822 int ret; 823 retry: 824 bch2_trans_begin(trans); 825 826 ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); 827 if (ret) 828 goto err; 829 830 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 831 SPOS(inode->v.i_ino, pos, snapshot), 0); 832 833 while (!(ret = btree_trans_too_many_iters(trans)) && 834 (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && 835 !(ret = bkey_err(k))) { 836 if (bkey_extent_is_allocation(k.k)) { 837 u64 s = min(end, k.k->p.offset) - 838 max(start, bkey_start_offset(k.k)); 839 BUG_ON(s > sectors); 840 sectors -= s; 841 } 842 bch2_btree_iter_advance(&iter); 843 } 844 pos = iter.pos.offset; 845 bch2_trans_iter_exit(trans, &iter); 846 err: 847 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 848 goto retry; 849 850 bch2_trans_put(trans); 851 852 return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); 853 } 854 855 loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, 856 struct file *file_dst, loff_t pos_dst, 857 loff_t len, unsigned remap_flags) 858 { 859 struct bch_inode_info *src = file_bch_inode(file_src); 860 struct bch_inode_info *dst = file_bch_inode(file_dst); 861 struct bch_fs *c = src->v.i_sb->s_fs_info; 862 struct quota_res quota_res = { 0 }; 863 s64 i_sectors_delta = 0; 864 u64 aligned_len; 865 loff_t ret = 0; 866 867 if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) 868 return -EINVAL; 869 870 if ((pos_src & (block_bytes(c) - 1)) || 871 (pos_dst & (block_bytes(c) - 1))) 872 return -EINVAL; 873 874 if (src == dst && 875 abs(pos_src - pos_dst) < len) 876 return -EINVAL; 877 878 lock_two_nondirectories(&src->v, &dst->v); 879 bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst); 880 881 inode_dio_wait(&src->v); 882 inode_dio_wait(&dst->v); 883 884 ret = generic_remap_file_range_prep(file_src, pos_src, 885 file_dst, pos_dst, 886 &len, remap_flags); 887 if (ret < 0 || len == 0) 888 goto err; 889 890 aligned_len = round_up((u64) len, block_bytes(c)); 891 892 ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, 893 pos_dst, pos_dst + len - 1); 894 if (ret) 895 goto err; 896 897 ret = quota_reserve_range(dst, "a_res, pos_dst >> 9, 898 (pos_dst + aligned_len) >> 9); 899 if (ret) 900 goto err; 901 902 if (!(remap_flags & REMAP_FILE_DEDUP)) 903 file_update_time(file_dst); 904 905 bch2_mark_pagecache_unallocated(src, pos_src >> 9, 906 (pos_src + aligned_len) >> 9); 907 908 ret = bch2_remap_range(c, 909 inode_inum(dst), pos_dst >> 9, 910 inode_inum(src), pos_src >> 9, 911 aligned_len >> 9, 912 pos_dst + len, &i_sectors_delta); 913 if (ret < 0) 914 goto err; 915 916 /* 917 * due to alignment, we might have remapped slightly more than requsted 918 */ 919 ret = min((u64) ret << 9, (u64) len); 920 921 bch2_i_sectors_acct(c, dst, "a_res, i_sectors_delta); 922 923 spin_lock(&dst->v.i_lock); 924 if (pos_dst + ret > dst->v.i_size) 925 i_size_write(&dst->v, pos_dst + ret); 926 spin_unlock(&dst->v.i_lock); 927 928 if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || 929 IS_SYNC(file_inode(file_dst))) 930 ret = bch2_flush_inode(c, dst); 931 err: 932 bch2_quota_reservation_put(c, dst, "a_res); 933 bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst); 934 unlock_two_nondirectories(&src->v, &dst->v); 935 936 return bch2_err_class(ret); 937 } 938 939 /* fseek: */ 940 941 static loff_t bch2_seek_data(struct file *file, u64 offset) 942 { 943 struct bch_inode_info *inode = file_bch_inode(file); 944 struct bch_fs *c = inode->v.i_sb->s_fs_info; 945 struct btree_trans *trans; 946 struct btree_iter iter; 947 struct bkey_s_c k; 948 subvol_inum inum = inode_inum(inode); 949 u64 isize, next_data = MAX_LFS_FILESIZE; 950 u32 snapshot; 951 int ret; 952 953 isize = i_size_read(&inode->v); 954 if (offset >= isize) 955 return -ENXIO; 956 957 trans = bch2_trans_get(c); 958 retry: 959 bch2_trans_begin(trans); 960 961 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 962 if (ret) 963 goto err; 964 965 for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, 966 SPOS(inode->v.i_ino, offset >> 9, snapshot), 967 POS(inode->v.i_ino, U64_MAX), 968 0, k, ret) { 969 if (bkey_extent_is_data(k.k)) { 970 next_data = max(offset, bkey_start_offset(k.k) << 9); 971 break; 972 } else if (k.k->p.offset >> 9 > isize) 973 break; 974 } 975 bch2_trans_iter_exit(trans, &iter); 976 err: 977 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 978 goto retry; 979 980 bch2_trans_put(trans); 981 if (ret) 982 return ret; 983 984 if (next_data > offset) 985 next_data = bch2_seek_pagecache_data(&inode->v, 986 offset, next_data, 0, false); 987 988 if (next_data >= isize) 989 return -ENXIO; 990 991 return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); 992 } 993 994 static loff_t bch2_seek_hole(struct file *file, u64 offset) 995 { 996 struct bch_inode_info *inode = file_bch_inode(file); 997 struct bch_fs *c = inode->v.i_sb->s_fs_info; 998 struct btree_trans *trans; 999 struct btree_iter iter; 1000 struct bkey_s_c k; 1001 subvol_inum inum = inode_inum(inode); 1002 u64 isize, next_hole = MAX_LFS_FILESIZE; 1003 u32 snapshot; 1004 int ret; 1005 1006 isize = i_size_read(&inode->v); 1007 if (offset >= isize) 1008 return -ENXIO; 1009 1010 trans = bch2_trans_get(c); 1011 retry: 1012 bch2_trans_begin(trans); 1013 1014 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 1015 if (ret) 1016 goto err; 1017 1018 for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, 1019 SPOS(inode->v.i_ino, offset >> 9, snapshot), 1020 BTREE_ITER_slots, k, ret) { 1021 if (k.k->p.inode != inode->v.i_ino) { 1022 next_hole = bch2_seek_pagecache_hole(&inode->v, 1023 offset, MAX_LFS_FILESIZE, 0, false); 1024 break; 1025 } else if (!bkey_extent_is_data(k.k)) { 1026 next_hole = bch2_seek_pagecache_hole(&inode->v, 1027 max(offset, bkey_start_offset(k.k) << 9), 1028 k.k->p.offset << 9, 0, false); 1029 1030 if (next_hole < k.k->p.offset << 9) 1031 break; 1032 } else { 1033 offset = max(offset, bkey_start_offset(k.k) << 9); 1034 } 1035 } 1036 bch2_trans_iter_exit(trans, &iter); 1037 err: 1038 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1039 goto retry; 1040 1041 bch2_trans_put(trans); 1042 if (ret) 1043 return ret; 1044 1045 if (next_hole > isize) 1046 next_hole = isize; 1047 1048 return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); 1049 } 1050 1051 loff_t bch2_llseek(struct file *file, loff_t offset, int whence) 1052 { 1053 loff_t ret; 1054 1055 switch (whence) { 1056 case SEEK_SET: 1057 case SEEK_CUR: 1058 case SEEK_END: 1059 ret = generic_file_llseek(file, offset, whence); 1060 break; 1061 case SEEK_DATA: 1062 ret = bch2_seek_data(file, offset); 1063 break; 1064 case SEEK_HOLE: 1065 ret = bch2_seek_hole(file, offset); 1066 break; 1067 default: 1068 ret = -EINVAL; 1069 break; 1070 } 1071 1072 return bch2_err_class(ret); 1073 } 1074 1075 void bch2_fs_fsio_exit(struct bch_fs *c) 1076 { 1077 bioset_exit(&c->nocow_flush_bioset); 1078 } 1079 1080 int bch2_fs_fsio_init(struct bch_fs *c) 1081 { 1082 if (bioset_init(&c->nocow_flush_bioset, 1083 1, offsetof(struct nocow_flush, bio), 0)) 1084 return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; 1085 1086 return 0; 1087 } 1088 1089 #endif /* NO_BCACHEFS_FS */ 1090