1 // SPDX-License-Identifier: GPL-2.0 2 #ifndef NO_BCACHEFS_FS 3 4 #include "bcachefs.h" 5 #include "alloc_foreground.h" 6 #include "bkey_buf.h" 7 #include "btree_update.h" 8 #include "buckets.h" 9 #include "clock.h" 10 #include "error.h" 11 #include "extents.h" 12 #include "extent_update.h" 13 #include "fs.h" 14 #include "fs-io.h" 15 #include "fs-io-buffered.h" 16 #include "fs-io-pagecache.h" 17 #include "fsck.h" 18 #include "inode.h" 19 #include "journal.h" 20 #include "io_misc.h" 21 #include "keylist.h" 22 #include "quota.h" 23 #include "reflink.h" 24 #include "trace.h" 25 26 #include <linux/aio.h> 27 #include <linux/backing-dev.h> 28 #include <linux/falloc.h> 29 #include <linux/migrate.h> 30 #include <linux/mmu_context.h> 31 #include <linux/pagevec.h> 32 #include <linux/rmap.h> 33 #include <linux/sched/signal.h> 34 #include <linux/task_io_accounting_ops.h> 35 #include <linux/uio.h> 36 37 #include <trace/events/writeback.h> 38 39 struct nocow_flush { 40 struct closure *cl; 41 struct bch_dev *ca; 42 struct bio bio; 43 }; 44 45 static void nocow_flush_endio(struct bio *_bio) 46 { 47 48 struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); 49 50 closure_put(bio->cl); 51 percpu_ref_put(&bio->ca->io_ref); 52 bio_put(&bio->bio); 53 } 54 55 void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, 56 struct bch_inode_info *inode, 57 struct closure *cl) 58 { 59 struct nocow_flush *bio; 60 struct bch_dev *ca; 61 struct bch_devs_mask devs; 62 unsigned dev; 63 64 dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); 65 if (dev == BCH_SB_MEMBERS_MAX) 66 return; 67 68 devs = inode->ei_devs_need_flush; 69 memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); 70 71 for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { 72 rcu_read_lock(); 73 ca = rcu_dereference(c->devs[dev]); 74 if (ca && !percpu_ref_tryget(&ca->io_ref)) 75 ca = NULL; 76 rcu_read_unlock(); 77 78 if (!ca) 79 continue; 80 81 bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, 82 REQ_OP_FLUSH, 83 GFP_KERNEL, 84 &c->nocow_flush_bioset), 85 struct nocow_flush, bio); 86 bio->cl = cl; 87 bio->ca = ca; 88 bio->bio.bi_end_io = nocow_flush_endio; 89 closure_bio_submit(&bio->bio, cl); 90 } 91 } 92 93 static int bch2_inode_flush_nocow_writes(struct bch_fs *c, 94 struct bch_inode_info *inode) 95 { 96 struct closure cl; 97 98 closure_init_stack(&cl); 99 bch2_inode_flush_nocow_writes_async(c, inode, &cl); 100 closure_sync(&cl); 101 102 return 0; 103 } 104 105 /* i_size updates: */ 106 107 struct inode_new_size { 108 loff_t new_size; 109 u64 now; 110 unsigned fields; 111 }; 112 113 static int inode_set_size(struct btree_trans *trans, 114 struct bch_inode_info *inode, 115 struct bch_inode_unpacked *bi, 116 void *p) 117 { 118 struct inode_new_size *s = p; 119 120 bi->bi_size = s->new_size; 121 if (s->fields & ATTR_ATIME) 122 bi->bi_atime = s->now; 123 if (s->fields & ATTR_MTIME) 124 bi->bi_mtime = s->now; 125 if (s->fields & ATTR_CTIME) 126 bi->bi_ctime = s->now; 127 128 return 0; 129 } 130 131 int __must_check bch2_write_inode_size(struct bch_fs *c, 132 struct bch_inode_info *inode, 133 loff_t new_size, unsigned fields) 134 { 135 struct inode_new_size s = { 136 .new_size = new_size, 137 .now = bch2_current_time(c), 138 .fields = fields, 139 }; 140 141 return bch2_write_inode(c, inode, inode_set_size, &s, fields); 142 } 143 144 void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, 145 struct quota_res *quota_res, s64 sectors) 146 { 147 bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, 148 "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", 149 inode->v.i_ino, (u64) inode->v.i_blocks, sectors, 150 inode->ei_inode.bi_sectors); 151 inode->v.i_blocks += sectors; 152 153 #ifdef CONFIG_BCACHEFS_QUOTA 154 if (quota_res && 155 !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && 156 sectors > 0) { 157 BUG_ON(sectors > quota_res->sectors); 158 BUG_ON(sectors > inode->ei_quota_reserved); 159 160 quota_res->sectors -= sectors; 161 inode->ei_quota_reserved -= sectors; 162 } else { 163 bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); 164 } 165 #endif 166 } 167 168 /* fsync: */ 169 170 /* 171 * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an 172 * insert trigger: look up the btree inode instead 173 */ 174 static int bch2_flush_inode(struct bch_fs *c, 175 struct bch_inode_info *inode) 176 { 177 struct bch_inode_unpacked u; 178 int ret; 179 180 if (c->opts.journal_flush_disabled) 181 return 0; 182 183 ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u); 184 if (ret) 185 return ret; 186 187 return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: 188 bch2_inode_flush_nocow_writes(c, inode); 189 } 190 191 int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) 192 { 193 struct bch_inode_info *inode = file_bch_inode(file); 194 struct bch_fs *c = inode->v.i_sb->s_fs_info; 195 int ret, ret2, ret3; 196 197 ret = file_write_and_wait_range(file, start, end); 198 ret2 = sync_inode_metadata(&inode->v, 1); 199 ret3 = bch2_flush_inode(c, inode); 200 201 return bch2_err_class(ret ?: ret2 ?: ret3); 202 } 203 204 /* truncate: */ 205 206 static inline int range_has_data(struct bch_fs *c, u32 subvol, 207 struct bpos start, 208 struct bpos end) 209 { 210 struct btree_trans *trans = bch2_trans_get(c); 211 struct btree_iter iter; 212 struct bkey_s_c k; 213 int ret = 0; 214 retry: 215 bch2_trans_begin(trans); 216 217 ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot); 218 if (ret) 219 goto err; 220 221 for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret) 222 if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { 223 ret = 1; 224 break; 225 } 226 start = iter.pos; 227 bch2_trans_iter_exit(trans, &iter); 228 err: 229 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 230 goto retry; 231 232 bch2_trans_put(trans); 233 return ret; 234 } 235 236 static int __bch2_truncate_folio(struct bch_inode_info *inode, 237 pgoff_t index, loff_t start, loff_t end) 238 { 239 struct bch_fs *c = inode->v.i_sb->s_fs_info; 240 struct address_space *mapping = inode->v.i_mapping; 241 struct bch_folio *s; 242 unsigned start_offset; 243 unsigned end_offset; 244 unsigned i; 245 struct folio *folio; 246 s64 i_sectors_delta = 0; 247 int ret = 0; 248 u64 end_pos; 249 250 folio = filemap_lock_folio(mapping, index); 251 if (IS_ERR_OR_NULL(folio)) { 252 /* 253 * XXX: we're doing two index lookups when we end up reading the 254 * folio 255 */ 256 ret = range_has_data(c, inode->ei_subvol, 257 POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), 258 POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); 259 if (ret <= 0) 260 return ret; 261 262 folio = __filemap_get_folio(mapping, index, 263 FGP_LOCK|FGP_CREAT, GFP_KERNEL); 264 if (IS_ERR_OR_NULL(folio)) { 265 ret = -ENOMEM; 266 goto out; 267 } 268 } 269 270 BUG_ON(start >= folio_end_pos(folio)); 271 BUG_ON(end <= folio_pos(folio)); 272 273 start_offset = max(start, folio_pos(folio)) - folio_pos(folio); 274 end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio); 275 276 /* Folio boundary? Nothing to do */ 277 if (start_offset == 0 && 278 end_offset == folio_size(folio)) { 279 ret = 0; 280 goto unlock; 281 } 282 283 s = bch2_folio_create(folio, 0); 284 if (!s) { 285 ret = -ENOMEM; 286 goto unlock; 287 } 288 289 if (!folio_test_uptodate(folio)) { 290 ret = bch2_read_single_folio(folio, mapping); 291 if (ret) 292 goto unlock; 293 } 294 295 ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); 296 if (ret) 297 goto unlock; 298 299 for (i = round_up(start_offset, block_bytes(c)) >> 9; 300 i < round_down(end_offset, block_bytes(c)) >> 9; 301 i++) { 302 s->s[i].nr_replicas = 0; 303 304 i_sectors_delta -= s->s[i].state == SECTOR_dirty; 305 bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); 306 } 307 308 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 309 310 /* 311 * Caller needs to know whether this folio will be written out by 312 * writeback - doing an i_size update if necessary - or whether it will 313 * be responsible for the i_size update. 314 * 315 * Note that we shouldn't ever see a folio beyond EOF, but check and 316 * warn if so. This has been observed by failure to clean up folios 317 * after a short write and there's still a chance reclaim will fix 318 * things up. 319 */ 320 WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size); 321 end_pos = folio_end_pos(folio); 322 if (inode->v.i_size > folio_pos(folio)) 323 end_pos = min_t(u64, inode->v.i_size, end_pos); 324 ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty; 325 326 folio_zero_segment(folio, start_offset, end_offset); 327 328 /* 329 * Bit of a hack - we don't want truncate to fail due to -ENOSPC. 330 * 331 * XXX: because we aren't currently tracking whether the folio has actual 332 * data in it (vs. just 0s, or only partially written) this wrong. ick. 333 */ 334 BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false)); 335 336 /* 337 * This removes any writeable userspace mappings; we need to force 338 * .page_mkwrite to be called again before any mmapped writes, to 339 * redirty the full page: 340 */ 341 folio_mkclean(folio); 342 filemap_dirty_folio(mapping, folio); 343 unlock: 344 folio_unlock(folio); 345 folio_put(folio); 346 out: 347 return ret; 348 } 349 350 static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from) 351 { 352 return __bch2_truncate_folio(inode, from >> PAGE_SHIFT, 353 from, ANYSINT_MAX(loff_t)); 354 } 355 356 static int bch2_truncate_folios(struct bch_inode_info *inode, 357 loff_t start, loff_t end) 358 { 359 int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT, 360 start, end); 361 362 if (ret >= 0 && 363 start >> PAGE_SHIFT != end >> PAGE_SHIFT) 364 ret = __bch2_truncate_folio(inode, 365 (end - 1) >> PAGE_SHIFT, 366 start, end); 367 return ret; 368 } 369 370 static int bch2_extend(struct mnt_idmap *idmap, 371 struct bch_inode_info *inode, 372 struct bch_inode_unpacked *inode_u, 373 struct iattr *iattr) 374 { 375 struct address_space *mapping = inode->v.i_mapping; 376 int ret; 377 378 /* 379 * sync appends: 380 * 381 * this has to be done _before_ extending i_size: 382 */ 383 ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); 384 if (ret) 385 return ret; 386 387 truncate_setsize(&inode->v, iattr->ia_size); 388 389 return bch2_setattr_nonsize(idmap, inode, iattr); 390 } 391 392 int bchfs_truncate(struct mnt_idmap *idmap, 393 struct bch_inode_info *inode, struct iattr *iattr) 394 { 395 struct bch_fs *c = inode->v.i_sb->s_fs_info; 396 struct address_space *mapping = inode->v.i_mapping; 397 struct bch_inode_unpacked inode_u; 398 s64 i_sectors_delta = 0; 399 int ret = 0; 400 401 /* 402 * If the truncate call with change the size of the file, the 403 * cmtimes should be updated. If the size will not change, we 404 * do not need to update the cmtimes. 405 */ 406 if (iattr->ia_size != inode->v.i_size) { 407 if (!(iattr->ia_valid & ATTR_MTIME)) 408 ktime_get_coarse_real_ts64(&iattr->ia_mtime); 409 if (!(iattr->ia_valid & ATTR_CTIME)) 410 ktime_get_coarse_real_ts64(&iattr->ia_ctime); 411 iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; 412 } 413 414 inode_dio_wait(&inode->v); 415 bch2_pagecache_block_get(inode); 416 417 ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); 418 if (ret) 419 goto err; 420 421 /* 422 * check this before next assertion; on filesystem error our normal 423 * invariants are a bit broken (truncate has to truncate the page cache 424 * before the inode). 425 */ 426 ret = bch2_journal_error(&c->journal); 427 if (ret) 428 goto err; 429 430 WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && 431 inode->v.i_size < inode_u.bi_size, 432 "truncate spotted in mem i_size < btree i_size: %llu < %llu\n", 433 (u64) inode->v.i_size, inode_u.bi_size); 434 435 if (iattr->ia_size > inode->v.i_size) { 436 ret = bch2_extend(idmap, inode, &inode_u, iattr); 437 goto err; 438 } 439 440 iattr->ia_valid &= ~ATTR_SIZE; 441 442 ret = bch2_truncate_folio(inode, iattr->ia_size); 443 if (unlikely(ret < 0)) 444 goto err; 445 446 truncate_setsize(&inode->v, iattr->ia_size); 447 448 /* 449 * When extending, we're going to write the new i_size to disk 450 * immediately so we need to flush anything above the current on disk 451 * i_size first: 452 * 453 * Also, when extending we need to flush the page that i_size currently 454 * straddles - if it's mapped to userspace, we need to ensure that 455 * userspace has to redirty it and call .mkwrite -> set_page_dirty 456 * again to allocate the part of the page that was extended. 457 */ 458 if (iattr->ia_size > inode_u.bi_size) 459 ret = filemap_write_and_wait_range(mapping, 460 inode_u.bi_size, 461 iattr->ia_size - 1); 462 else if (iattr->ia_size & (PAGE_SIZE - 1)) 463 ret = filemap_write_and_wait_range(mapping, 464 round_down(iattr->ia_size, PAGE_SIZE), 465 iattr->ia_size - 1); 466 if (ret) 467 goto err; 468 469 ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta); 470 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 471 472 if (unlikely(ret)) { 473 /* 474 * If we error here, VFS caches are now inconsistent with btree 475 */ 476 set_bit(EI_INODE_ERROR, &inode->ei_flags); 477 goto err; 478 } 479 480 bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && 481 !bch2_journal_error(&c->journal), c, 482 "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", 483 inode->v.i_ino, (u64) inode->v.i_blocks, 484 inode->ei_inode.bi_sectors); 485 486 ret = bch2_setattr_nonsize(idmap, inode, iattr); 487 err: 488 bch2_pagecache_block_put(inode); 489 return bch2_err_class(ret); 490 } 491 492 /* fallocate: */ 493 494 static int inode_update_times_fn(struct btree_trans *trans, 495 struct bch_inode_info *inode, 496 struct bch_inode_unpacked *bi, void *p) 497 { 498 struct bch_fs *c = inode->v.i_sb->s_fs_info; 499 500 bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); 501 return 0; 502 } 503 504 static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) 505 { 506 struct bch_fs *c = inode->v.i_sb->s_fs_info; 507 u64 end = offset + len; 508 u64 block_start = round_up(offset, block_bytes(c)); 509 u64 block_end = round_down(end, block_bytes(c)); 510 bool truncated_last_page; 511 int ret = 0; 512 513 ret = bch2_truncate_folios(inode, offset, end); 514 if (unlikely(ret < 0)) 515 goto err; 516 517 truncated_last_page = ret; 518 519 truncate_pagecache_range(&inode->v, offset, end - 1); 520 521 if (block_start < block_end) { 522 s64 i_sectors_delta = 0; 523 524 ret = bch2_fpunch(c, inode_inum(inode), 525 block_start >> 9, block_end >> 9, 526 &i_sectors_delta); 527 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 528 } 529 530 mutex_lock(&inode->ei_update_lock); 531 if (end >= inode->v.i_size && !truncated_last_page) { 532 ret = bch2_write_inode_size(c, inode, inode->v.i_size, 533 ATTR_MTIME|ATTR_CTIME); 534 } else { 535 ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, 536 ATTR_MTIME|ATTR_CTIME); 537 } 538 mutex_unlock(&inode->ei_update_lock); 539 err: 540 return ret; 541 } 542 543 static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, 544 loff_t offset, loff_t len, 545 bool insert) 546 { 547 struct bch_fs *c = inode->v.i_sb->s_fs_info; 548 struct address_space *mapping = inode->v.i_mapping; 549 s64 i_sectors_delta = 0; 550 int ret = 0; 551 552 if ((offset | len) & (block_bytes(c) - 1)) 553 return -EINVAL; 554 555 if (insert) { 556 if (offset >= inode->v.i_size) 557 return -EINVAL; 558 } else { 559 if (offset + len >= inode->v.i_size) 560 return -EINVAL; 561 } 562 563 ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); 564 if (ret) 565 return ret; 566 567 if (insert) 568 i_size_write(&inode->v, inode->v.i_size + len); 569 570 ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9, 571 insert, &i_sectors_delta); 572 if (!ret && !insert) 573 i_size_write(&inode->v, inode->v.i_size - len); 574 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 575 576 return ret; 577 } 578 579 static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, 580 u64 start_sector, u64 end_sector) 581 { 582 struct bch_fs *c = inode->v.i_sb->s_fs_info; 583 struct btree_trans *trans = bch2_trans_get(c); 584 struct btree_iter iter; 585 struct bpos end_pos = POS(inode->v.i_ino, end_sector); 586 struct bch_io_opts opts; 587 int ret = 0; 588 589 bch2_inode_opts_get(&opts, c, &inode->ei_inode); 590 591 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 592 POS(inode->v.i_ino, start_sector), 593 BTREE_ITER_SLOTS|BTREE_ITER_INTENT); 594 595 while (!ret && bkey_lt(iter.pos, end_pos)) { 596 s64 i_sectors_delta = 0; 597 struct quota_res quota_res = { 0 }; 598 struct bkey_s_c k; 599 unsigned sectors; 600 bool is_allocation; 601 u64 hole_start, hole_end; 602 u32 snapshot; 603 604 bch2_trans_begin(trans); 605 606 ret = bch2_subvolume_get_snapshot(trans, 607 inode->ei_subvol, &snapshot); 608 if (ret) 609 goto bkey_err; 610 611 bch2_btree_iter_set_snapshot(&iter, snapshot); 612 613 k = bch2_btree_iter_peek_slot(&iter); 614 if ((ret = bkey_err(k))) 615 goto bkey_err; 616 617 hole_start = iter.pos.offset; 618 hole_end = bpos_min(k.k->p, end_pos).offset; 619 is_allocation = bkey_extent_is_allocation(k.k); 620 621 /* already reserved */ 622 if (bkey_extent_is_reservation(k) && 623 bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { 624 bch2_btree_iter_advance(&iter); 625 continue; 626 } 627 628 if (bkey_extent_is_data(k.k) && 629 !(mode & FALLOC_FL_ZERO_RANGE)) { 630 bch2_btree_iter_advance(&iter); 631 continue; 632 } 633 634 if (!(mode & FALLOC_FL_ZERO_RANGE)) { 635 /* 636 * Lock ordering - can't be holding btree locks while 637 * blocking on a folio lock: 638 */ 639 if (bch2_clamp_data_hole(&inode->v, 640 &hole_start, 641 &hole_end, 642 opts.data_replicas, true)) 643 ret = drop_locks_do(trans, 644 (bch2_clamp_data_hole(&inode->v, 645 &hole_start, 646 &hole_end, 647 opts.data_replicas, false), 0)); 648 bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); 649 650 if (ret) 651 goto bkey_err; 652 653 if (hole_start == hole_end) 654 continue; 655 } 656 657 sectors = hole_end - hole_start; 658 659 if (!is_allocation) { 660 ret = bch2_quota_reservation_add(c, inode, 661 "a_res, sectors, true); 662 if (unlikely(ret)) 663 goto bkey_err; 664 } 665 666 ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter, 667 sectors, opts, &i_sectors_delta, 668 writepoint_hashed((unsigned long) current)); 669 if (ret) 670 goto bkey_err; 671 672 bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); 673 674 drop_locks_do(trans, 675 (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); 676 bkey_err: 677 bch2_quota_reservation_put(c, inode, "a_res); 678 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 679 ret = 0; 680 } 681 682 if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { 683 struct quota_res quota_res = { 0 }; 684 s64 i_sectors_delta = 0; 685 686 bch2_fpunch_at(trans, &iter, inode_inum(inode), 687 end_sector, &i_sectors_delta); 688 bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); 689 bch2_quota_reservation_put(c, inode, "a_res); 690 } 691 692 bch2_trans_iter_exit(trans, &iter); 693 bch2_trans_put(trans); 694 return ret; 695 } 696 697 static long bchfs_fallocate(struct bch_inode_info *inode, int mode, 698 loff_t offset, loff_t len) 699 { 700 struct bch_fs *c = inode->v.i_sb->s_fs_info; 701 u64 end = offset + len; 702 u64 block_start = round_down(offset, block_bytes(c)); 703 u64 block_end = round_up(end, block_bytes(c)); 704 bool truncated_last_page = false; 705 int ret, ret2 = 0; 706 707 if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { 708 ret = inode_newsize_ok(&inode->v, end); 709 if (ret) 710 return ret; 711 } 712 713 if (mode & FALLOC_FL_ZERO_RANGE) { 714 ret = bch2_truncate_folios(inode, offset, end); 715 if (unlikely(ret < 0)) 716 return ret; 717 718 truncated_last_page = ret; 719 720 truncate_pagecache_range(&inode->v, offset, end - 1); 721 722 block_start = round_up(offset, block_bytes(c)); 723 block_end = round_down(end, block_bytes(c)); 724 } 725 726 ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); 727 728 /* 729 * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, 730 * so that the VFS cache i_size is consistent with the btree i_size: 731 */ 732 if (ret && 733 !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) 734 return ret; 735 736 if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) 737 end = inode->v.i_size; 738 739 if (end >= inode->v.i_size && 740 (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || 741 !(mode & FALLOC_FL_KEEP_SIZE))) { 742 spin_lock(&inode->v.i_lock); 743 i_size_write(&inode->v, end); 744 spin_unlock(&inode->v.i_lock); 745 746 mutex_lock(&inode->ei_update_lock); 747 ret2 = bch2_write_inode_size(c, inode, end, 0); 748 mutex_unlock(&inode->ei_update_lock); 749 } 750 751 return ret ?: ret2; 752 } 753 754 long bch2_fallocate_dispatch(struct file *file, int mode, 755 loff_t offset, loff_t len) 756 { 757 struct bch_inode_info *inode = file_bch_inode(file); 758 struct bch_fs *c = inode->v.i_sb->s_fs_info; 759 long ret; 760 761 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) 762 return -EROFS; 763 764 inode_lock(&inode->v); 765 inode_dio_wait(&inode->v); 766 bch2_pagecache_block_get(inode); 767 768 ret = file_modified(file); 769 if (ret) 770 goto err; 771 772 if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) 773 ret = bchfs_fallocate(inode, mode, offset, len); 774 else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) 775 ret = bchfs_fpunch(inode, offset, len); 776 else if (mode == FALLOC_FL_INSERT_RANGE) 777 ret = bchfs_fcollapse_finsert(inode, offset, len, true); 778 else if (mode == FALLOC_FL_COLLAPSE_RANGE) 779 ret = bchfs_fcollapse_finsert(inode, offset, len, false); 780 else 781 ret = -EOPNOTSUPP; 782 err: 783 bch2_pagecache_block_put(inode); 784 inode_unlock(&inode->v); 785 bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); 786 787 return bch2_err_class(ret); 788 } 789 790 /* 791 * Take a quota reservation for unallocated blocks in a given file range 792 * Does not check pagecache 793 */ 794 static int quota_reserve_range(struct bch_inode_info *inode, 795 struct quota_res *res, 796 u64 start, u64 end) 797 { 798 struct bch_fs *c = inode->v.i_sb->s_fs_info; 799 struct btree_trans *trans = bch2_trans_get(c); 800 struct btree_iter iter; 801 struct bkey_s_c k; 802 u32 snapshot; 803 u64 sectors = end - start; 804 u64 pos = start; 805 int ret; 806 retry: 807 bch2_trans_begin(trans); 808 809 ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); 810 if (ret) 811 goto err; 812 813 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 814 SPOS(inode->v.i_ino, pos, snapshot), 0); 815 816 while (!(ret = btree_trans_too_many_iters(trans)) && 817 (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && 818 !(ret = bkey_err(k))) { 819 if (bkey_extent_is_allocation(k.k)) { 820 u64 s = min(end, k.k->p.offset) - 821 max(start, bkey_start_offset(k.k)); 822 BUG_ON(s > sectors); 823 sectors -= s; 824 } 825 bch2_btree_iter_advance(&iter); 826 } 827 pos = iter.pos.offset; 828 bch2_trans_iter_exit(trans, &iter); 829 err: 830 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 831 goto retry; 832 833 bch2_trans_put(trans); 834 835 return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); 836 } 837 838 loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, 839 struct file *file_dst, loff_t pos_dst, 840 loff_t len, unsigned remap_flags) 841 { 842 struct bch_inode_info *src = file_bch_inode(file_src); 843 struct bch_inode_info *dst = file_bch_inode(file_dst); 844 struct bch_fs *c = src->v.i_sb->s_fs_info; 845 struct quota_res quota_res = { 0 }; 846 s64 i_sectors_delta = 0; 847 u64 aligned_len; 848 loff_t ret = 0; 849 850 if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) 851 return -EINVAL; 852 853 if (remap_flags & REMAP_FILE_DEDUP) 854 return -EOPNOTSUPP; 855 856 if ((pos_src & (block_bytes(c) - 1)) || 857 (pos_dst & (block_bytes(c) - 1))) 858 return -EINVAL; 859 860 if (src == dst && 861 abs(pos_src - pos_dst) < len) 862 return -EINVAL; 863 864 bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); 865 866 inode_dio_wait(&src->v); 867 inode_dio_wait(&dst->v); 868 869 ret = generic_remap_file_range_prep(file_src, pos_src, 870 file_dst, pos_dst, 871 &len, remap_flags); 872 if (ret < 0 || len == 0) 873 goto err; 874 875 aligned_len = round_up((u64) len, block_bytes(c)); 876 877 ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, 878 pos_dst, pos_dst + len - 1); 879 if (ret) 880 goto err; 881 882 ret = quota_reserve_range(dst, "a_res, pos_dst >> 9, 883 (pos_dst + aligned_len) >> 9); 884 if (ret) 885 goto err; 886 887 file_update_time(file_dst); 888 889 bch2_mark_pagecache_unallocated(src, pos_src >> 9, 890 (pos_src + aligned_len) >> 9); 891 892 ret = bch2_remap_range(c, 893 inode_inum(dst), pos_dst >> 9, 894 inode_inum(src), pos_src >> 9, 895 aligned_len >> 9, 896 pos_dst + len, &i_sectors_delta); 897 if (ret < 0) 898 goto err; 899 900 /* 901 * due to alignment, we might have remapped slightly more than requsted 902 */ 903 ret = min((u64) ret << 9, (u64) len); 904 905 bch2_i_sectors_acct(c, dst, "a_res, i_sectors_delta); 906 907 spin_lock(&dst->v.i_lock); 908 if (pos_dst + ret > dst->v.i_size) 909 i_size_write(&dst->v, pos_dst + ret); 910 spin_unlock(&dst->v.i_lock); 911 912 if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || 913 IS_SYNC(file_inode(file_dst))) 914 ret = bch2_flush_inode(c, dst); 915 err: 916 bch2_quota_reservation_put(c, dst, "a_res); 917 bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); 918 919 return bch2_err_class(ret); 920 } 921 922 /* fseek: */ 923 924 static loff_t bch2_seek_data(struct file *file, u64 offset) 925 { 926 struct bch_inode_info *inode = file_bch_inode(file); 927 struct bch_fs *c = inode->v.i_sb->s_fs_info; 928 struct btree_trans *trans; 929 struct btree_iter iter; 930 struct bkey_s_c k; 931 subvol_inum inum = inode_inum(inode); 932 u64 isize, next_data = MAX_LFS_FILESIZE; 933 u32 snapshot; 934 int ret; 935 936 isize = i_size_read(&inode->v); 937 if (offset >= isize) 938 return -ENXIO; 939 940 trans = bch2_trans_get(c); 941 retry: 942 bch2_trans_begin(trans); 943 944 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 945 if (ret) 946 goto err; 947 948 for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, 949 SPOS(inode->v.i_ino, offset >> 9, snapshot), 950 POS(inode->v.i_ino, U64_MAX), 951 0, k, ret) { 952 if (bkey_extent_is_data(k.k)) { 953 next_data = max(offset, bkey_start_offset(k.k) << 9); 954 break; 955 } else if (k.k->p.offset >> 9 > isize) 956 break; 957 } 958 bch2_trans_iter_exit(trans, &iter); 959 err: 960 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 961 goto retry; 962 963 bch2_trans_put(trans); 964 if (ret) 965 return ret; 966 967 if (next_data > offset) 968 next_data = bch2_seek_pagecache_data(&inode->v, 969 offset, next_data, 0, false); 970 971 if (next_data >= isize) 972 return -ENXIO; 973 974 return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); 975 } 976 977 static loff_t bch2_seek_hole(struct file *file, u64 offset) 978 { 979 struct bch_inode_info *inode = file_bch_inode(file); 980 struct bch_fs *c = inode->v.i_sb->s_fs_info; 981 struct btree_trans *trans; 982 struct btree_iter iter; 983 struct bkey_s_c k; 984 subvol_inum inum = inode_inum(inode); 985 u64 isize, next_hole = MAX_LFS_FILESIZE; 986 u32 snapshot; 987 int ret; 988 989 isize = i_size_read(&inode->v); 990 if (offset >= isize) 991 return -ENXIO; 992 993 trans = bch2_trans_get(c); 994 retry: 995 bch2_trans_begin(trans); 996 997 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 998 if (ret) 999 goto err; 1000 1001 for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, 1002 SPOS(inode->v.i_ino, offset >> 9, snapshot), 1003 BTREE_ITER_SLOTS, k, ret) { 1004 if (k.k->p.inode != inode->v.i_ino) { 1005 next_hole = bch2_seek_pagecache_hole(&inode->v, 1006 offset, MAX_LFS_FILESIZE, 0, false); 1007 break; 1008 } else if (!bkey_extent_is_data(k.k)) { 1009 next_hole = bch2_seek_pagecache_hole(&inode->v, 1010 max(offset, bkey_start_offset(k.k) << 9), 1011 k.k->p.offset << 9, 0, false); 1012 1013 if (next_hole < k.k->p.offset << 9) 1014 break; 1015 } else { 1016 offset = max(offset, bkey_start_offset(k.k) << 9); 1017 } 1018 } 1019 bch2_trans_iter_exit(trans, &iter); 1020 err: 1021 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1022 goto retry; 1023 1024 bch2_trans_put(trans); 1025 if (ret) 1026 return ret; 1027 1028 if (next_hole > isize) 1029 next_hole = isize; 1030 1031 return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); 1032 } 1033 1034 loff_t bch2_llseek(struct file *file, loff_t offset, int whence) 1035 { 1036 loff_t ret; 1037 1038 switch (whence) { 1039 case SEEK_SET: 1040 case SEEK_CUR: 1041 case SEEK_END: 1042 ret = generic_file_llseek(file, offset, whence); 1043 break; 1044 case SEEK_DATA: 1045 ret = bch2_seek_data(file, offset); 1046 break; 1047 case SEEK_HOLE: 1048 ret = bch2_seek_hole(file, offset); 1049 break; 1050 default: 1051 ret = -EINVAL; 1052 break; 1053 } 1054 1055 return bch2_err_class(ret); 1056 } 1057 1058 void bch2_fs_fsio_exit(struct bch_fs *c) 1059 { 1060 bioset_exit(&c->nocow_flush_bioset); 1061 } 1062 1063 int bch2_fs_fsio_init(struct bch_fs *c) 1064 { 1065 if (bioset_init(&c->nocow_flush_bioset, 1066 1, offsetof(struct nocow_flush, bio), 0)) 1067 return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; 1068 1069 return 0; 1070 } 1071 1072 #endif /* NO_BCACHEFS_FS */ 1073