1 // SPDX-License-Identifier: GPL-2.0 2 #ifndef NO_BCACHEFS_FS 3 4 #include "bcachefs.h" 5 #include "alloc_foreground.h" 6 #include "bkey_buf.h" 7 #include "btree_update.h" 8 #include "buckets.h" 9 #include "clock.h" 10 #include "error.h" 11 #include "extents.h" 12 #include "extent_update.h" 13 #include "fs.h" 14 #include "fs-io.h" 15 #include "fs-io-buffered.h" 16 #include "fs-io-pagecache.h" 17 #include "fsck.h" 18 #include "inode.h" 19 #include "journal.h" 20 #include "io_misc.h" 21 #include "keylist.h" 22 #include "quota.h" 23 #include "reflink.h" 24 #include "trace.h" 25 26 #include <linux/aio.h> 27 #include <linux/backing-dev.h> 28 #include <linux/falloc.h> 29 #include <linux/migrate.h> 30 #include <linux/mmu_context.h> 31 #include <linux/pagevec.h> 32 #include <linux/rmap.h> 33 #include <linux/sched/signal.h> 34 #include <linux/task_io_accounting_ops.h> 35 #include <linux/uio.h> 36 37 #include <trace/events/writeback.h> 38 39 struct nocow_flush { 40 struct closure *cl; 41 struct bch_dev *ca; 42 struct bio bio; 43 }; 44 45 static void nocow_flush_endio(struct bio *_bio) 46 { 47 48 struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); 49 50 closure_put(bio->cl); 51 percpu_ref_put(&bio->ca->io_ref[WRITE]); 52 bio_put(&bio->bio); 53 } 54 55 void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, 56 struct bch_inode_info *inode, 57 struct closure *cl) 58 { 59 struct nocow_flush *bio; 60 struct bch_dev *ca; 61 struct bch_devs_mask devs; 62 unsigned dev; 63 64 dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); 65 if (dev == BCH_SB_MEMBERS_MAX) 66 return; 67 68 devs = inode->ei_devs_need_flush; 69 memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); 70 71 for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { 72 rcu_read_lock(); 73 ca = rcu_dereference(c->devs[dev]); 74 if (ca && !percpu_ref_tryget(&ca->io_ref[WRITE])) 75 ca = NULL; 76 rcu_read_unlock(); 77 78 if (!ca) 79 continue; 80 81 bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, 82 REQ_OP_WRITE|REQ_PREFLUSH, 83 GFP_KERNEL, 84 &c->nocow_flush_bioset), 85 struct nocow_flush, bio); 86 bio->cl = cl; 87 bio->ca = ca; 88 bio->bio.bi_end_io = nocow_flush_endio; 89 closure_bio_submit(&bio->bio, cl); 90 } 91 } 92 93 static int bch2_inode_flush_nocow_writes(struct bch_fs *c, 94 struct bch_inode_info *inode) 95 { 96 struct closure cl; 97 98 closure_init_stack(&cl); 99 bch2_inode_flush_nocow_writes_async(c, inode, &cl); 100 closure_sync(&cl); 101 102 return 0; 103 } 104 105 /* i_size updates: */ 106 107 struct inode_new_size { 108 loff_t new_size; 109 u64 now; 110 unsigned fields; 111 }; 112 113 static int inode_set_size(struct btree_trans *trans, 114 struct bch_inode_info *inode, 115 struct bch_inode_unpacked *bi, 116 void *p) 117 { 118 struct inode_new_size *s = p; 119 120 bi->bi_size = s->new_size; 121 if (s->fields & ATTR_ATIME) 122 bi->bi_atime = s->now; 123 if (s->fields & ATTR_MTIME) 124 bi->bi_mtime = s->now; 125 if (s->fields & ATTR_CTIME) 126 bi->bi_ctime = s->now; 127 128 return 0; 129 } 130 131 int __must_check bch2_write_inode_size(struct bch_fs *c, 132 struct bch_inode_info *inode, 133 loff_t new_size, unsigned fields) 134 { 135 struct inode_new_size s = { 136 .new_size = new_size, 137 .now = bch2_current_time(c), 138 .fields = fields, 139 }; 140 141 return bch2_write_inode(c, inode, inode_set_size, &s, fields); 142 } 143 144 void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, 145 struct quota_res *quota_res, s64 sectors) 146 { 147 bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, 148 "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", 149 inode->v.i_ino, (u64) inode->v.i_blocks, sectors, 150 inode->ei_inode.bi_sectors); 151 inode->v.i_blocks += sectors; 152 153 #ifdef CONFIG_BCACHEFS_QUOTA 154 if (quota_res && 155 !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && 156 sectors > 0) { 157 BUG_ON(sectors > quota_res->sectors); 158 BUG_ON(sectors > inode->ei_quota_reserved); 159 160 quota_res->sectors -= sectors; 161 inode->ei_quota_reserved -= sectors; 162 } else { 163 bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); 164 } 165 #endif 166 } 167 168 /* fsync: */ 169 170 static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum, 171 u64 *seq) 172 { 173 struct printbuf buf = PRINTBUF; 174 struct bch_inode_unpacked u; 175 struct btree_iter iter; 176 int ret = bch2_inode_peek(trans, &iter, &u, inum, 0); 177 if (ret) 178 return ret; 179 180 u64 cur_seq = journal_cur_seq(&trans->c->journal); 181 *seq = min(cur_seq, u.bi_journal_seq); 182 183 if (fsck_err_on(u.bi_journal_seq > cur_seq, 184 trans, inode_journal_seq_in_future, 185 "inode journal seq in future (currently at %llu)\n%s", 186 cur_seq, 187 (bch2_inode_unpacked_to_text(&buf, &u), 188 buf.buf))) { 189 u.bi_journal_seq = cur_seq; 190 ret = bch2_inode_write(trans, &iter, &u); 191 } 192 fsck_err: 193 bch2_trans_iter_exit(trans, &iter); 194 printbuf_exit(&buf); 195 return ret; 196 } 197 198 /* 199 * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an 200 * insert trigger: look up the btree inode instead 201 */ 202 static int bch2_flush_inode(struct bch_fs *c, 203 struct bch_inode_info *inode) 204 { 205 if (c->opts.journal_flush_disabled) 206 return 0; 207 208 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) 209 return -EROFS; 210 211 u64 seq; 212 int ret = bch2_trans_commit_do(c, NULL, NULL, 0, 213 bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: 214 bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: 215 bch2_inode_flush_nocow_writes(c, inode); 216 bch2_write_ref_put(c, BCH_WRITE_REF_fsync); 217 return ret; 218 } 219 220 int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) 221 { 222 struct bch_inode_info *inode = file_bch_inode(file); 223 struct bch_fs *c = inode->v.i_sb->s_fs_info; 224 int ret, err; 225 226 trace_bch2_fsync(file, datasync); 227 228 ret = file_write_and_wait_range(file, start, end); 229 if (ret) 230 goto out; 231 ret = sync_inode_metadata(&inode->v, 1); 232 if (ret) 233 goto out; 234 ret = bch2_flush_inode(c, inode); 235 out: 236 ret = bch2_err_class(ret); 237 if (ret == -EROFS) 238 ret = -EIO; 239 240 err = file_check_and_advance_wb_err(file); 241 if (!ret) 242 ret = err; 243 244 return ret; 245 } 246 247 /* truncate: */ 248 249 static inline int range_has_data(struct bch_fs *c, u32 subvol, 250 struct bpos start, 251 struct bpos end) 252 { 253 return bch2_trans_run(c, 254 for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end, 255 subvol, 0, k, ({ 256 bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); 257 }))); 258 } 259 260 static int __bch2_truncate_folio(struct bch_inode_info *inode, 261 pgoff_t index, loff_t start, loff_t end) 262 { 263 struct bch_fs *c = inode->v.i_sb->s_fs_info; 264 struct address_space *mapping = inode->v.i_mapping; 265 struct bch_folio *s; 266 unsigned start_offset; 267 unsigned end_offset; 268 unsigned i; 269 struct folio *folio; 270 s64 i_sectors_delta = 0; 271 int ret = 0; 272 u64 end_pos; 273 274 folio = filemap_lock_folio(mapping, index); 275 if (IS_ERR_OR_NULL(folio)) { 276 /* 277 * XXX: we're doing two index lookups when we end up reading the 278 * folio 279 */ 280 ret = range_has_data(c, inode->ei_inum.subvol, 281 POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), 282 POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); 283 if (ret <= 0) 284 return ret; 285 286 folio = __filemap_get_folio(mapping, index, 287 FGP_LOCK|FGP_CREAT, GFP_KERNEL); 288 if (IS_ERR(folio)) { 289 ret = -ENOMEM; 290 goto out; 291 } 292 } 293 294 BUG_ON(start >= folio_end_pos(folio)); 295 BUG_ON(end <= folio_pos(folio)); 296 297 start_offset = max(start, folio_pos(folio)) - folio_pos(folio); 298 end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio); 299 300 /* Folio boundary? Nothing to do */ 301 if (start_offset == 0 && 302 end_offset == folio_size(folio)) { 303 ret = 0; 304 goto unlock; 305 } 306 307 s = bch2_folio_create(folio, 0); 308 if (!s) { 309 ret = -ENOMEM; 310 goto unlock; 311 } 312 313 if (!folio_test_uptodate(folio)) { 314 ret = bch2_read_single_folio(folio, mapping); 315 if (ret) 316 goto unlock; 317 } 318 319 ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); 320 if (ret) 321 goto unlock; 322 323 for (i = round_up(start_offset, block_bytes(c)) >> 9; 324 i < round_down(end_offset, block_bytes(c)) >> 9; 325 i++) { 326 s->s[i].nr_replicas = 0; 327 328 i_sectors_delta -= s->s[i].state == SECTOR_dirty; 329 bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); 330 } 331 332 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 333 334 /* 335 * Caller needs to know whether this folio will be written out by 336 * writeback - doing an i_size update if necessary - or whether it will 337 * be responsible for the i_size update. 338 * 339 * Note that we shouldn't ever see a folio beyond EOF, but check and 340 * warn if so. This has been observed by failure to clean up folios 341 * after a short write and there's still a chance reclaim will fix 342 * things up. 343 */ 344 WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size); 345 end_pos = folio_end_pos(folio); 346 if (inode->v.i_size > folio_pos(folio)) 347 end_pos = min_t(u64, inode->v.i_size, end_pos); 348 ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty; 349 350 folio_zero_segment(folio, start_offset, end_offset); 351 352 /* 353 * Bit of a hack - we don't want truncate to fail due to -ENOSPC. 354 * 355 * XXX: because we aren't currently tracking whether the folio has actual 356 * data in it (vs. just 0s, or only partially written) this wrong. ick. 357 */ 358 BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false)); 359 360 /* 361 * This removes any writeable userspace mappings; we need to force 362 * .page_mkwrite to be called again before any mmapped writes, to 363 * redirty the full page: 364 */ 365 folio_mkclean(folio); 366 filemap_dirty_folio(mapping, folio); 367 unlock: 368 folio_unlock(folio); 369 folio_put(folio); 370 out: 371 return ret; 372 } 373 374 static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from) 375 { 376 return __bch2_truncate_folio(inode, from >> PAGE_SHIFT, 377 from, ANYSINT_MAX(loff_t)); 378 } 379 380 static int bch2_truncate_folios(struct bch_inode_info *inode, 381 loff_t start, loff_t end) 382 { 383 int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT, 384 start, end); 385 386 if (ret >= 0 && 387 start >> PAGE_SHIFT != end >> PAGE_SHIFT) 388 ret = __bch2_truncate_folio(inode, 389 (end - 1) >> PAGE_SHIFT, 390 start, end); 391 return ret; 392 } 393 394 static int bch2_extend(struct mnt_idmap *idmap, 395 struct bch_inode_info *inode, 396 struct bch_inode_unpacked *inode_u, 397 struct iattr *iattr) 398 { 399 struct address_space *mapping = inode->v.i_mapping; 400 int ret; 401 402 /* 403 * sync appends: 404 * 405 * this has to be done _before_ extending i_size: 406 */ 407 ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); 408 if (ret) 409 return ret; 410 411 truncate_setsize(&inode->v, iattr->ia_size); 412 413 return bch2_setattr_nonsize(idmap, inode, iattr); 414 } 415 416 int bchfs_truncate(struct mnt_idmap *idmap, 417 struct bch_inode_info *inode, struct iattr *iattr) 418 { 419 struct bch_fs *c = inode->v.i_sb->s_fs_info; 420 struct address_space *mapping = inode->v.i_mapping; 421 struct bch_inode_unpacked inode_u; 422 s64 i_sectors_delta = 0; 423 int ret = 0; 424 425 /* 426 * If the truncate call with change the size of the file, the 427 * cmtimes should be updated. If the size will not change, we 428 * do not need to update the cmtimes. 429 */ 430 if (iattr->ia_size != inode->v.i_size) { 431 if (!(iattr->ia_valid & ATTR_MTIME)) 432 ktime_get_coarse_real_ts64(&iattr->ia_mtime); 433 if (!(iattr->ia_valid & ATTR_CTIME)) 434 ktime_get_coarse_real_ts64(&iattr->ia_ctime); 435 iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; 436 } 437 438 inode_dio_wait(&inode->v); 439 bch2_pagecache_block_get(inode); 440 441 ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); 442 if (ret) 443 goto err; 444 445 /* 446 * check this before next assertion; on filesystem error our normal 447 * invariants are a bit broken (truncate has to truncate the page cache 448 * before the inode). 449 */ 450 ret = bch2_journal_error(&c->journal); 451 if (ret) 452 goto err; 453 454 WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && 455 inode->v.i_size < inode_u.bi_size, 456 "truncate spotted in mem i_size < btree i_size: %llu < %llu\n", 457 (u64) inode->v.i_size, inode_u.bi_size); 458 459 if (iattr->ia_size > inode->v.i_size) { 460 ret = bch2_extend(idmap, inode, &inode_u, iattr); 461 goto err; 462 } 463 464 iattr->ia_valid &= ~ATTR_SIZE; 465 466 ret = bch2_truncate_folio(inode, iattr->ia_size); 467 if (unlikely(ret < 0)) 468 goto err; 469 ret = 0; 470 471 truncate_setsize(&inode->v, iattr->ia_size); 472 473 /* 474 * When extending, we're going to write the new i_size to disk 475 * immediately so we need to flush anything above the current on disk 476 * i_size first: 477 * 478 * Also, when extending we need to flush the page that i_size currently 479 * straddles - if it's mapped to userspace, we need to ensure that 480 * userspace has to redirty it and call .mkwrite -> set_page_dirty 481 * again to allocate the part of the page that was extended. 482 */ 483 if (iattr->ia_size > inode_u.bi_size) 484 ret = filemap_write_and_wait_range(mapping, 485 inode_u.bi_size, 486 iattr->ia_size - 1); 487 else if (iattr->ia_size & (PAGE_SIZE - 1)) 488 ret = filemap_write_and_wait_range(mapping, 489 round_down(iattr->ia_size, PAGE_SIZE), 490 iattr->ia_size - 1); 491 if (ret) 492 goto err; 493 494 ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta); 495 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 496 497 if (unlikely(ret)) { 498 /* 499 * If we error here, VFS caches are now inconsistent with btree 500 */ 501 set_bit(EI_INODE_ERROR, &inode->ei_flags); 502 goto err; 503 } 504 505 bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && 506 !bch2_journal_error(&c->journal), c, 507 "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", 508 inode->v.i_ino, (u64) inode->v.i_blocks, 509 inode->ei_inode.bi_sectors); 510 511 ret = bch2_setattr_nonsize(idmap, inode, iattr); 512 err: 513 bch2_pagecache_block_put(inode); 514 return bch2_err_class(ret); 515 } 516 517 /* fallocate: */ 518 519 static int inode_update_times_fn(struct btree_trans *trans, 520 struct bch_inode_info *inode, 521 struct bch_inode_unpacked *bi, void *p) 522 { 523 struct bch_fs *c = inode->v.i_sb->s_fs_info; 524 525 bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); 526 return 0; 527 } 528 529 static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) 530 { 531 struct bch_fs *c = inode->v.i_sb->s_fs_info; 532 u64 end = offset + len; 533 u64 block_start = round_up(offset, block_bytes(c)); 534 u64 block_end = round_down(end, block_bytes(c)); 535 bool truncated_last_page; 536 int ret = 0; 537 538 ret = bch2_truncate_folios(inode, offset, end); 539 if (unlikely(ret < 0)) 540 goto err; 541 542 truncated_last_page = ret; 543 544 truncate_pagecache_range(&inode->v, offset, end - 1); 545 546 if (block_start < block_end) { 547 s64 i_sectors_delta = 0; 548 549 ret = bch2_fpunch(c, inode_inum(inode), 550 block_start >> 9, block_end >> 9, 551 &i_sectors_delta); 552 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 553 } 554 555 mutex_lock(&inode->ei_update_lock); 556 if (end >= inode->v.i_size && !truncated_last_page) { 557 ret = bch2_write_inode_size(c, inode, inode->v.i_size, 558 ATTR_MTIME|ATTR_CTIME); 559 } else { 560 ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, 561 ATTR_MTIME|ATTR_CTIME); 562 } 563 mutex_unlock(&inode->ei_update_lock); 564 err: 565 return ret; 566 } 567 568 static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode, 569 loff_t offset, loff_t len, 570 bool insert) 571 { 572 struct bch_fs *c = inode->v.i_sb->s_fs_info; 573 struct address_space *mapping = inode->v.i_mapping; 574 s64 i_sectors_delta = 0; 575 int ret = 0; 576 577 if ((offset | len) & (block_bytes(c) - 1)) 578 return -EINVAL; 579 580 if (insert) { 581 if (offset >= inode->v.i_size) 582 return -EINVAL; 583 } else { 584 if (offset + len >= inode->v.i_size) 585 return -EINVAL; 586 } 587 588 ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); 589 if (ret) 590 return ret; 591 592 if (insert) 593 i_size_write(&inode->v, inode->v.i_size + len); 594 595 ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9, 596 insert, &i_sectors_delta); 597 if (!ret && !insert) 598 i_size_write(&inode->v, inode->v.i_size - len); 599 bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 600 601 return ret; 602 } 603 604 static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, 605 u64 start_sector, u64 end_sector) 606 { 607 struct bch_fs *c = inode->v.i_sb->s_fs_info; 608 struct btree_trans *trans = bch2_trans_get(c); 609 struct btree_iter iter; 610 struct bpos end_pos = POS(inode->v.i_ino, end_sector); 611 struct bch_io_opts opts; 612 int ret = 0; 613 614 bch2_inode_opts_get(&opts, c, &inode->ei_inode); 615 616 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 617 POS(inode->v.i_ino, start_sector), 618 BTREE_ITER_slots|BTREE_ITER_intent); 619 620 while (!ret) { 621 s64 i_sectors_delta = 0; 622 struct quota_res quota_res = { 0 }; 623 struct bkey_s_c k; 624 unsigned sectors; 625 bool is_allocation; 626 u64 hole_start, hole_end; 627 u32 snapshot; 628 629 bch2_trans_begin(trans); 630 631 if (bkey_ge(iter.pos, end_pos)) 632 break; 633 634 ret = bch2_subvolume_get_snapshot(trans, 635 inode->ei_inum.subvol, &snapshot); 636 if (ret) 637 goto bkey_err; 638 639 bch2_btree_iter_set_snapshot(trans, &iter, snapshot); 640 641 k = bch2_btree_iter_peek_slot(trans, &iter); 642 if ((ret = bkey_err(k))) 643 goto bkey_err; 644 645 hole_start = iter.pos.offset; 646 hole_end = bpos_min(k.k->p, end_pos).offset; 647 is_allocation = bkey_extent_is_allocation(k.k); 648 649 /* already reserved */ 650 if (bkey_extent_is_reservation(k) && 651 bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { 652 bch2_btree_iter_advance(trans, &iter); 653 continue; 654 } 655 656 if (bkey_extent_is_data(k.k) && 657 !(mode & FALLOC_FL_ZERO_RANGE)) { 658 bch2_btree_iter_advance(trans, &iter); 659 continue; 660 } 661 662 if (!(mode & FALLOC_FL_ZERO_RANGE)) { 663 /* 664 * Lock ordering - can't be holding btree locks while 665 * blocking on a folio lock: 666 */ 667 if (bch2_clamp_data_hole(&inode->v, 668 &hole_start, 669 &hole_end, 670 opts.data_replicas, true)) { 671 ret = drop_locks_do(trans, 672 (bch2_clamp_data_hole(&inode->v, 673 &hole_start, 674 &hole_end, 675 opts.data_replicas, false), 0)); 676 if (ret) 677 goto bkey_err; 678 } 679 bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start)); 680 681 if (ret) 682 goto bkey_err; 683 684 if (hole_start == hole_end) 685 continue; 686 } 687 688 sectors = hole_end - hole_start; 689 690 if (!is_allocation) { 691 ret = bch2_quota_reservation_add(c, inode, 692 "a_res, sectors, true); 693 if (unlikely(ret)) 694 goto bkey_err; 695 } 696 697 ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter, 698 sectors, opts, &i_sectors_delta, 699 writepoint_hashed((unsigned long) current)); 700 if (ret) 701 goto bkey_err; 702 703 bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); 704 705 if (bch2_mark_pagecache_reserved(inode, &hole_start, 706 iter.pos.offset, true)) { 707 ret = drop_locks_do(trans, 708 bch2_mark_pagecache_reserved(inode, &hole_start, 709 iter.pos.offset, false)); 710 if (ret) 711 goto bkey_err; 712 } 713 bkey_err: 714 bch2_quota_reservation_put(c, inode, "a_res); 715 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 716 ret = 0; 717 } 718 719 if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { 720 struct quota_res quota_res = { 0 }; 721 s64 i_sectors_delta = 0; 722 723 bch2_fpunch_at(trans, &iter, inode_inum(inode), 724 end_sector, &i_sectors_delta); 725 bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); 726 bch2_quota_reservation_put(c, inode, "a_res); 727 } 728 729 bch2_trans_iter_exit(trans, &iter); 730 bch2_trans_put(trans); 731 return ret; 732 } 733 734 static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode, 735 loff_t offset, loff_t len) 736 { 737 struct bch_fs *c = inode->v.i_sb->s_fs_info; 738 u64 end = offset + len; 739 u64 block_start = round_down(offset, block_bytes(c)); 740 u64 block_end = round_up(end, block_bytes(c)); 741 bool truncated_last_page = false; 742 int ret, ret2 = 0; 743 744 if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { 745 ret = inode_newsize_ok(&inode->v, end); 746 if (ret) 747 return ret; 748 } 749 750 if (mode & FALLOC_FL_ZERO_RANGE) { 751 ret = bch2_truncate_folios(inode, offset, end); 752 if (unlikely(ret < 0)) 753 return ret; 754 755 truncated_last_page = ret; 756 757 truncate_pagecache_range(&inode->v, offset, end - 1); 758 759 block_start = round_up(offset, block_bytes(c)); 760 block_end = round_down(end, block_bytes(c)); 761 } 762 763 ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); 764 765 /* 766 * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, 767 * so that the VFS cache i_size is consistent with the btree i_size: 768 */ 769 if (ret && 770 !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) 771 return ret; 772 773 if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) 774 end = inode->v.i_size; 775 776 if (end >= inode->v.i_size && 777 (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || 778 !(mode & FALLOC_FL_KEEP_SIZE))) { 779 spin_lock(&inode->v.i_lock); 780 i_size_write(&inode->v, end); 781 spin_unlock(&inode->v.i_lock); 782 783 mutex_lock(&inode->ei_update_lock); 784 ret2 = bch2_write_inode_size(c, inode, end, 0); 785 mutex_unlock(&inode->ei_update_lock); 786 } 787 788 return ret ?: ret2; 789 } 790 791 long bch2_fallocate_dispatch(struct file *file, int mode, 792 loff_t offset, loff_t len) 793 { 794 struct bch_inode_info *inode = file_bch_inode(file); 795 struct bch_fs *c = inode->v.i_sb->s_fs_info; 796 long ret; 797 798 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) 799 return -EROFS; 800 801 inode_lock(&inode->v); 802 inode_dio_wait(&inode->v); 803 bch2_pagecache_block_get(inode); 804 805 ret = file_modified(file); 806 if (ret) 807 goto err; 808 809 if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) 810 ret = bchfs_fallocate(inode, mode, offset, len); 811 else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) 812 ret = bchfs_fpunch(inode, offset, len); 813 else if (mode == FALLOC_FL_INSERT_RANGE) 814 ret = bchfs_fcollapse_finsert(inode, offset, len, true); 815 else if (mode == FALLOC_FL_COLLAPSE_RANGE) 816 ret = bchfs_fcollapse_finsert(inode, offset, len, false); 817 else 818 ret = -EOPNOTSUPP; 819 err: 820 bch2_pagecache_block_put(inode); 821 inode_unlock(&inode->v); 822 bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); 823 824 return bch2_err_class(ret); 825 } 826 827 /* 828 * Take a quota reservation for unallocated blocks in a given file range 829 * Does not check pagecache 830 */ 831 static int quota_reserve_range(struct bch_inode_info *inode, 832 struct quota_res *res, 833 u64 start, u64 end) 834 { 835 struct bch_fs *c = inode->v.i_sb->s_fs_info; 836 u64 sectors = end - start; 837 838 int ret = bch2_trans_run(c, 839 for_each_btree_key_in_subvolume_max(trans, iter, 840 BTREE_ID_extents, 841 POS(inode->v.i_ino, start), 842 POS(inode->v.i_ino, end - 1), 843 inode->ei_inum.subvol, 0, k, ({ 844 if (bkey_extent_is_allocation(k.k)) { 845 u64 s = min(end, k.k->p.offset) - 846 max(start, bkey_start_offset(k.k)); 847 BUG_ON(s > sectors); 848 sectors -= s; 849 } 850 851 0; 852 }))); 853 854 return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); 855 } 856 857 loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, 858 struct file *file_dst, loff_t pos_dst, 859 loff_t len, unsigned remap_flags) 860 { 861 struct bch_inode_info *src = file_bch_inode(file_src); 862 struct bch_inode_info *dst = file_bch_inode(file_dst); 863 struct bch_fs *c = src->v.i_sb->s_fs_info; 864 struct quota_res quota_res = { 0 }; 865 s64 i_sectors_delta = 0; 866 u64 aligned_len; 867 loff_t ret = 0; 868 869 if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) 870 return -EINVAL; 871 872 if ((pos_src & (block_bytes(c) - 1)) || 873 (pos_dst & (block_bytes(c) - 1))) 874 return -EINVAL; 875 876 if (src == dst && 877 abs(pos_src - pos_dst) < len) 878 return -EINVAL; 879 880 lock_two_nondirectories(&src->v, &dst->v); 881 bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst); 882 883 inode_dio_wait(&src->v); 884 inode_dio_wait(&dst->v); 885 886 ret = generic_remap_file_range_prep(file_src, pos_src, 887 file_dst, pos_dst, 888 &len, remap_flags); 889 if (ret < 0 || len == 0) 890 goto err; 891 892 aligned_len = round_up((u64) len, block_bytes(c)); 893 894 ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, 895 pos_dst, pos_dst + len - 1); 896 if (ret) 897 goto err; 898 899 ret = quota_reserve_range(dst, "a_res, pos_dst >> 9, 900 (pos_dst + aligned_len) >> 9); 901 if (ret) 902 goto err; 903 904 if (!(remap_flags & REMAP_FILE_DEDUP)) 905 file_update_time(file_dst); 906 907 bch2_mark_pagecache_unallocated(src, pos_src >> 9, 908 (pos_src + aligned_len) >> 9); 909 910 /* 911 * XXX: we'd like to be telling bch2_remap_range() if we have 912 * permission to write to the source file, and thus if io path option 913 * changes should be propagated through the copy, but we need mnt_idmap 914 * from the pathwalk, awkward 915 */ 916 ret = bch2_remap_range(c, 917 inode_inum(dst), pos_dst >> 9, 918 inode_inum(src), pos_src >> 9, 919 aligned_len >> 9, 920 pos_dst + len, &i_sectors_delta, 921 false); 922 if (ret < 0) 923 goto err; 924 925 /* 926 * due to alignment, we might have remapped slightly more than requsted 927 */ 928 ret = min((u64) ret << 9, (u64) len); 929 930 bch2_i_sectors_acct(c, dst, "a_res, i_sectors_delta); 931 932 spin_lock(&dst->v.i_lock); 933 if (pos_dst + ret > dst->v.i_size) 934 i_size_write(&dst->v, pos_dst + ret); 935 spin_unlock(&dst->v.i_lock); 936 937 if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || 938 IS_SYNC(file_inode(file_dst))) 939 ret = bch2_flush_inode(c, dst); 940 err: 941 bch2_quota_reservation_put(c, dst, "a_res); 942 bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst); 943 unlock_two_nondirectories(&src->v, &dst->v); 944 945 return bch2_err_class(ret); 946 } 947 948 /* fseek: */ 949 950 static loff_t bch2_seek_data(struct file *file, u64 offset) 951 { 952 struct bch_inode_info *inode = file_bch_inode(file); 953 struct bch_fs *c = inode->v.i_sb->s_fs_info; 954 subvol_inum inum = inode_inum(inode); 955 u64 isize, next_data = MAX_LFS_FILESIZE; 956 957 isize = i_size_read(&inode->v); 958 if (offset >= isize) 959 return -ENXIO; 960 961 int ret = bch2_trans_run(c, 962 for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, 963 POS(inode->v.i_ino, offset >> 9), 964 POS(inode->v.i_ino, U64_MAX), 965 inum.subvol, 0, k, ({ 966 if (bkey_extent_is_data(k.k)) { 967 next_data = max(offset, bkey_start_offset(k.k) << 9); 968 break; 969 } else if (k.k->p.offset >> 9 > isize) 970 break; 971 0; 972 }))); 973 if (ret) 974 return ret; 975 976 if (next_data > offset) 977 next_data = bch2_seek_pagecache_data(&inode->v, 978 offset, next_data, 0, false); 979 980 if (next_data >= isize) 981 return -ENXIO; 982 983 return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); 984 } 985 986 static loff_t bch2_seek_hole(struct file *file, u64 offset) 987 { 988 struct bch_inode_info *inode = file_bch_inode(file); 989 struct bch_fs *c = inode->v.i_sb->s_fs_info; 990 subvol_inum inum = inode_inum(inode); 991 u64 isize, next_hole = MAX_LFS_FILESIZE; 992 993 isize = i_size_read(&inode->v); 994 if (offset >= isize) 995 return -ENXIO; 996 997 int ret = bch2_trans_run(c, 998 for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, 999 POS(inode->v.i_ino, offset >> 9), 1000 POS(inode->v.i_ino, U64_MAX), 1001 inum.subvol, BTREE_ITER_slots, k, ({ 1002 if (k.k->p.inode != inode->v.i_ino || 1003 !bkey_extent_is_data(k.k)) { 1004 loff_t start_offset = k.k->p.inode == inode->v.i_ino 1005 ? max(offset, bkey_start_offset(k.k) << 9) 1006 : offset; 1007 loff_t end_offset = k.k->p.inode == inode->v.i_ino 1008 ? MAX_LFS_FILESIZE 1009 : k.k->p.offset << 9; 1010 1011 /* 1012 * Found a hole in the btree, now make sure it's 1013 * a hole in the pagecache. We might have to 1014 * keep searching if this hole is entirely dirty 1015 * in the page cache: 1016 */ 1017 bch2_trans_unlock(trans); 1018 loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v, 1019 start_offset, end_offset, 0, false); 1020 if (pagecache_hole < end_offset) { 1021 next_hole = pagecache_hole; 1022 break; 1023 } 1024 } else { 1025 offset = max(offset, bkey_start_offset(k.k) << 9); 1026 } 1027 0; 1028 }))); 1029 if (ret) 1030 return ret; 1031 1032 if (next_hole > isize) 1033 next_hole = isize; 1034 1035 return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); 1036 } 1037 1038 loff_t bch2_llseek(struct file *file, loff_t offset, int whence) 1039 { 1040 loff_t ret; 1041 1042 switch (whence) { 1043 case SEEK_SET: 1044 case SEEK_CUR: 1045 case SEEK_END: 1046 ret = generic_file_llseek(file, offset, whence); 1047 break; 1048 case SEEK_DATA: 1049 ret = bch2_seek_data(file, offset); 1050 break; 1051 case SEEK_HOLE: 1052 ret = bch2_seek_hole(file, offset); 1053 break; 1054 default: 1055 ret = -EINVAL; 1056 break; 1057 } 1058 1059 return bch2_err_class(ret); 1060 } 1061 1062 void bch2_fs_fsio_exit(struct bch_fs *c) 1063 { 1064 bioset_exit(&c->nocow_flush_bioset); 1065 } 1066 1067 int bch2_fs_fsio_init(struct bch_fs *c) 1068 { 1069 if (bioset_init(&c->nocow_flush_bioset, 1070 1, offsetof(struct nocow_flush, bio), 0)) 1071 return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; 1072 1073 return 0; 1074 } 1075 1076 #endif /* NO_BCACHEFS_FS */ 1077