// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
#include "fs.h"
#include "fs-io.h"
#include "fs-io-buffered.h"
#include "fs-io-pagecache.h"
#include "fsck.h"
#include "inode.h"
#include "journal.h"
#include "io_misc.h"
#include "keylist.h"
#include "quota.h"
#include "reflink.h"
#include "trace.h"

#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/pagevec.h>
#include <linux/rmap.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uio.h>

#include <trace/events/writeback.h>

struct nocow_flush {
	struct closure	*cl;
	struct bch_dev	*ca;
	struct bio	bio;
};

static void nocow_flush_endio(struct bio *_bio)
{
	struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);

	closure_put(bio->cl);
	enumerated_ref_put(&bio->ca->io_ref[WRITE],
			   BCH_DEV_WRITE_REF_nocow_flush);
	bio_put(&bio->bio);
}

void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
					 struct bch_inode_info *inode,
					 struct closure *cl)
{
	struct nocow_flush *bio;
	struct bch_dev *ca;
	struct bch_devs_mask devs;
	unsigned dev;

	dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
	if (dev == BCH_SB_MEMBERS_MAX)
		return;

	devs = inode->ei_devs_need_flush;
	memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));

	for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
		scoped_guard(rcu) {
			ca = rcu_dereference(c->devs[dev]);
			if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE],
						BCH_DEV_WRITE_REF_nocow_flush))
				ca = NULL;
		}

		if (!ca)
			continue;

		bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
						    REQ_OP_WRITE|REQ_PREFLUSH,
						    GFP_KERNEL,
						    &c->nocow_flush_bioset),
				   struct nocow_flush, bio);
		bio->cl			= cl;
		bio->ca			= ca;
		bio->bio.bi_end_io	= nocow_flush_endio;
		closure_bio_submit(&bio->bio, cl);
	}
}

static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
					 struct bch_inode_info *inode)
{
	struct closure cl;

	closure_init_stack(&cl);
	bch2_inode_flush_nocow_writes_async(c, inode, &cl);
	closure_sync(&cl);

	return 0;
}

/* i_size updates: */

struct inode_new_size {
	loff_t		new_size;
	u64		now;
	unsigned	fields;
};

static int inode_set_size(struct btree_trans *trans,
			  struct bch_inode_info *inode,
			  struct bch_inode_unpacked *bi,
			  void *p)
{
	struct inode_new_size *s = p;

	bi->bi_size = s->new_size;
	if (s->fields & ATTR_ATIME)
		bi->bi_atime = s->now;
	if (s->fields & ATTR_MTIME)
		bi->bi_mtime = s->now;
	if (s->fields & ATTR_CTIME)
		bi->bi_ctime = s->now;

	return 0;
}

int __must_check bch2_write_inode_size(struct bch_fs *c,
				       struct bch_inode_info *inode,
				       loff_t new_size, unsigned fields)
{
	struct inode_new_size s = {
		.new_size	= new_size,
		.now		= bch2_current_time(c),
		.fields		= fields,
	};

	return bch2_write_inode(c, inode, inode_set_size, &s, fields);
}

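/*
 * Update the VFS inode's in-memory block count (i_blocks); with quota
 * enabled, consume a matching quota reservation if one was passed in,
 * otherwise account the sectors directly against the quota:
 */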
void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
			   struct quota_res *quota_res, s64 sectors)
{
	if (unlikely((s64) inode->v.i_blocks + sectors < 0)) {
		struct printbuf buf = PRINTBUF;
		bch2_log_msg_start(c, &buf);
		prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
			   inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
			   inode->ei_inode.bi_sectors);

		bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf);
		if (print)
			bch2_print_str(c, KERN_ERR, buf.buf);
		printbuf_exit(&buf);

		if (sectors < 0)
			sectors = -inode->v.i_blocks;
		else
			sectors = 0;
	}

	inode->v.i_blocks += sectors;

#ifdef CONFIG_BCACHEFS_QUOTA
	if (quota_res &&
	    !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
	    sectors > 0) {
		BUG_ON(sectors > quota_res->sectors);
		BUG_ON(sectors > inode->ei_quota_reserved);

		quota_res->sectors -= sectors;
		inode->ei_quota_reserved -= sectors;
	} else {
		bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
	}
#endif
}

/* fsync: */

static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum,
					    u64 *seq)
{
	struct printbuf buf = PRINTBUF;
	struct bch_inode_unpacked u;
	struct btree_iter iter;
	int ret = bch2_inode_peek(trans, &iter, &u, inum, 0);
	if (ret)
		return ret;

	u64 cur_seq = journal_cur_seq(&trans->c->journal);
	*seq = min(cur_seq, u.bi_journal_seq);

	if (fsck_err_on(u.bi_journal_seq > cur_seq,
			trans, inode_journal_seq_in_future,
			"inode journal seq in future (currently at %llu)\n%s",
			cur_seq,
			(bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		u.bi_journal_seq = cur_seq;
		ret = bch2_inode_write(trans, &iter, &u);
	}
fsck_err:
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
	return ret;
}

/*
 * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
 * insert trigger: look up the btree inode instead
 */
static int bch2_flush_inode(struct bch_fs *c,
			    struct bch_inode_info *inode)
{
	if (c->opts.journal_flush_disabled)
		return 0;

	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync))
		return -EROFS;

	u64 seq;
	int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
			bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?:
		  bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?:
		  bch2_inode_flush_nocow_writes(c, inode);
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync);
	return ret;
}

int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	int ret, err;

	trace_bch2_fsync(file, datasync);

	ret = file_write_and_wait_range(file, start, end);
	if (ret)
		goto out;
	ret = sync_inode_metadata(&inode->v, 1);
	if (ret)
		goto out;
	ret = bch2_flush_inode(c, inode);
out:
	ret = bch2_err_class(ret);
	if (ret == -EROFS)
		ret = -EIO;

	err = file_check_and_advance_wb_err(file);
	if (!ret)
		ret = err;

	return ret;
}

/* truncate: */

static inline int range_has_data(struct bch_fs *c, u32 subvol,
				 struct bpos start,
				 struct bpos end)
{
	return bch2_trans_run(c,
		for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end,
						    subvol, 0, k, ({
			bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
	})));
}

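/*
 * Zero out the part of a folio covering [start, end) and mark the covered
 * blocks unallocated, adjusting i_blocks. Returns > 0 if the folio straddling
 * i_size will still be written out by writeback (which then handles the
 * i_size update), 0 if the caller is responsible for the i_size update, or a
 * negative error code:
 */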
static int __bch2_truncate_folio(struct bch_inode_info *inode,
				 pgoff_t index, loff_t start, loff_t end)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct address_space *mapping = inode->v.i_mapping;
	struct bch_folio *s;
	unsigned start_offset;
	unsigned end_offset;
	unsigned i;
	struct folio *folio;
	s64 i_sectors_delta = 0;
	int ret = 0;
	u64 end_pos;

	folio = filemap_lock_folio(mapping, index);
	if (IS_ERR_OR_NULL(folio)) {
		/*
		 * XXX: we're doing two index lookups when we end up reading the
		 * folio
		 */
		ret = range_has_data(c, inode->ei_inum.subvol,
				     POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
				     POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
		if (ret <= 0)
			return ret;

		folio = __filemap_get_folio(mapping, index,
					    FGP_LOCK|FGP_CREAT, GFP_KERNEL);
		if (IS_ERR(folio)) {
			ret = -ENOMEM;
			goto out;
		}
	}

	BUG_ON(start	>= folio_end_pos(folio));
	BUG_ON(end	<= folio_pos(folio));

	start_offset	= max(start, folio_pos(folio)) - folio_pos(folio);
	end_offset	= min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio);

	/* Folio boundary? Nothing to do */
	if (start_offset == 0 &&
	    end_offset == folio_size(folio)) {
		ret = 0;
		goto unlock;
	}

	s = bch2_folio_create(folio, 0);
	if (!s) {
		ret = -ENOMEM;
		goto unlock;
	}

	if (!folio_test_uptodate(folio)) {
		ret = bch2_read_single_folio(folio, mapping);
		if (ret)
			goto unlock;
	}

	ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
	if (ret)
		goto unlock;

	for (i = round_up(start_offset, block_bytes(c)) >> 9;
	     i < round_down(end_offset, block_bytes(c)) >> 9;
	     i++) {
		s->s[i].nr_replicas = 0;

		i_sectors_delta -= s->s[i].state == SECTOR_dirty;
		bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
	}

	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);

	/*
	 * Caller needs to know whether this folio will be written out by
	 * writeback - doing an i_size update if necessary - or whether it will
	 * be responsible for the i_size update.
	 *
	 * Note that we shouldn't ever see a folio beyond EOF, but check and
	 * warn if so. This has been observed due to a failure to clean up
	 * folios after a short write, and there's still a chance reclaim will
	 * fix things up.
	 */
	WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size);
	end_pos = folio_end_pos(folio);
	if (inode->v.i_size > folio_pos(folio))
		end_pos = min_t(u64, inode->v.i_size, end_pos);
	ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;

	folio_zero_segment(folio, start_offset, end_offset);

	/*
	 * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
	 *
	 * XXX: because we aren't currently tracking whether the folio has
	 * actual data in it (vs. just 0s, or only partially written) this is
	 * wrong. ick.
	 */
	BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false));

	/*
	 * This removes any writeable userspace mappings; we need to force
	 * .page_mkwrite to be called again before any mmapped writes, to
	 * redirty the full page:
	 */
	folio_mkclean(folio);
	filemap_dirty_folio(mapping, folio);
unlock:
	folio_unlock(folio);
	folio_put(folio);
out:
	return ret;
}

static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from)
{
	return __bch2_truncate_folio(inode, from >> PAGE_SHIFT,
				     from, ANYSINT_MAX(loff_t));
}

static int bch2_truncate_folios(struct bch_inode_info *inode,
				loff_t start, loff_t end)
{
	int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT,
					start, end);

	if (ret >= 0 &&
	    start >> PAGE_SHIFT != end >> PAGE_SHIFT)
		ret = __bch2_truncate_folio(inode,
					    (end - 1) >> PAGE_SHIFT,
					    start, end);
	return ret;
}

static int bch2_extend(struct mnt_idmap *idmap,
		       struct bch_inode_info *inode,
		       struct bch_inode_unpacked *inode_u,
		       struct iattr *iattr)
{
	struct address_space *mapping = inode->v.i_mapping;
	int ret;

	/*
	 * sync appends:
	 *
	 * this has to be done _before_ extending i_size:
	 */
	ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
	if (ret)
		return ret;

	truncate_setsize(&inode->v, iattr->ia_size);

	return bch2_setattr_nonsize(idmap, inode, iattr);
}

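/*
 * Truncate to a new size: extending is handled by bch2_extend(); shrinking
 * zeroes the partial folio at the new EOF, truncates the page cache, then
 * truncates the extents btree and updates the on-disk i_size:
 */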
int bchfs_truncate(struct mnt_idmap *idmap,
		   struct bch_inode_info *inode, struct iattr *iattr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct address_space *mapping = inode->v.i_mapping;
	struct bch_inode_unpacked inode_u;
	s64 i_sectors_delta = 0;
	int ret = 0;

	/*
	 * If the truncate call will change the size of the file, the
	 * cmtimes should be updated. If the size will not change, we
	 * do not need to update the cmtimes.
	 */
	if (iattr->ia_size != inode->v.i_size) {
		if (!(iattr->ia_valid & ATTR_MTIME))
			ktime_get_coarse_real_ts64(&iattr->ia_mtime);
		if (!(iattr->ia_valid & ATTR_CTIME))
			ktime_get_coarse_real_ts64(&iattr->ia_ctime);
		iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
	}

	inode_dio_wait(&inode->v);
	bch2_pagecache_block_get(inode);

	ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
	if (ret)
		goto err;

	/*
	 * check this before next assertion; on filesystem error our normal
	 * invariants are a bit broken (truncate has to truncate the page cache
	 * before the inode).
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
		  inode->v.i_size < inode_u.bi_size,
		  "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
		  (u64) inode->v.i_size, inode_u.bi_size);

	if (iattr->ia_size > inode->v.i_size) {
		ret = bch2_extend(idmap, inode, &inode_u, iattr);
		goto err;
	}

	iattr->ia_valid &= ~ATTR_SIZE;

	ret = bch2_truncate_folio(inode, iattr->ia_size);
	if (unlikely(ret < 0))
		goto err;
	ret = 0;

	truncate_setsize(&inode->v, iattr->ia_size);

	/*
	 * When extending, we're going to write the new i_size to disk
	 * immediately so we need to flush anything above the current on disk
	 * i_size first:
	 *
	 * Also, when extending we need to flush the page that i_size currently
	 * straddles - if it's mapped to userspace, we need to ensure that
	 * userspace has to redirty it and call .mkwrite -> set_page_dirty
	 * again to allocate the part of the page that was extended.
	 */
	if (iattr->ia_size > inode_u.bi_size)
		ret = filemap_write_and_wait_range(mapping,
						   inode_u.bi_size,
						   iattr->ia_size - 1);
	else if (iattr->ia_size & (PAGE_SIZE - 1))
		ret = filemap_write_and_wait_range(mapping,
						   round_down(iattr->ia_size, PAGE_SIZE),
						   iattr->ia_size - 1);
	if (ret)
		goto err;

	ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);

	if (unlikely(ret)) {
		/*
		 * If we error here, VFS caches are now inconsistent with btree
		 */
		set_bit(EI_INODE_ERROR, &inode->ei_flags);
		goto err;
	}

	if (unlikely(!inode->v.i_size && inode->v.i_blocks &&
		     !bch2_journal_error(&c->journal))) {
		struct printbuf buf = PRINTBUF;
		bch2_log_msg_start(c, &buf);
		prt_printf(&buf,
			   "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
			   inode->v.i_ino, (u64) inode->v.i_blocks,
			   inode->ei_inode.bi_sectors);

		bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf);
		if (print)
			bch2_print_str(c, KERN_ERR, buf.buf);
		printbuf_exit(&buf);
	}

	ret = bch2_setattr_nonsize(idmap, inode, iattr);
err:
	bch2_pagecache_block_put(inode);
	return bch2_err_class(ret);
}

/* fallocate: */

static int inode_update_times_fn(struct btree_trans *trans,
				 struct bch_inode_info *inode,
				 struct bch_inode_unpacked *bi, void *p)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
	return 0;
}

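/*
 * FALLOC_FL_PUNCH_HOLE: zero the partial folios at either end of the range,
 * drop the affected page cache, then punch whole blocks out of the extents
 * btree:
 */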
static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	u64 end		= offset + len;
	u64 block_start	= round_up(offset, block_bytes(c));
	u64 block_end	= round_down(end, block_bytes(c));
	bool truncated_last_page;
	int ret = 0;

	ret = bch2_truncate_folios(inode, offset, end);
	if (unlikely(ret < 0))
		goto err;

	truncated_last_page = ret;

	truncate_pagecache_range(&inode->v, offset, end - 1);

	if (block_start < block_end) {
		s64 i_sectors_delta = 0;

		ret = bch2_fpunch(c, inode_inum(inode),
				  block_start >> 9, block_end >> 9,
				  &i_sectors_delta);
		bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
	}

	mutex_lock(&inode->ei_update_lock);
	if (end >= inode->v.i_size &&
	    !truncated_last_page) {
		ret = bch2_write_inode_size(c, inode, inode->v.i_size,
					    ATTR_MTIME|ATTR_CTIME);
	} else {
		ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
				       ATTR_MTIME|ATTR_CTIME);
	}
	mutex_unlock(&inode->ei_update_lock);
err:
	return ret;
}

static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
					     loff_t offset, loff_t len,
					     bool insert)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct address_space *mapping = inode->v.i_mapping;
	s64 i_sectors_delta = 0;
	int ret = 0;

	if ((offset | len) & (block_bytes(c) - 1))
		return -EINVAL;

	if (insert) {
		if (offset >= inode->v.i_size)
			return -EINVAL;
	} else {
		if (offset + len >= inode->v.i_size)
			return -EINVAL;
	}

	ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
	if (ret)
		return ret;

	if (insert)
		i_size_write(&inode->v, inode->v.i_size + len);

	ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
				     insert, &i_sectors_delta);
	if (!ret && !insert)
		i_size_write(&inode->v, inode->v.i_size - len);
	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);

	return ret;
}

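/*
 * Core of fallocate/zero-range: walk the extents btree over the target range
 * and, for each hole, take a quota reservation and allocate or reserve space:
 */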
static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
				      u64 start_sector, u64 end_sector)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bpos end_pos = POS(inode->v.i_ino, end_sector);
	struct bch_io_opts opts;
	int ret = 0;

	bch2_inode_opts_get(&opts, c, &inode->ei_inode);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inode->v.i_ino, start_sector),
			     BTREE_ITER_slots|BTREE_ITER_intent);

	while (!ret) {
		s64 i_sectors_delta = 0;
		struct quota_res quota_res = { 0 };
		struct bkey_s_c k;
		unsigned sectors;
		bool is_allocation;
		u64 hole_start, hole_end;
		u32 snapshot;

		bch2_trans_begin(trans);

		if (bkey_ge(iter.pos, end_pos))
			break;

		ret = bch2_subvolume_get_snapshot(trans,
						  inode->ei_inum.subvol, &snapshot);
		if (ret)
			goto bkey_err;

		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);

		k = bch2_btree_iter_peek_slot(trans, &iter);
		if ((ret = bkey_err(k)))
			goto bkey_err;

		hole_start	= iter.pos.offset;
		hole_end	= bpos_min(k.k->p, end_pos).offset;
		is_allocation	= bkey_extent_is_allocation(k.k);

		/* already reserved */
		if (bkey_extent_is_reservation(k) &&
		    bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
			bch2_btree_iter_advance(trans, &iter);
			continue;
		}

		if (bkey_extent_is_data(k.k) &&
		    !(mode & FALLOC_FL_ZERO_RANGE)) {
			bch2_btree_iter_advance(trans, &iter);
			continue;
		}

		if (!(mode & FALLOC_FL_ZERO_RANGE)) {
			/*
			 * Lock ordering - can't be holding btree locks while
			 * blocking on a folio lock:
			 */
			if (bch2_clamp_data_hole(&inode->v,
						 &hole_start,
						 &hole_end,
						 opts.data_replicas, true)) {
				ret = drop_locks_do(trans,
					(bch2_clamp_data_hole(&inode->v,
							      &hole_start,
							      &hole_end,
							      opts.data_replicas, false), 0));
				if (ret)
					goto bkey_err;
			}
			bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start));

			if (ret)
				goto bkey_err;

			if (hole_start == hole_end)
				continue;
		}

		sectors = hole_end - hole_start;

		if (!is_allocation) {
			ret = bch2_quota_reservation_add(c, inode,
							 &quota_res, sectors, true);
			if (unlikely(ret))
				goto bkey_err;
		}

		ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
					    sectors, opts, &i_sectors_delta,
					    writepoint_hashed((unsigned long) current));
		if (ret)
			goto bkey_err;

		bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);

		if (bch2_mark_pagecache_reserved(inode, &hole_start,
						 iter.pos.offset, true)) {
			ret = drop_locks_do(trans,
				bch2_mark_pagecache_reserved(inode, &hole_start,
							     iter.pos.offset, false));
			if (ret)
				goto bkey_err;
		}
bkey_err:
		bch2_quota_reservation_put(c, inode, &quota_res);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			ret = 0;
	}

	if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
		struct quota_res quota_res = { 0 };
		s64 i_sectors_delta = 0;

		bch2_fpunch_at(trans, &iter, inode_inum(inode),
			       end_sector, &i_sectors_delta);
		bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
		bch2_quota_reservation_put(c, inode, &quota_res);
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	return ret;
}

static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode,
				     loff_t offset, loff_t len)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	u64 end		= offset + len;
	u64 block_start	= round_down(offset, block_bytes(c));
	u64 block_end	= round_up(end, block_bytes(c));
	bool truncated_last_page = false;
	int ret, ret2 = 0;

	if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
		ret = inode_newsize_ok(&inode->v, end);
		if (ret)
			return ret;
	}

	if (mode & FALLOC_FL_ZERO_RANGE) {
		ret = bch2_truncate_folios(inode, offset, end);
		if (unlikely(ret < 0))
			return ret;

		truncated_last_page = ret;

		truncate_pagecache_range(&inode->v, offset, end - 1);

		block_start	= round_up(offset, block_bytes(c));
		block_end	= round_down(end, block_bytes(c));
	}

	ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);

	/*
	 * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
	 * so that the VFS cache i_size is consistent with the btree i_size:
	 */
	if (ret &&
	    !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
		return ret;

	if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
		end = inode->v.i_size;

	if (end >= inode->v.i_size &&
	    (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
	     !(mode & FALLOC_FL_KEEP_SIZE))) {
		spin_lock(&inode->v.i_lock);
		i_size_write(&inode->v, end);
		spin_unlock(&inode->v.i_lock);

		mutex_lock(&inode->ei_update_lock);
		ret2 = bch2_write_inode_size(c, inode, end, 0);
		mutex_unlock(&inode->ei_update_lock);
	}

	return ret ?: ret2;
}

long bch2_fallocate_dispatch(struct file *file, int mode,
			     loff_t offset, loff_t len)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	long ret;

	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate))
		return -EROFS;

	inode_lock(&inode->v);
	inode_dio_wait(&inode->v);
	bch2_pagecache_block_get(inode);

	ret = file_modified(file);
	if (ret)
		goto err;

	if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
		ret = bchfs_fallocate(inode, mode, offset, len);
	else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
		ret = bchfs_fpunch(inode, offset, len);
	else if (mode == FALLOC_FL_INSERT_RANGE)
		ret = bchfs_fcollapse_finsert(inode, offset, len, true);
	else if (mode == FALLOC_FL_COLLAPSE_RANGE)
		ret = bchfs_fcollapse_finsert(inode, offset, len, false);
	else
		ret = -EOPNOTSUPP;
err:
	bch2_pagecache_block_put(inode);
	inode_unlock(&inode->v);
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate);

	return bch2_err_class(ret);
}

/*
 * Take a quota reservation for unallocated blocks in a given file range.
 * Does not check the pagecache.
 */
static int quota_reserve_range(struct bch_inode_info *inode,
			       struct quota_res *res,
			       u64 start, u64 end)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	u64 sectors = end - start;

	int ret = bch2_trans_run(c,
		for_each_btree_key_in_subvolume_max(trans, iter,
				BTREE_ID_extents,
				POS(inode->v.i_ino, start),
				POS(inode->v.i_ino, end - 1),
				inode->ei_inum.subvol, 0, k, ({
			if (bkey_extent_is_allocation(k.k)) {
				u64 s = min(end, k.k->p.offset) -
					max(start, bkey_start_offset(k.k));
				BUG_ON(s > sectors);
				sectors -= s;
			}

			0;
	})));

	return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
}

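/*
 * Reflink: remap extents from the source file into the destination (clone/
 * dedup); positions must be block aligned, and the destination's page cache
 * is invalidated over the remapped range first:
 */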
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
			     struct file *file_dst, loff_t pos_dst,
			     loff_t len, unsigned remap_flags)
{
	struct bch_inode_info *src = file_bch_inode(file_src);
	struct bch_inode_info *dst = file_bch_inode(file_dst);
	struct bch_fs *c = src->v.i_sb->s_fs_info;
	struct quota_res quota_res = { 0 };
	s64 i_sectors_delta = 0;
	u64 aligned_len;
	loff_t ret = 0;

	if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
		return -EINVAL;

	if ((pos_src & (block_bytes(c) - 1)) ||
	    (pos_dst & (block_bytes(c) - 1)))
		return -EINVAL;

	if (src == dst &&
	    abs(pos_src - pos_dst) < len)
		return -EINVAL;

	lock_two_nondirectories(&src->v, &dst->v);
	bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);

	inode_dio_wait(&src->v);
	inode_dio_wait(&dst->v);

	ret = generic_remap_file_range_prep(file_src, pos_src,
					    file_dst, pos_dst,
					    &len, remap_flags);
	if (ret < 0 || len == 0)
		goto err;

	aligned_len = round_up((u64) len, block_bytes(c));

	ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
						      pos_dst, pos_dst + len - 1);
	if (ret)
		goto err;

	ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
				  (pos_dst + aligned_len) >> 9);
	if (ret)
		goto err;

	if (!(remap_flags & REMAP_FILE_DEDUP))
		file_update_time(file_dst);

	bch2_mark_pagecache_unallocated(src, pos_src >> 9,
					(pos_src + aligned_len) >> 9);

	/*
	 * XXX: we'd like to be telling bch2_remap_range() if we have
	 * permission to write to the source file, and thus if io path option
	 * changes should be propagated through the copy, but we need mnt_idmap
	 * from the pathwalk, awkward
	 */
	ret = bch2_remap_range(c,
			       inode_inum(dst), pos_dst >> 9,
			       inode_inum(src), pos_src >> 9,
			       aligned_len >> 9,
			       pos_dst + len, &i_sectors_delta,
			       false);
	if (ret < 0)
		goto err;

	/*
	 * due to alignment, we might have remapped slightly more than requested
	 */
	ret = min((u64) ret << 9, (u64) len);

	bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);

	spin_lock(&dst->v.i_lock);
	if (pos_dst + ret > dst->v.i_size)
		i_size_write(&dst->v, pos_dst + ret);
	spin_unlock(&dst->v.i_lock);

	if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
	    IS_SYNC(file_inode(file_dst)))
		ret = bch2_flush_inode(c, dst);
err:
	bch2_quota_reservation_put(c, dst, &quota_res);
	bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
	unlock_two_nondirectories(&src->v, &dst->v);

	return bch2_err_class(ret);
}

/* fseek: */

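/*
 * SEEK_DATA: scan the extents btree for the next data extent at or after
 * offset, then check whether dirty page cache supplies data before it:
 */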
static loff_t bch2_seek_data(struct file *file, u64 offset)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	subvol_inum inum = inode_inum(inode);
	u64 isize, next_data = MAX_LFS_FILESIZE;

	isize = i_size_read(&inode->v);
	if (offset >= isize)
		return -ENXIO;

	int ret = bch2_trans_run(c,
		for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
				POS(inode->v.i_ino, offset >> 9),
				POS(inode->v.i_ino, U64_MAX),
				inum.subvol, 0, k, ({
			if (bkey_extent_is_data(k.k)) {
				next_data = max(offset, bkey_start_offset(k.k) << 9);
				break;
			} else if (k.k->p.offset >> 9 > isize)
				break;
			0;
	})));
	if (ret)
		return ret;

	if (next_data > offset)
		next_data = bch2_seek_pagecache_data(&inode->v,
						     offset, next_data, 0, false);

	if (next_data >= isize)
		return -ENXIO;

	return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
}

static loff_t bch2_seek_hole(struct file *file, u64 offset)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	subvol_inum inum = inode_inum(inode);
	u64 isize, next_hole = MAX_LFS_FILESIZE;

	isize = i_size_read(&inode->v);
	if (offset >= isize)
		return -ENXIO;

	int ret = bch2_trans_run(c,
		for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
				POS(inode->v.i_ino, offset >> 9),
				POS(inode->v.i_ino, U64_MAX),
				inum.subvol, BTREE_ITER_slots, k, ({
			if (k.k->p.inode != inode->v.i_ino ||
			    !bkey_extent_is_data(k.k)) {
				loff_t start_offset = k.k->p.inode == inode->v.i_ino
					? max(offset, bkey_start_offset(k.k) << 9)
					: offset;
				loff_t end_offset = k.k->p.inode == inode->v.i_ino
					? MAX_LFS_FILESIZE
					: k.k->p.offset << 9;

				/*
				 * Found a hole in the btree, now make sure it's
				 * a hole in the pagecache. We might have to
				 * keep searching if this hole is entirely dirty
				 * in the page cache:
				 */
				bch2_trans_unlock(trans);
				loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v,
							start_offset, end_offset, 0, false);
				if (pagecache_hole < end_offset) {
					next_hole = pagecache_hole;
					break;
				}
			} else {
				offset = max(offset, bkey_start_offset(k.k) << 9);
			}
			0;
	})));
	if (ret)
		return ret;

	if (next_hole > isize)
		next_hole = isize;

	return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
}

loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
{
	loff_t ret;

	switch (whence) {
	case SEEK_SET:
	case SEEK_CUR:
	case SEEK_END:
		ret = generic_file_llseek(file, offset, whence);
		break;
	case SEEK_DATA:
		ret = bch2_seek_data(file, offset);
		break;
	case SEEK_HOLE:
		ret = bch2_seek_hole(file, offset);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return bch2_err_class(ret);
}

void bch2_fs_fsio_exit(struct bch_fs *c)
{
	bioset_exit(&c->nocow_flush_bioset);
}

int bch2_fs_fsio_init(struct bch_fs *c)
{
	if (bioset_init(&c->nocow_flush_bioset,
			1, offsetof(struct nocow_flush, bio), 0))
		return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;

	return 0;
}

#endif /* NO_BCACHEFS_FS */