// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include "ctree.h"
#include "direct-io.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "tree-log.h"
#include "locking.h"
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
#include "reflink.h"
#include "subpage.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "file-item.h"
#include "ioctl.h"
#include "file.h"
#include "super.h"
#include "print-tree.h"

/*
 * Unlock folio after btrfs_file_write() is done with it.
 */
static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
			     u64 pos, u64 copied)
{
	u64 block_start = round_down(pos, fs_info->sectorsize);
	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;

	ASSERT(block_len <= U32_MAX);
	/*
	 * Folio checked is some magic around finding folios that have been
	 * modified without going through btrfs_dirty_folio().  Clear it here.
	 * There should be no need to mark the pages accessed as
	 * prepare_one_folio() should have marked them accessed via
	 * find_or_create_page().
	 */
	btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
	folio_unlock(folio);
	folio_put(folio);
}

/*
 * After copy_folio_from_iter_atomic(), update the following things for delalloc:
 * - Mark newly dirtied folio as DELALLOC in the io tree.
 *   Used to advise which range is to be written back.
 * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
 * - Update inode size for past EOF write
 */
int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
		      size_t write_bytes, struct extent_state **cached, bool noreserve)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret = 0;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(&inode->vfs_inode);
	unsigned int extra_bits = 0;

	if (write_bytes == 0)
		return 0;

	if (noreserve)
		extra_bits |= EXTENT_NORESERVE;

	start_pos = round_down(pos, fs_info->sectorsize);
	num_bytes = round_up(write_bytes + pos - start_pos,
			     fs_info->sectorsize);
	ASSERT(num_bytes <= U32_MAX);
	ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes);

	end_of_last_block = start_pos + num_bytes - 1;

	/*
	 * The pages may have already been dirty, clear out old accounting so
	 * we can set things up properly
	 */
	btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			       cached);

	ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					extra_bits, cached);
	if (ret)
		return ret;

	btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
	btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
	btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);

	/*
	 * we've only changed i_size in ram, and we haven't updated
	 * the disk i_size.  There is no need to log the inode
	 * at this time.
	 */
	if (end_pos > isize)
		i_size_write(&inode->vfs_inode, end_pos);
	return 0;
}

/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.  hint_block is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 *
 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 * to deal with that.  We set the field 'bytes_found' of the arguments structure
 * with the number of allocated bytes found in the target range, so that the
 * caller can update the inode's number of bytes in an atomic way when
 * replacing extents in a range to avoid races with stat(2).
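 *
 * On return, args->drop_end is set to the end of the processed range (capped
 * to args->end), which tells the caller how far the drop actually progressed.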
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_inode *inode,
		       struct btrfs_drop_extents_args *args)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(inode);
	u64 search_start = args->start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	u64 last_end = args->start;
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;
	int modify_tree = -1;
	int update_refs;
	int found = 0;
	struct btrfs_path *path = args->path;

	args->bytes_found = 0;
	args->extent_inserted = false;

	/* Must always have a path if ->replace_extent is true */
	ASSERT(!(args->replace_extent && !args->path));

	if (!path) {
		path = btrfs_alloc_path();
		if (!path) {
			ret = -ENOMEM;
			goto out;
		}
	}

	if (args->drop_cache)
		btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);

	if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
		modify_tree = 0;

	update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       search_start, modify_tree);
		if (ret < 0)
			break;
		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

		if (key.objectid > ino)
			break;
		if (WARN_ON_ONCE(key.objectid < ino) ||
		    key.type < BTRFS_EXTENT_DATA_KEY) {
			ASSERT(del_nr == 0);
			path->slots[0]++;
			goto next_slot;
		}
		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_ram_bytes(leaf, fi);
		} else {
			/* can't happen */
			BUG();
		}

		/*
		 * Don't skip extent items representing 0 byte lengths. They
		 * used to be created (bug) if while punching holes we hit
		 * -ENOSPC condition. So if we find one here, just ensure we
		 * delete it, otherwise we would insert a new file extent item
		 * with the same key (offset) as that 0 bytes length file
		 * extent item in the call to setup_items_for_insert() later
		 * in this function.
		 */
		if (extent_end == key.offset && extent_end >= search_start) {
			last_end = extent_end;
			goto delete_extent_item;
		}

		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		found = 1;
		search_start = max(key.offset, args->start);
		if (recow || !modify_tree) {
			modify_tree = -1;
			btrfs_release_path(path);
			continue;
		}

		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 */
		if (args->start > key.offset && args->end < extent_end) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(path);
				continue;
			}
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += args->start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->start);

			if (update_refs && disk_bytenr > 0) {
				struct btrfs_ref ref = {
					.action = BTRFS_ADD_DELAYED_REF,
					.bytenr = disk_bytenr,
					.num_bytes = num_bytes,
					.parent = 0,
					.owning_root = btrfs_root_id(root),
					.ref_root = btrfs_root_id(root),
				};
				btrfs_init_data_ref(&ref, new_key.objectid,
						    args->start - extent_offset,
						    0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
			}
			key.offset = args->start;
		}
		/*
		 * From here on out we will have actually dropped something, so
		 * last_end can be updated.
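		 * (It is later used at the end of this function to compute
		 * args->drop_end.)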
		 */
		last_end = extent_end;

		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 */
		if (args->start <= key.offset && args->end < extent_end) {
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->end;
			btrfs_set_item_key_safe(trans, path, &new_key);

			extent_offset += args->end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->end);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += args->end - key.offset;
			break;
		}

		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 */
		if (args->start > key.offset && args->end >= extent_end) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += extent_end - args->start;
			if (args->end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 */
		if (args->start <= key.offset && args->end >= extent_end) {
delete_extent_item:
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				if (WARN_ON(del_slot + del_nr != path->slots[0])) {
					btrfs_print_leaf(leaf);
					ret = -EINVAL;
					break;
				}
				del_nr++;
			}

			if (update_refs &&
			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
				args->bytes_found += extent_end - key.offset;
				extent_end = ALIGN(extent_end,
						   fs_info->sectorsize);
			} else if (update_refs && disk_bytenr > 0) {
				struct btrfs_ref ref = {
					.action = BTRFS_DROP_DELAYED_REF,
					.bytenr = disk_bytenr,
					.num_bytes = num_bytes,
					.parent = 0,
					.owning_root = btrfs_root_id(root),
					.ref_root = btrfs_root_id(root),
				};
				btrfs_init_data_ref(&ref, key.objectid,
						    key.offset - extent_offset,
						    0, false);
				ret = btrfs_free_extent(trans, &ref);
				if (ret) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
				args->bytes_found += extent_end - key.offset;
			}

			if (args->end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				break;
			}

			del_nr = 0;
			del_slot = 0;

			btrfs_release_path(path);
			continue;
		}

		BUG();
	}

	if (!ret && del_nr > 0) {
		/*
		 * Set path->slots[0] to first slot, so that after the delete
		 * if items are moved off from our leaf to its immediate left or
		 * right neighbor leaves, we end up with a correct and adjusted
		 * path->slots[0] for our insertion (if args->replace_extent).
		 */
		path->slots[0] = del_slot;
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	leaf = path->nodes[0];
	/*
	 * If btrfs_del_items() was called, it might have deleted a leaf, in
	 * which case it unlocked our path, so check path->locks[0] matches a
	 * write lock.
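	 * If it no longer holds a write lock, skip the insertion below and
	 * leave args->extent_inserted set to false.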
	 */
	if (!ret && args->replace_extent &&
	    path->locks[0] == BTRFS_WRITE_LOCK &&
	    btrfs_leaf_free_space(leaf) >=
	    sizeof(struct btrfs_item) + args->extent_item_size) {

		key.objectid = ino;
		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = args->start;
		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
			struct btrfs_key slot_key;

			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
				path->slots[0]++;
		}
		btrfs_setup_item_for_insert(trans, root, path, &key,
					    args->extent_item_size);
		args->extent_inserted = true;
	}

	if (!args->path)
		btrfs_free_path(path);
	else if (!args->extent_inserted)
		btrfs_release_path(path);
out:
	args->drop_end = found ? min(args->end, last_end) : args->end;

	return ret;
}

static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid,
			     u64 bytenr, u64 orig_offset, u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return false;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return false;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return false;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return false;

	*start = key.offset;
	*end = extent_end;
	return true;
}

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
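 *
 * For example, marking only the middle of a pre-allocated extent as written
 * leaves a pre-allocated extent on each side of the new regular extent that
 * covers the written range.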
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	recow = 0;
	split = start;
	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != ino ||
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if (key.offset > start || extent_end < end) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			goto out;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			path->slots[0]++;
			new_key.offset = start;
			btrfs_set_item_key_safe(trans, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			goto out;
		}
	}

	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(path);
			goto again;
		}
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						split - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = 0;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_root_id(root);
		btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (split == start) {
			key.offset = start;
		} else {
			if (start != key.offset) {
				ret = -EINVAL;
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			path->slots[0]--;
			extent_end = end;
		}
		recow = 1;
	}

	other_start = end;
	other_end = 0;

	ref.action = BTRFS_DROP_DELAYED_REF;
	ref.bytenr = bytenr;
	ref.num_bytes = num_bytes;
	ref.parent = 0;
	ref.owning_root = btrfs_root_id(root);
	ref.ref_root = btrfs_root_id(root);
	btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
	other_start = 0;
	other_end = start;
	if (extent_mergeable(leaf, path->slots[0] - 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
	if (del_nr == 0) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	} else {
		fi = btrfs_item_ptr(leaf, del_slot - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
out:
	return ret;
}

/*
 * On error return an unlocked folio and the error value
 * On success return a locked folio and 0
 */
static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
				  u64 len)
{
	u64 clamp_start = max_t(u64, pos, folio_pos(folio));
	u64 clamp_end = min_t(u64, pos + len, folio_end(folio));
	const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
	int ret = 0;

	if (folio_test_uptodate(folio))
		return 0;

	if (IS_ALIGNED(clamp_start, blocksize) &&
	    IS_ALIGNED(clamp_end, blocksize))
		return 0;

	ret = btrfs_read_folio(NULL, folio);
	if (ret)
		return ret;
	folio_lock(folio);
	if (!folio_test_uptodate(folio)) {
		folio_unlock(folio);
		return -EIO;
	}

	/*
	 * Since btrfs_read_folio() will unlock the folio before it returns,
	 * there is a window where btrfs_release_folio() can be called to
	 * release the page.  Here we check both inode mapping and page
	 * private to make sure the page was not released.
	 *
	 * The private flag check is essential for subpage as we need to store
	 * extra bitmap using folio private.
	 */
	if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
		folio_unlock(folio);
		return -EAGAIN;
	}
	return 0;
}

static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
{
	gfp_t gfp;

	gfp = btrfs_alloc_write_mask(inode->i_mapping);
	if (nowait) {
		gfp &= ~__GFP_DIRECT_RECLAIM;
		gfp |= GFP_NOWAIT;
	}

	return gfp;
}

/*
 * Get folio into the page cache and lock it.
 */
static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
				      loff_t pos, size_t write_bytes,
				      bool nowait)
{
	const pgoff_t index = pos >> PAGE_SHIFT;
	gfp_t mask = get_prepare_gfp_flags(inode, nowait);
	fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) |
			  fgf_set_order(write_bytes);
	struct folio *folio;
	int ret = 0;

again:
	folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	ret = set_folio_extent_mapped(folio);
	if (ret < 0) {
		folio_unlock(folio);
		folio_put(folio);
		return ret;
	}
	ret = prepare_uptodate_folio(inode, folio, pos, write_bytes);
	if (ret) {
		/* The folio is already unlocked. */
		folio_put(folio);
		if (!nowait && ret == -EAGAIN) {
			ret = 0;
			goto again;
		}
		return ret;
	}
	*folio_ret = folio;
	return 0;
}

/*
 * Locks the extent and properly waits for data=ordered extents to finish
 * before allowing the folios to be modified if needed.
 *
 * Return:
 * 1 - the extent is locked
 * 0 - the extent is not locked, and everything is OK
 * -EAGAIN - need to prepare the folios again
 */
static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
				loff_t pos, size_t write_bytes,
				u64 *lockstart, u64 *lockend, bool nowait,
				struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 start_pos;
	u64 last_pos;
	int ret = 0;

	start_pos = round_down(pos, fs_info->sectorsize);
	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;

	if (start_pos < inode->vfs_inode.i_size) {
		struct btrfs_ordered_extent *ordered;

		if (nowait) {
			if (!btrfs_try_lock_extent(&inode->io_tree, start_pos,
						   last_pos, cached_state)) {
				folio_unlock(folio);
				folio_put(folio);
				return -EAGAIN;
			}
		} else {
			btrfs_lock_extent(&inode->io_tree, start_pos, last_pos,
					  cached_state);
		}

		ordered = btrfs_lookup_ordered_range(inode, start_pos,
						     last_pos - start_pos + 1);
		if (ordered &&
		    ordered->file_offset + ordered->num_bytes > start_pos &&
		    ordered->file_offset <= last_pos) {
			btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos,
					    cached_state);
			folio_unlock(folio);
			folio_put(folio);
			btrfs_start_ordered_extent(ordered);
			btrfs_put_ordered_extent(ordered);
			return -EAGAIN;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		*lockstart = start_pos;
		*lockend = last_pos;
		ret = 1;
	}

	/*
	 * We should be called after prepare_one_folio() which should have locked
	 * all pages in the range.
	 */
	WARN_ON(!folio_test_locked(folio));

	return ret;
}

/*
 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
 *
 * @pos:         File offset.
 * @write_bytes: The length to write, will be updated to the nocow writeable
 *               range.
 * @nowait:      Indicate if we can block or not (non-blocking IO context).
 *
 * This function will flush ordered extents in the range to ensure proper
 * nocow checks.
 *
 * Return:
 * > 0          If we can nocow, and updates @write_bytes.
 * 0            If we can't do a nocow write.
 * -EAGAIN      If we can't do a nocow write because snapshotting of the inode's
 *              root is in progress or because we are in a non-blocking IO
 *              context and need to block (@nowait is true).
 * < 0          If an error happened.
 *
 * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
 */
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
			   size_t *write_bytes, bool nowait)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *root = inode->root;
	struct extent_state *cached_state = NULL;
	u64 lockstart, lockend;
	u64 cur_offset;
	int ret = 0;

	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
		return 0;

	if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
		return -EAGAIN;

	lockstart = round_down(pos, fs_info->sectorsize);
	lockend = round_up(pos + *write_bytes,
			   fs_info->sectorsize) - 1;

	if (nowait) {
		if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
						  &cached_state)) {
			btrfs_drew_write_unlock(&root->snapshot_lock);
			return -EAGAIN;
		}
	} else {
		btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
						   &cached_state);
	}

	cur_offset = lockstart;
	while (cur_offset < lockend) {
		u64 num_bytes = lockend - cur_offset + 1;

		ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait);
		if (ret <= 0) {
			/*
			 * If cur_offset == lockstart it means we haven't found
			 * any extent against which we can NOCOW, so unlock the
			 * snapshot lock.
			 */
			if (cur_offset == lockstart)
				btrfs_drew_write_unlock(&root->snapshot_lock);
			break;
		}
		cur_offset += num_bytes;
	}

	btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);

	/*
	 * cur_offset > lockstart means there's at least a partial range we can
	 * NOCOW, and that range can cover one or more extents.
	 */
	if (cur_offset > lockstart) {
		*write_bytes = min_t(size_t, *write_bytes, cur_offset - pos);
		return 1;
	}

	return ret;
}

void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
{
	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}

int btrfs_write_check(struct kiocb *iocb, size_t count)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	loff_t pos = iocb->ki_pos;
	int ret;
	loff_t oldsize;

	/*
	 * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
	 * prealloc flags, as without those flags we always have to COW. We will
	 * later check if we can really COW into the target range (using
	 * can_nocow_extent() at btrfs_get_blocks_direct_write()).
	 */
	if ((iocb->ki_flags & IOCB_NOWAIT) &&
	    !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
		return -EAGAIN;

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	/*
	 * We reserve space for updating the inode when we reserve space for the
	 * extent we are going to write, so we will enospc out there. We don't
	 * need to start yet another transaction to update the inode as we will
	 * update the inode when we finish writing whatever data we write.
	 */
	if (!IS_NOCMTIME(inode)) {
		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
		inode_inc_iversion(inode);
	}

	oldsize = i_size_read(inode);
	if (pos > oldsize) {
		/* Expand hole size to cover write data, preventing empty gap */
		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);

		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
		if (ret)
			return ret;
	}

	return 0;
}

static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved,
			  u64 start, u64 len, bool only_release_metadata)
{
	if (len == 0)
		return;

	if (only_release_metadata) {
		btrfs_check_nocow_unlock(inode);
		btrfs_delalloc_release_metadata(inode, len, true);
	} else {
		const struct btrfs_fs_info *fs_info = inode->root->fs_info;

		btrfs_delalloc_release_space(inode, data_reserved,
					     round_down(start, fs_info->sectorsize),
					     len, true);
	}
}

/*
 * Reserve data and metadata space for this buffered write range.
 *
 * Return >0 for the number of bytes reserved, which is always block aligned.
 * Return <0 for error.
 */
static ssize_t reserve_space(struct btrfs_inode *inode,
			     struct extent_changeset **data_reserved,
			     u64 start, size_t *len, bool nowait,
			     bool *only_release_metadata)
{
	const struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const unsigned int block_offset = (start & (fs_info->sectorsize - 1));
	size_t reserve_bytes;
	int ret;

	ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait);
	if (ret < 0) {
		int can_nocow;

		if (nowait && (ret == -ENOSPC || ret == -EAGAIN))
			return -EAGAIN;

		/*
		 * If we don't have to COW at the offset, reserve metadata only.
		 * write_bytes may get smaller than requested here.
		 */
		can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait);
		if (can_nocow < 0)
			ret = can_nocow;
		if (can_nocow > 0)
			ret = 0;
		if (ret)
			return ret;
		*only_release_metadata = true;
	}

	reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize);
	WARN_ON(reserve_bytes == 0);
	ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes,
					      reserve_bytes, nowait);
	if (ret) {
		if (!*only_release_metadata)
			btrfs_free_reserved_data_space(inode, *data_reserved,
						       start, *len);
		else
			btrfs_check_nocow_unlock(inode);

		if (nowait && ret == -ENOSPC)
			ret = -EAGAIN;
		return ret;
	}
	return reserve_bytes;
}

/* Shrink the reserved data and metadata space from @reserved_len to @new_len. */
static void shrink_reserved_space(struct btrfs_inode *inode,
				  struct extent_changeset *data_reserved,
				  u64 reserved_start, u64 reserved_len,
				  u64 new_len, bool only_release_metadata)
{
	const u64 diff = reserved_len - new_len;

	ASSERT(new_len <= reserved_len);
	btrfs_delalloc_shrink_extents(inode, reserved_len, new_len);
	if (only_release_metadata)
		btrfs_delalloc_release_metadata(inode, diff, true);
	else
		btrfs_delalloc_release_space(inode, data_reserved,
					     reserved_start + new_len, diff, true);
}

/*
 * Calculate the maximum amount of bytes we can write into one folio.
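 *
 * For example, assuming mapping_max_folio_size() returns 4K here, a write
 * starting at file offset 4000 can copy at most 96 bytes into its first
 * folio; with large folios the limit grows accordingly.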
 */
static size_t calc_write_bytes(const struct btrfs_inode *inode,
			       const struct iov_iter *iter, u64 start)
{
	const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping);

	return min(max_folio_size - (start & (max_folio_size - 1)),
		   iov_iter_count(iter));
}

/*
 * Do the heavy-lifting work to copy one range into one folio of the page cache.
 *
 * Return > 0 in case we copied all bytes or just some of them.
 * Return 0 if no bytes were copied, in which case the caller should retry.
 * Return <0 on error.
 */
static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
			  struct extent_changeset **data_reserved, u64 start,
			  bool nowait)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_state *cached_state = NULL;
	size_t write_bytes = calc_write_bytes(inode, iter, start);
	size_t copied;
	const u64 reserved_start = round_down(start, fs_info->sectorsize);
	u64 reserved_len;
	struct folio *folio = NULL;
	int extents_locked;
	u64 lockstart;
	u64 lockend;
	bool only_release_metadata = false;
	const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
	int ret;

	/*
	 * Fault all pages before locking them in prepare_one_folio() to avoid
	 * recursive lock.
	 */
	if (unlikely(fault_in_iov_iter_readable(iter, write_bytes)))
		return -EFAULT;
	extent_changeset_release(*data_reserved);
	ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait,
			    &only_release_metadata);
	if (ret < 0)
		return ret;
	reserved_len = ret;
	/* Write range must be inside the reserved range. */
	ASSERT(reserved_start <= start);
	ASSERT(start + write_bytes <= reserved_start + reserved_len);

again:
	ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
						    bdp_flags);
	if (ret) {
		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}

	ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false);
	if (ret) {
		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}

	/*
	 * The reserved range goes beyond the current folio, shrink the reserved
	 * space to the folio boundary.
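	 * This keeps the reservation in sync with what this iteration can
	 * actually dirty; the rest of the write is reserved again by the
	 * next call to copy_one_range().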
	 */
	if (reserved_start + reserved_len > folio_end(folio)) {
		const u64 last_block = folio_end(folio);

		shrink_reserved_space(inode, *data_reserved, reserved_start,
				      reserved_len, last_block - reserved_start,
				      only_release_metadata);
		write_bytes = last_block - start;
		reserved_len = last_block - reserved_start;
	}

	extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start,
							 write_bytes, &lockstart,
							 &lockend, nowait,
							 &cached_state);
	if (extents_locked < 0) {
		if (!nowait && extents_locked == -EAGAIN)
			goto again;

		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		ret = extents_locked;
		return ret;
	}

	copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start),
					     write_bytes, iter);
	flush_dcache_folio(folio);

	if (unlikely(copied < write_bytes)) {
		u64 last_block;

		/*
		 * The original write range doesn't need an uptodate folio as
		 * the range is block aligned. But now a short copy happened.
		 * We cannot handle it without an uptodate folio.
		 *
		 * So just revert the range and we will retry.
		 */
		if (!folio_test_uptodate(folio)) {
			iov_iter_revert(iter, copied);
			copied = 0;
		}

		/* No copied bytes, unlock, release reserved space and exit. */
		if (copied == 0) {
			if (extents_locked)
				btrfs_unlock_extent(&inode->io_tree, lockstart, lockend,
						    &cached_state);
			else
				btrfs_free_extent_state(cached_state);
			btrfs_delalloc_release_extents(inode, reserved_len);
			release_space(inode, *data_reserved, reserved_start, reserved_len,
				      only_release_metadata);
			btrfs_drop_folio(fs_info, folio, start, copied);
			return 0;
		}

		/* Release the reserved space beyond the last block. */
		last_block = round_up(start + copied, fs_info->sectorsize);

		shrink_reserved_space(inode, *data_reserved, reserved_start,
				      reserved_len, last_block - reserved_start,
				      only_release_metadata);
		reserved_len = last_block - reserved_start;
	}

	ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state,
				only_release_metadata);
	/*
	 * If we have not locked the extent range, because the range's start
	 * offset is >= i_size, we might still have a non-NULL cached extent
	 * state, acquired while marking the extent range as delalloc through
	 * btrfs_dirty_folio().  Therefore free any possible cached extent state
	 * to avoid a memory leak.
	 */
	if (extents_locked)
		btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
	else
		btrfs_free_extent_state(cached_state);

	btrfs_delalloc_release_extents(inode, reserved_len);
	if (ret) {
		btrfs_drop_folio(fs_info, folio, start, copied);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}
	if (only_release_metadata)
		btrfs_check_nocow_unlock(inode);

	btrfs_drop_folio(fs_info, folio, start, copied);
	return copied;
}

ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	loff_t pos;
	struct inode *inode = file_inode(file);
	struct extent_changeset *data_reserved = NULL;
	size_t num_written = 0;
	ssize_t ret;
	loff_t old_isize;
	unsigned int ilock_flags = 0;
	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);

	if (nowait)
		ilock_flags |= BTRFS_ILOCK_TRY;

	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
	if (ret < 0)
		return ret;

	/*
	 * We can only trust the isize with inode lock held, or it can race with
	 * other buffered writes and cause incorrect call of
	 * pagecache_isize_extended() to overwrite existing data.
	 */
	old_isize = i_size_read(inode);

	ret = generic_write_checks(iocb, iter);
	if (ret <= 0)
		goto out;

	ret = btrfs_write_check(iocb, ret);
	if (ret < 0)
		goto out;

	pos = iocb->ki_pos;
	while (iov_iter_count(iter) > 0) {
		ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait);
		if (ret < 0)
			break;
		pos += ret;
		num_written += ret;
		cond_resched();
	}

	extent_changeset_free(data_reserved);
	if (num_written > 0) {
		pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
		iocb->ki_pos += num_written;
	}
out:
	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
	return num_written ? num_written : ret;
}

static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
				   const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	loff_t count;
	ssize_t ret;

	btrfs_inode_lock(BTRFS_I(inode), 0);
	count = encoded->len;
	ret = generic_write_checks_count(iocb, &count);
	if (ret == 0 && count != encoded->len) {
		/*
		 * The write got truncated by generic_write_checks_count(). We
		 * can't do a partial encoded write.
		 */
		ret = -EFBIG;
	}
	if (ret || encoded->len == 0)
		goto out;

	ret = btrfs_write_check(iocb, encoded->len);
	if (ret < 0)
		goto out;

	ret = btrfs_do_encoded_write(iocb, from, encoded);
out:
	btrfs_inode_unlock(BTRFS_I(inode), 0);
	return ret;
}

ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
			    const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct file *file = iocb->ki_filp;
	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
	ssize_t num_written, num_sync;

	/*
	 * If the fs flips readonly due to some impossible error, although we
	 * have opened a file as writable, we have to stop this write operation
	 * to ensure consistency.
	 */
	if (BTRFS_FS_ERROR(inode->root->fs_info))
		return -EROFS;

	if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
		return -EOPNOTSUPP;

	if (encoded) {
		num_written = btrfs_encoded_write(iocb, from, encoded);
		num_sync = encoded->len;
	} else if (iocb->ki_flags & IOCB_DIRECT) {
		num_written = btrfs_direct_write(iocb, from);
		num_sync = num_written;
	} else {
		num_written = btrfs_buffered_write(iocb, from);
		num_sync = num_written;
	}

	btrfs_set_inode_last_sub_trans(inode);

	if (num_sync > 0) {
		num_sync = generic_write_sync(iocb, num_sync);
		if (num_sync < 0)
			num_written = num_sync;
	}

	return num_written;
}

static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	return btrfs_do_write_iter(iocb, from, NULL);
}

int btrfs_release_file(struct inode *inode, struct file *filp)
{
	struct btrfs_file_private *private = filp->private_data;

	if (private) {
		kfree(private->filldir_buf);
		btrfs_free_extent_state(private->llseek_cached_state);
		kfree(private);
		filp->private_data = NULL;
	}

	/*
	 * Set by setattr when we are about to truncate a file from a non-zero
	 * size to a zero size.  This tries to flush down new bytes that may
	 * have been written if the application were using truncate to replace
	 * a file in place.
	 */
	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
			       &BTRFS_I(inode)->runtime_flags))
		filemap_flush(inode->i_mapping);
	return 0;
}

static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
{
	int ret;
	struct blk_plug plug;

	/*
	 * This is only called in fsync, which would do synchronous writes, so
	 * a plug can merge adjacent IOs as much as possible.  Esp. in case of
	 * multiple disks using raid profile, a large IO can be split to
	 * several segments of stripe length (currently 64K).
	 */
	blk_start_plug(&plug);
	ret = btrfs_fdatawrite_range(inode, start, end);
	blk_finish_plug(&plug);

	return ret;
}

static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
{
	struct btrfs_inode *inode = ctx->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
	    list_empty(&ctx->ordered_extents))
		return true;

	/*
	 * If we are doing a fast fsync we can not bail out if the inode's
	 * last_trans is <= the last committed transaction, because we only
	 * update the last_trans of the inode during ordered extent completion,
	 * and for a fast fsync we don't wait for that, we only wait for the
	 * writeback to complete.
	 */
	if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
	     list_empty(&ctx->ordered_extents)))
		return true;

	return false;
}

/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait() so that all ordered extent updates in
 * the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file_dentry(file);
	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_log_ctx ctx;
	int ret = 0, err;
	u64 len;
	bool full_sync;
	bool skip_ilock = false;

	if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
		skip_ilock = true;
		current->journal_info = NULL;
		btrfs_assert_inode_locked(inode);
	}

	trace_btrfs_sync_file(file, datasync);

	btrfs_init_log_ctx(&ctx, inode);

	/*
	 * Always set the range to a full range, otherwise we can get into
	 * several problems, from missing file extent items to represent holes
	 * when not using the NO_HOLES feature, to log tree corruption due to
	 * races between hole detection during logging and completion of ordered
	 * extents outside the range, to missing checksums due to ordered extents
	 * for which we flushed only a subset of their pages.
	 */
	start = 0;
	end = LLONG_MAX;
	len = (u64)LLONG_MAX + 1;

	/*
	 * We write the dirty pages in the range and wait until they complete
	 * outside of the ->i_mutex, so the dirty pages can be flushed by
	 * multiple tasks, improving performance.  See
	 * btrfs_wait_ordered_range() for an explanation of the ASYNC check.
	 */
	ret = start_ordered_ops(inode, start, end);
	if (ret)
		goto out;

	if (skip_ilock)
		down_write(&inode->i_mmap_lock);
	else
		btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);

	atomic_inc(&root->log_batch);

	/*
	 * Before we acquired the inode's lock and the mmap lock, someone may
	 * have dirtied more pages in the target range.  We need to make sure
	 * that writeback for any such pages does not start while we are logging
	 * the inode, because if it does, any of the following might happen when
	 * we are not doing a full inode sync:
	 *
	 * 1) We log an extent after its writeback finishes but before its
	 *    checksums are added to the csum tree, leading to -EIO errors
	 *    when attempting to read the extent after a log replay.
	 *
	 * 2) We can end up logging an extent before its writeback finishes.
	 *    Therefore after the log replay we will have a file extent item
	 *    pointing to an unwritten extent (and no data checksums as well).
	 *
	 * So trigger writeback for any eventual new dirty pages and then we
	 * wait for all ordered extents to complete below.
	 */
	ret = start_ordered_ops(inode, start, end);
	if (ret) {
		if (skip_ilock)
			up_write(&inode->i_mmap_lock);
		else
			btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
		goto out;
	}

	/*
	 * Always check for the full sync flag while holding the inode's lock,
	 * to avoid races with other tasks.  The flag must be either set all the
	 * time during logging or always off all the time while logging.
	 * We check the flag here after starting delalloc above, because when
	 * running delalloc the full sync flag may be set if we need to drop
	 * extra extent map ranges due to temporary memory allocation failures.
	 */
	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);

	/*
	 * We have to do this here to avoid the priority inversion of waiting on
	 * IO of a lower priority task while holding a transaction open.
	 *
	 * For a full fsync we wait for the ordered extents to complete while
	 * for a fast fsync we wait just for writeback to complete, and then
	 * attach the ordered extents to the transaction so that a transaction
	 * commit waits for their completion, to avoid data loss if we fsync,
	 * the current transaction commits before the ordered extents complete
	 * and a power failure happens right after that.
	 *
	 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
	 * logical address recorded in the ordered extent may change.  We need
	 * to wait for the IO to stabilize the logical address.
	 */
	if (full_sync || btrfs_is_zoned(fs_info)) {
		ret = btrfs_wait_ordered_range(inode, start, len);
		clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
	} else {
		/*
		 * Get our ordered extents as soon as possible to avoid doing
		 * checksum lookups in the csum tree, and use instead the
		 * checksums attached to the ordered extents.
		 */
		btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
		ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
		if (ret)
			goto out_release_extents;

		/*
		 * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
		 * starting and waiting for writeback, because for buffered IO
		 * it may have been set during the end IO callback
		 * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
		 * case an error happened and we need to wait for ordered
		 * extents to complete so that any extent maps that point to
		 * unwritten locations are dropped and we don't log them.
		 */
		if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
			ret = btrfs_wait_ordered_range(inode, start, len);
	}

	if (ret)
		goto out_release_extents;

	atomic_inc(&root->log_batch);

	if (skip_inode_logging(&ctx)) {
		/*
		 * We've had everything committed since the last time we were
		 * modified so clear this flag in case it was set for whatever
		 * reason, it's no longer relevant.
		 */
		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
		/*
		 * An ordered extent might have started before and completed
		 * already with io errors, in which case the inode was not
		 * updated and we end up here.  So check the inode's mapping
		 * for any errors that might have happened since we last
		 * called fsync.
		 */
		ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
		goto out_release_extents;
	}

	btrfs_init_log_ctx_scratch_eb(&ctx);

	/*
	 * We use start here because we will need to wait on the IO to complete
	 * in btrfs_sync_log, which could require joining a transaction (for
	 * example checking cross references in the nocow path).  If we use join
	 * here we could get into a situation where we're waiting on IO to
	 * happen that is blocked on a transaction trying to commit.  With start
	 * we inc the extwriter counter, so we wait for all extwriters to exit
	 * before we start blocking joiners.  This comment is to keep somebody
	 * from thinking they are super smart and changing this to
	 * btrfs_join_transaction *cough*Josef*cough*.
	 */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_release_extents;
	}
	trans->in_fsync = true;

	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
	/*
	 * Scratch eb no longer needed, release before syncing log or commit
	 * transaction, to avoid holding unnecessary memory during such long
	 * operations.
	 */
	if (ctx.scratch_eb) {
		free_extent_buffer(ctx.scratch_eb);
		ctx.scratch_eb = NULL;
	}
	btrfs_release_log_ctx_extents(&ctx);
	if (ret < 0) {
		/* Fallthrough and commit/free transaction. */
		ret = BTRFS_LOG_FORCE_COMMIT;
	}

	/*
	 * we've logged all the items and now have a consistent
	 * version of the file in the log.  It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
	if (skip_ilock)
		up_write(&inode->i_mmap_lock);
	else
		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);

	if (ret == BTRFS_NO_LOG_SYNC) {
		ret = btrfs_end_transaction(trans);
		goto out;
	}

	/* We successfully logged the inode, attempt to sync the log. */
	if (!ret) {
		ret = btrfs_sync_log(trans, root, &ctx);
		if (!ret) {
			ret = btrfs_end_transaction(trans);
			goto out;
		}
	}

	/*
	 * At this point we need to commit the transaction because we had
	 * btrfs_need_log_full_commit() or some other error.
	 *
	 * If we didn't do a full sync we have to stop the trans handle, wait on
	 * the ordered extents, start it again and commit the transaction.  If
	 * we attempt to wait on the ordered extents here we could deadlock with
	 * something like fallocate() that is holding the extent lock trying to
	 * start a transaction while some other thread is trying to commit the
	 * transaction while we (fsync) are currently holding the transaction
	 * open.
	 */
	if (!full_sync) {
		ret = btrfs_end_transaction(trans);
		if (ret)
			goto out;
		ret = btrfs_wait_ordered_range(inode, start, len);
		if (ret)
			goto out;

		/*
		 * This is safe to use here because we're only interested in
		 * making sure the transaction that had the ordered extents is
		 * committed.  We aren't waiting on anything past this point,
		 * we're purely getting the transaction and committing it.
		 */
		trans = btrfs_attach_transaction_barrier(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);

			/*
			 * We committed the transaction and there's no currently
			 * running transaction, this means everything we care
			 * about made it to disk and we are done.
			 */
			if (ret == -ENOENT)
				ret = 0;
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	free_extent_buffer(ctx.scratch_eb);
	ASSERT(list_empty(&ctx.list));
	ASSERT(list_empty(&ctx.conflict_inodes));
	err = file_check_and_advance_wb_err(file);
	if (!ret)
		ret = err;
	return ret > 0 ? -EIO : ret;
-EIO : ret; 1828 1829 out_release_extents: 1830 btrfs_release_log_ctx_extents(&ctx); 1831 if (skip_ilock) 1832 up_write(&inode->i_mmap_lock); 1833 else 1834 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 1835 goto out; 1836 } 1837 1838 /* 1839 * btrfs_page_mkwrite() is not allowed to change the file size as it gets 1840 * called from a page fault handler when a page is first dirtied. Hence we must 1841 * be careful to check for EOF conditions here. We set the page up correctly 1842 * for a written page which means we get ENOSPC checking when writing into 1843 * holes and correct delalloc and unwritten extent mapping on filesystems that 1844 * support these features. 1845 * 1846 * We are not allowed to take the i_mutex here so we have to play games to 1847 * protect against truncate races as the page could now be beyond EOF. Because 1848 * truncate_setsize() writes the inode size before removing pages, once we have 1849 * the page lock we can determine safely if the page is beyond EOF. If it is not 1850 * beyond EOF, then the page is guaranteed safe against truncation until we 1851 * unlock the page. 1852 */ 1853 static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) 1854 { 1855 struct page *page = vmf->page; 1856 struct folio *folio = page_folio(page); 1857 struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file)); 1858 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1859 struct extent_io_tree *io_tree = &inode->io_tree; 1860 struct btrfs_ordered_extent *ordered; 1861 struct extent_state *cached_state = NULL; 1862 struct extent_changeset *data_reserved = NULL; 1863 unsigned long zero_start; 1864 loff_t size; 1865 size_t fsize = folio_size(folio); 1866 int ret; 1867 bool only_release_metadata = false; 1868 u64 reserved_space; 1869 u64 page_start; 1870 u64 page_end; 1871 u64 end; 1872 1873 reserved_space = fsize; 1874 1875 sb_start_pagefault(inode->vfs_inode.i_sb); 1876 page_start = folio_pos(folio); 1877 page_end = page_start + folio_size(folio) - 1; 1878 end = page_end; 1879 1880 /* 1881 * Reserving delalloc space after obtaining the page lock can lead to 1882 * deadlock. For example, if a dirty page is locked by this function 1883 * and the call to btrfs_delalloc_reserve_space() ends up triggering 1884 * dirty page write out, then the btrfs_writepages() function could 1885 * end up waiting indefinitely to get a lock on the page currently 1886 * being processed by btrfs_page_mkwrite() function. 1887 */ 1888 ret = btrfs_check_data_free_space(inode, &data_reserved, page_start, 1889 reserved_space, false); 1890 if (ret < 0) { 1891 size_t write_bytes = reserved_space; 1892 1893 if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0) 1894 goto out_noreserve; 1895 1896 only_release_metadata = true; 1897 1898 /* 1899 * Can't write the whole range, there may be shared extents or 1900 * holes in the range, bail out with @only_release_metadata set 1901 * to true so that we unlock the nocow lock before returning the 1902 * error. 
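 * Note: in this NOCOW fallback only metadata space gets reserved below,
 * which is why the error paths that follow check @only_release_metadata
 * and release metadata space instead of data space.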
1903 */ 1904 if (write_bytes < reserved_space) 1905 goto out_noreserve; 1906 } 1907 ret = btrfs_delalloc_reserve_metadata(inode, reserved_space, 1908 reserved_space, false); 1909 if (ret < 0) { 1910 if (!only_release_metadata) 1911 btrfs_free_reserved_data_space(inode, data_reserved, 1912 page_start, reserved_space); 1913 goto out_noreserve; 1914 } 1915 1916 ret = file_update_time(vmf->vma->vm_file); 1917 if (ret < 0) 1918 goto out; 1919 again: 1920 down_read(&inode->i_mmap_lock); 1921 folio_lock(folio); 1922 size = i_size_read(&inode->vfs_inode); 1923 1924 if ((folio->mapping != inode->vfs_inode.i_mapping) || 1925 (page_start >= size)) { 1926 /* Page got truncated out from underneath us. */ 1927 goto out_unlock; 1928 } 1929 folio_wait_writeback(folio); 1930 1931 btrfs_lock_extent(io_tree, page_start, page_end, &cached_state); 1932 ret = set_folio_extent_mapped(folio); 1933 if (ret < 0) { 1934 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); 1935 goto out_unlock; 1936 } 1937 1938 /* 1939 * We can't set the delalloc bits if there are pending ordered 1940 * extents. Drop our locks and wait for them to finish. 1941 */ 1942 ordered = btrfs_lookup_ordered_range(inode, page_start, fsize); 1943 if (ordered) { 1944 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); 1945 folio_unlock(folio); 1946 up_read(&inode->i_mmap_lock); 1947 btrfs_start_ordered_extent(ordered); 1948 btrfs_put_ordered_extent(ordered); 1949 goto again; 1950 } 1951 1952 if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) { 1953 reserved_space = round_up(size - page_start, fs_info->sectorsize); 1954 if (reserved_space < fsize) { 1955 const u64 to_free = fsize - reserved_space; 1956 1957 end = page_start + reserved_space - 1; 1958 if (only_release_metadata) 1959 btrfs_delalloc_release_metadata(inode, to_free, true); 1960 else 1961 btrfs_delalloc_release_space(inode, data_reserved, 1962 end + 1, to_free, true); 1963 } 1964 } 1965 1966 /* 1967 * page_mkwrite gets called when the page is firstly dirtied after it's 1968 * faulted in, but write(2) could also dirty a page and set delalloc 1969 * bits, thus in this case for space account reason, we still need to 1970 * clear any delalloc bits within this page range since we have to 1971 * reserve data&meta space before lock_page() (see above comments). 1972 */ 1973 btrfs_clear_extent_bit(io_tree, page_start, end, 1974 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 1975 EXTENT_DEFRAG, &cached_state); 1976 1977 ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state); 1978 if (ret < 0) { 1979 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); 1980 goto out_unlock; 1981 } 1982 1983 /* Page is wholly or partially inside EOF. 
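 * If the folio straddles EOF, the part beyond EOF is zeroed below so
 * that a read through the mapping cannot see stale data past i_size.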
*/ 1984 if (page_start + folio_size(folio) > size) 1985 zero_start = offset_in_folio(folio, size); 1986 else 1987 zero_start = fsize; 1988 1989 if (zero_start != fsize) 1990 folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); 1991 1992 btrfs_folio_clear_checked(fs_info, folio, page_start, fsize); 1993 btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); 1994 btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); 1995 1996 btrfs_set_inode_last_sub_trans(inode); 1997 1998 if (only_release_metadata) 1999 btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE, 2000 &cached_state); 2001 2002 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); 2003 up_read(&inode->i_mmap_lock); 2004 2005 btrfs_delalloc_release_extents(inode, fsize); 2006 if (only_release_metadata) 2007 btrfs_check_nocow_unlock(inode); 2008 sb_end_pagefault(inode->vfs_inode.i_sb); 2009 extent_changeset_free(data_reserved); 2010 return VM_FAULT_LOCKED; 2011 2012 out_unlock: 2013 folio_unlock(folio); 2014 up_read(&inode->i_mmap_lock); 2015 out: 2016 btrfs_delalloc_release_extents(inode, fsize); 2017 if (only_release_metadata) 2018 btrfs_delalloc_release_metadata(inode, reserved_space, true); 2019 else 2020 btrfs_delalloc_release_space(inode, data_reserved, page_start, 2021 reserved_space, true); 2022 extent_changeset_free(data_reserved); 2023 out_noreserve: 2024 if (only_release_metadata) 2025 btrfs_check_nocow_unlock(inode); 2026 2027 sb_end_pagefault(inode->vfs_inode.i_sb); 2028 2029 if (ret < 0) 2030 return vmf_error(ret); 2031 2032 /* Make the VM retry the fault. */ 2033 return VM_FAULT_NOPAGE; 2034 } 2035 2036 static const struct vm_operations_struct btrfs_file_vm_ops = { 2037 .fault = filemap_fault, 2038 .map_pages = filemap_map_pages, 2039 .page_mkwrite = btrfs_page_mkwrite, 2040 }; 2041 2042 static int btrfs_file_mmap_prepare(struct vm_area_desc *desc) 2043 { 2044 struct file *filp = desc->file; 2045 struct address_space *mapping = filp->f_mapping; 2046 2047 if (!mapping->a_ops->read_folio) 2048 return -ENOEXEC; 2049 2050 file_accessed(filp); 2051 desc->vm_ops = &btrfs_file_vm_ops; 2052 2053 return 0; 2054 } 2055 2056 static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, 2057 int slot, u64 start, u64 end) 2058 { 2059 struct btrfs_file_extent_item *fi; 2060 struct btrfs_key key; 2061 2062 if (slot < 0 || slot >= btrfs_header_nritems(leaf)) 2063 return false; 2064 2065 btrfs_item_key_to_cpu(leaf, &key, slot); 2066 if (key.objectid != btrfs_ino(inode) || 2067 key.type != BTRFS_EXTENT_DATA_KEY) 2068 return false; 2069 2070 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 2071 2072 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) 2073 return false; 2074 2075 if (btrfs_file_extent_disk_bytenr(leaf, fi)) 2076 return false; 2077 2078 if (key.offset == end) 2079 return true; 2080 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) 2081 return true; 2082 return false; 2083 } 2084 2085 static int fill_holes(struct btrfs_trans_handle *trans, 2086 struct btrfs_inode *inode, 2087 struct btrfs_path *path, u64 offset, u64 end) 2088 { 2089 struct btrfs_fs_info *fs_info = trans->fs_info; 2090 struct btrfs_root *root = inode->root; 2091 struct extent_buffer *leaf; 2092 struct btrfs_file_extent_item *fi; 2093 struct extent_map *hole_em; 2094 struct btrfs_key key; 2095 int ret; 2096 2097 if (btrfs_fs_incompat(fs_info, NO_HOLES)) 2098 goto out; 2099 2100 key.objectid = btrfs_ino(inode); 2101 
key.type = BTRFS_EXTENT_DATA_KEY; 2102 key.offset = offset; 2103 2104 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2105 if (ret <= 0) { 2106 /* 2107 * We should have dropped this offset, so if we find it then 2108 * something has gone horribly wrong. 2109 */ 2110 if (ret == 0) 2111 ret = -EINVAL; 2112 return ret; 2113 } 2114 2115 leaf = path->nodes[0]; 2116 if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) { 2117 u64 num_bytes; 2118 2119 path->slots[0]--; 2120 fi = btrfs_item_ptr(leaf, path->slots[0], 2121 struct btrfs_file_extent_item); 2122 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + 2123 end - offset; 2124 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2125 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 2126 btrfs_set_file_extent_offset(leaf, fi, 0); 2127 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 2128 goto out; 2129 } 2130 2131 if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) { 2132 u64 num_bytes; 2133 2134 key.offset = offset; 2135 btrfs_set_item_key_safe(trans, path, &key); 2136 fi = btrfs_item_ptr(leaf, path->slots[0], 2137 struct btrfs_file_extent_item); 2138 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - 2139 offset; 2140 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2141 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 2142 btrfs_set_file_extent_offset(leaf, fi, 0); 2143 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 2144 goto out; 2145 } 2146 btrfs_release_path(path); 2147 2148 ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, 2149 end - offset); 2150 if (ret) 2151 return ret; 2152 2153 out: 2154 btrfs_release_path(path); 2155 2156 hole_em = btrfs_alloc_extent_map(); 2157 if (!hole_em) { 2158 btrfs_drop_extent_map_range(inode, offset, end - 1, false); 2159 btrfs_set_inode_full_sync(inode); 2160 } else { 2161 hole_em->start = offset; 2162 hole_em->len = end - offset; 2163 hole_em->ram_bytes = hole_em->len; 2164 2165 hole_em->disk_bytenr = EXTENT_MAP_HOLE; 2166 hole_em->disk_num_bytes = 0; 2167 hole_em->generation = trans->transid; 2168 2169 ret = btrfs_replace_extent_map_range(inode, hole_em, true); 2170 btrfs_free_extent_map(hole_em); 2171 if (ret) 2172 btrfs_set_inode_full_sync(inode); 2173 } 2174 2175 return 0; 2176 } 2177 2178 /* 2179 * Find a hole extent on given inode and change start/len to the end of hole 2180 * extent.(hole/vacuum extent whose em->start <= start && 2181 * em->start + em->len > start) 2182 * When a hole extent is found, return 1 and modify start/len. 2183 */ 2184 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) 2185 { 2186 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2187 struct extent_map *em; 2188 int ret = 0; 2189 2190 em = btrfs_get_extent(inode, NULL, 2191 round_down(*start, fs_info->sectorsize), 2192 round_up(*len, fs_info->sectorsize)); 2193 if (IS_ERR(em)) 2194 return PTR_ERR(em); 2195 2196 /* Hole or vacuum extent(only exists in no-hole mode) */ 2197 if (em->disk_bytenr == EXTENT_MAP_HOLE) { 2198 ret = 1; 2199 *len = em->start + em->len > *start + *len ? 2200 0 : *start + *len - em->start - em->len; 2201 *start = em->start + em->len; 2202 } 2203 btrfs_free_extent_map(em); 2204 return ret; 2205 } 2206 2207 /* 2208 * Check if there is no folio in the range. 
2209 *
2210 * We cannot utilize filemap_range_has_page() in a filemap with large folios
2211 * as we can hit the following false positive:
2212 *
2213 * start end
2214 * | |
2215 * |//|//|//|//| | | | | | | | |//|//|
2216 * \ / \ /
2217 * Folio A Folio B
2218 *
2219 * That is, large folios A and B cover the start and end indexes.
2220 * In that case filemap_range_has_page() will always return true, but the above
2221 * case is fine for btrfs_punch_hole_lock_range() usage.
2222 *
2223 * So here we only ensure that no other folios are in the range, excluding the
2224 * head/tail large folios.
2225 */
2226 static bool check_range_has_page(struct inode *inode, u64 start, u64 end)
2227 {
2228 struct folio_batch fbatch;
2229 bool ret = false;
2230 /*
2231 * For the subpage case, if the range is not at a page boundary, we could
2232 * have pages at the leading/tailing part of the range.
2233 * This could lead to an endless loop since filemap_range_has_page()
2234 * will always return true.
2235 * So here we need to do extra page alignment for
2236 * filemap_range_has_page().
2237 *
2238 * And do not decrease page_lockend right now, as it can be 0.
2239 */
2240 const u64 page_lockstart = round_up(start, PAGE_SIZE);
2241 const u64 page_lockend = round_down(end + 1, PAGE_SIZE);
2242 const pgoff_t start_index = page_lockstart >> PAGE_SHIFT;
2243 const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT;
2244 pgoff_t tmp = start_index;
2245 int found_folios;
2246
2247 /* The same page or adjacent pages. */
2248 if (page_lockend <= page_lockstart)
2249 return false;
2250
2251 folio_batch_init(&fbatch);
2252 found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch);
2253 for (int i = 0; i < found_folios; i++) {
2254 struct folio *folio = fbatch.folios[i];
2255
2256 /* A large folio begins before the start. Not a target. */
2257 if (folio->index < start_index)
2258 continue;
2259 /* A large folio extends beyond the end. Not a target. */
2260 if (folio_next_index(folio) > end_index)
2261 continue;
2262 /* A folio doesn't cover the head/tail index. Found a target. */
2263 ret = true;
2264 break;
2265 }
2266 folio_batch_release(&fbatch);
2267 return ret;
2268 }
2269
2270 static void btrfs_punch_hole_lock_range(struct inode *inode,
2271 const u64 lockstart, const u64 lockend,
2272 struct extent_state **cached_state)
2273 {
2274 while (1) {
2275 truncate_pagecache_range(inode, lockstart, lockend);
2276
2277 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2278 cached_state);
2279 /*
2280 * We can't have ordered extents in the range, nor dirty/writeback
2281 * pages, because we have locked the inode's VFS lock in exclusive
2282 * mode, we have locked the inode's i_mmap_lock in exclusive mode,
2283 * we have flushed all delalloc in the range and we have waited
2284 * for any ordered extents in the range to complete.
2285 * We can race with anyone reading pages from this range, so after
2286 * locking the range check if we have pages in the range, and if
2287 * we do, unlock the range and retry.
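 * Note that check_range_has_page() ignores large folios that merely
 * straddle the start or end of the range, so such folios alone never
 * force a retry here.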
2288 */ 2289 if (!check_range_has_page(inode, lockstart, lockend)) 2290 break; 2291 2292 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2293 cached_state); 2294 } 2295 2296 btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); 2297 } 2298 2299 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, 2300 struct btrfs_inode *inode, 2301 struct btrfs_path *path, 2302 struct btrfs_replace_extent_info *extent_info, 2303 const u64 replace_len, 2304 const u64 bytes_to_drop) 2305 { 2306 struct btrfs_fs_info *fs_info = trans->fs_info; 2307 struct btrfs_root *root = inode->root; 2308 struct btrfs_file_extent_item *extent; 2309 struct extent_buffer *leaf; 2310 struct btrfs_key key; 2311 int slot; 2312 int ret; 2313 2314 if (replace_len == 0) 2315 return 0; 2316 2317 if (extent_info->disk_offset == 0 && 2318 btrfs_fs_incompat(fs_info, NO_HOLES)) { 2319 btrfs_update_inode_bytes(inode, 0, bytes_to_drop); 2320 return 0; 2321 } 2322 2323 key.objectid = btrfs_ino(inode); 2324 key.type = BTRFS_EXTENT_DATA_KEY; 2325 key.offset = extent_info->file_offset; 2326 ret = btrfs_insert_empty_item(trans, root, path, &key, 2327 sizeof(struct btrfs_file_extent_item)); 2328 if (ret) 2329 return ret; 2330 leaf = path->nodes[0]; 2331 slot = path->slots[0]; 2332 write_extent_buffer(leaf, extent_info->extent_buf, 2333 btrfs_item_ptr_offset(leaf, slot), 2334 sizeof(struct btrfs_file_extent_item)); 2335 extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 2336 ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); 2337 btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset); 2338 btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); 2339 if (extent_info->is_new_extent) 2340 btrfs_set_file_extent_generation(leaf, extent, trans->transid); 2341 btrfs_release_path(path); 2342 2343 ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, 2344 replace_len); 2345 if (ret) 2346 return ret; 2347 2348 /* If it's a hole, nothing more needs to be done. */ 2349 if (extent_info->disk_offset == 0) { 2350 btrfs_update_inode_bytes(inode, 0, bytes_to_drop); 2351 return 0; 2352 } 2353 2354 btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop); 2355 2356 if (extent_info->is_new_extent && extent_info->insertions == 0) { 2357 key.objectid = extent_info->disk_offset; 2358 key.type = BTRFS_EXTENT_ITEM_KEY; 2359 key.offset = extent_info->disk_len; 2360 ret = btrfs_alloc_reserved_file_extent(trans, root, 2361 btrfs_ino(inode), 2362 extent_info->file_offset, 2363 extent_info->qgroup_reserved, 2364 &key); 2365 } else { 2366 struct btrfs_ref ref = { 2367 .action = BTRFS_ADD_DELAYED_REF, 2368 .bytenr = extent_info->disk_offset, 2369 .num_bytes = extent_info->disk_len, 2370 .owning_root = btrfs_root_id(root), 2371 .ref_root = btrfs_root_id(root), 2372 }; 2373 u64 ref_offset; 2374 2375 ref_offset = extent_info->file_offset - extent_info->data_offset; 2376 btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false); 2377 ret = btrfs_inc_extent_ref(trans, &ref); 2378 } 2379 2380 extent_info->insertions++; 2381 2382 return ret; 2383 } 2384 2385 /* 2386 * The respective range must have been previously locked, as well as the inode. 2387 * The end offset is inclusive (last byte of the range). 2388 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing 2389 * the file range with an extent. 
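 * On success, when a transaction handle is returned through @trans_out,
 * the caller is responsible for committing or ending that transaction.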
2390 * When not punching a hole, we don't want to end up in a state where we dropped 2391 * extents without inserting a new one, so we must abort the transaction to avoid 2392 * a corruption. 2393 */ 2394 int btrfs_replace_file_extents(struct btrfs_inode *inode, 2395 struct btrfs_path *path, const u64 start, 2396 const u64 end, 2397 struct btrfs_replace_extent_info *extent_info, 2398 struct btrfs_trans_handle **trans_out) 2399 { 2400 struct btrfs_drop_extents_args drop_args = { 0 }; 2401 struct btrfs_root *root = inode->root; 2402 struct btrfs_fs_info *fs_info = root->fs_info; 2403 u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); 2404 u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); 2405 struct btrfs_trans_handle *trans = NULL; 2406 struct btrfs_block_rsv rsv; 2407 unsigned int rsv_count; 2408 u64 cur_offset; 2409 u64 len = end - start; 2410 int ret = 0; 2411 2412 if (end <= start) 2413 return -EINVAL; 2414 2415 btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); 2416 rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1); 2417 rsv.failfast = true; 2418 2419 /* 2420 * 1 - update the inode 2421 * 1 - removing the extents in the range 2422 * 1 - adding the hole extent if no_holes isn't set or if we are 2423 * replacing the range with a new extent 2424 */ 2425 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info) 2426 rsv_count = 3; 2427 else 2428 rsv_count = 2; 2429 2430 trans = btrfs_start_transaction(root, rsv_count); 2431 if (IS_ERR(trans)) { 2432 ret = PTR_ERR(trans); 2433 trans = NULL; 2434 goto out_release; 2435 } 2436 2437 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv, 2438 min_size, false); 2439 if (WARN_ON(ret)) 2440 goto out_trans; 2441 trans->block_rsv = &rsv; 2442 2443 cur_offset = start; 2444 drop_args.path = path; 2445 drop_args.end = end + 1; 2446 drop_args.drop_cache = true; 2447 while (cur_offset < end) { 2448 drop_args.start = cur_offset; 2449 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 2450 /* If we are punching a hole decrement the inode's byte count */ 2451 if (!extent_info) 2452 btrfs_update_inode_bytes(inode, 0, 2453 drop_args.bytes_found); 2454 if (ret != -ENOSPC) { 2455 /* 2456 * The only time we don't want to abort is if we are 2457 * attempting to clone a partial inline extent, in which 2458 * case we'll get EOPNOTSUPP. However if we aren't 2459 * clone we need to abort no matter what, because if we 2460 * got EOPNOTSUPP via prealloc then we messed up and 2461 * need to abort. 2462 */ 2463 if (ret && 2464 (ret != -EOPNOTSUPP || 2465 (extent_info && extent_info->is_new_extent))) 2466 btrfs_abort_transaction(trans, ret); 2467 break; 2468 } 2469 2470 trans->block_rsv = &fs_info->trans_block_rsv; 2471 2472 if (!extent_info && cur_offset < drop_args.drop_end && 2473 cur_offset < ino_size) { 2474 ret = fill_holes(trans, inode, path, cur_offset, 2475 drop_args.drop_end); 2476 if (ret) { 2477 /* 2478 * If we failed then we didn't insert our hole 2479 * entries for the area we dropped, so now the 2480 * fs is corrupted, so we must abort the 2481 * transaction. 2482 */ 2483 btrfs_abort_transaction(trans, ret); 2484 break; 2485 } 2486 } else if (!extent_info && cur_offset < drop_args.drop_end) { 2487 /* 2488 * We are past the i_size here, but since we didn't 2489 * insert holes we need to clear the mapped area so we 2490 * know to not set disk_i_size in this area until a new 2491 * file extent is inserted here. 
2492 */
2493 ret = btrfs_inode_clear_file_extent_range(inode,
2494 cur_offset,
2495 drop_args.drop_end - cur_offset);
2496 if (ret) {
2497 /*
2498 * We couldn't clear our area, so we could
2499 * presumably adjust up and corrupt the fs, so
2500 * we need to abort.
2501 */
2502 btrfs_abort_transaction(trans, ret);
2503 break;
2504 }
2505 }
2506
2507 if (extent_info &&
2508 drop_args.drop_end > extent_info->file_offset) {
2509 u64 replace_len = drop_args.drop_end -
2510 extent_info->file_offset;
2511
2512 ret = btrfs_insert_replace_extent(trans, inode, path,
2513 extent_info, replace_len,
2514 drop_args.bytes_found);
2515 if (ret) {
2516 btrfs_abort_transaction(trans, ret);
2517 break;
2518 }
2519 extent_info->data_len -= replace_len;
2520 extent_info->data_offset += replace_len;
2521 extent_info->file_offset += replace_len;
2522 }
2523
2524 /*
2525 * We are releasing our handle on the transaction, balance the
2526 * dirty pages of the btree inode and flush delayed items, and
2527 * then get a new transaction handle, which may now point to a
2528 * new transaction in case someone else may have committed the
2529 * transaction we used to replace/drop file extent items. So
2530 * bump the inode's iversion and update mtime and ctime except
2531 * if we are called from a dedupe context. This is because a
2532 * power failure/crash may happen after the transaction is
2533 * committed and before we finish replacing/dropping all the
2534 * file extent items we need.
2535 */
2536 inode_inc_iversion(&inode->vfs_inode);
2537
2538 if (!extent_info || extent_info->update_times)
2539 inode_set_mtime_to_ts(&inode->vfs_inode,
2540 inode_set_ctime_current(&inode->vfs_inode));
2541
2542 ret = btrfs_update_inode(trans, inode);
2543 if (ret)
2544 break;
2545
2546 btrfs_end_transaction(trans);
2547 btrfs_btree_balance_dirty(fs_info);
2548
2549 trans = btrfs_start_transaction(root, rsv_count);
2550 if (IS_ERR(trans)) {
2551 ret = PTR_ERR(trans);
2552 trans = NULL;
2553 break;
2554 }
2555
2556 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2557 &rsv, min_size, false);
2558 if (WARN_ON(ret))
2559 break;
2560 trans->block_rsv = &rsv;
2561
2562 cur_offset = drop_args.drop_end;
2563 len = end - cur_offset;
2564 if (!extent_info && len) {
2565 ret = find_first_non_hole(inode, &cur_offset, &len);
2566 if (unlikely(ret < 0))
2567 break;
2568 if (ret && !len) {
2569 ret = 0;
2570 break;
2571 }
2572 }
2573 }
2574
2575 /*
2576 * If we were cloning, force the next fsync to be a full one since we
2577 * replaced (or just dropped in the case of cloning holes when
2578 * NO_HOLES is enabled) file extent items and did not set up new extent
2579 * maps for the replacement extents (or holes).
2580 */
2581 if (extent_info && !extent_info->is_new_extent)
2582 btrfs_set_inode_full_sync(inode);
2583
2584 if (ret)
2585 goto out_trans;
2586
2587 trans->block_rsv = &fs_info->trans_block_rsv;
2588 /*
2589 * If we are using the NO_HOLES feature we might already have had a
2590 * hole that overlaps a part of the region [lockstart, lockend] and
2591 * ends at (or beyond) lockend. Since we have no file extent items to
2592 * represent holes, drop_end can be less than lockend and so we must
2593 * make sure we have an extent map representing the existing hole (the
2594 * call to __btrfs_drop_extents() might have dropped the existing extent
2595 * map representing the existing hole), otherwise the fast fsync path
2596 * will not record the existence of the hole region
2597 * [existing_hole_start, lockend].
2598 */ 2599 if (drop_args.drop_end <= end) 2600 drop_args.drop_end = end + 1; 2601 /* 2602 * Don't insert file hole extent item if it's for a range beyond eof 2603 * (because it's useless) or if it represents a 0 bytes range (when 2604 * cur_offset == drop_end). 2605 */ 2606 if (!extent_info && cur_offset < ino_size && 2607 cur_offset < drop_args.drop_end) { 2608 ret = fill_holes(trans, inode, path, cur_offset, 2609 drop_args.drop_end); 2610 if (ret) { 2611 /* Same comment as above. */ 2612 btrfs_abort_transaction(trans, ret); 2613 goto out_trans; 2614 } 2615 } else if (!extent_info && cur_offset < drop_args.drop_end) { 2616 /* See the comment in the loop above for the reasoning here. */ 2617 ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, 2618 drop_args.drop_end - cur_offset); 2619 if (ret) { 2620 btrfs_abort_transaction(trans, ret); 2621 goto out_trans; 2622 } 2623 2624 } 2625 if (extent_info) { 2626 ret = btrfs_insert_replace_extent(trans, inode, path, 2627 extent_info, extent_info->data_len, 2628 drop_args.bytes_found); 2629 if (ret) { 2630 btrfs_abort_transaction(trans, ret); 2631 goto out_trans; 2632 } 2633 } 2634 2635 out_trans: 2636 if (!trans) 2637 goto out_release; 2638 2639 trans->block_rsv = &fs_info->trans_block_rsv; 2640 if (ret) 2641 btrfs_end_transaction(trans); 2642 else 2643 *trans_out = trans; 2644 out_release: 2645 btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); 2646 return ret; 2647 } 2648 2649 static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) 2650 { 2651 struct inode *inode = file_inode(file); 2652 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 2653 struct btrfs_root *root = BTRFS_I(inode)->root; 2654 struct extent_state *cached_state = NULL; 2655 struct btrfs_path *path; 2656 struct btrfs_trans_handle *trans = NULL; 2657 u64 lockstart; 2658 u64 lockend; 2659 u64 tail_start; 2660 u64 tail_len; 2661 const u64 orig_start = offset; 2662 const u64 orig_end = offset + len - 1; 2663 int ret = 0; 2664 bool same_block; 2665 u64 ino_size; 2666 bool truncated_block = false; 2667 bool updated_inode = false; 2668 2669 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 2670 2671 ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len); 2672 if (ret) 2673 goto out_only_mutex; 2674 2675 ino_size = round_up(inode->i_size, fs_info->sectorsize); 2676 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); 2677 if (ret < 0) 2678 goto out_only_mutex; 2679 if (ret && !len) { 2680 /* Already in a large hole */ 2681 ret = 0; 2682 goto out_only_mutex; 2683 } 2684 2685 ret = file_modified(file); 2686 if (ret) 2687 goto out_only_mutex; 2688 2689 lockstart = round_up(offset, fs_info->sectorsize); 2690 lockend = round_down(offset + len, fs_info->sectorsize) - 1; 2691 same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) 2692 == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); 2693 /* 2694 * Only do this if we are in the same block and we aren't doing the 2695 * entire block. 
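 * Zeroing the sub-block range below is enough in that case, no file
 * extent items need to be dropped or re-inserted.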
2696 */ 2697 if (same_block && len < fs_info->sectorsize) { 2698 if (offset < ino_size) { 2699 truncated_block = true; 2700 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, 2701 orig_start, orig_end); 2702 } else { 2703 ret = 0; 2704 } 2705 goto out_only_mutex; 2706 } 2707 2708 /* zero back part of the first block */ 2709 if (offset < ino_size) { 2710 truncated_block = true; 2711 ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end); 2712 if (ret) { 2713 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 2714 return ret; 2715 } 2716 } 2717 2718 /* Check the aligned pages after the first unaligned page, 2719 * if offset != orig_start, which means the first unaligned page 2720 * including several following pages are already in holes, 2721 * the extra check can be skipped */ 2722 if (offset == orig_start) { 2723 /* after truncate page, check hole again */ 2724 len = offset + len - lockstart; 2725 offset = lockstart; 2726 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); 2727 if (ret < 0) 2728 goto out_only_mutex; 2729 if (ret && !len) { 2730 ret = 0; 2731 goto out_only_mutex; 2732 } 2733 lockstart = offset; 2734 } 2735 2736 /* Check the tail unaligned part is in a hole */ 2737 tail_start = lockend + 1; 2738 tail_len = offset + len - tail_start; 2739 if (tail_len) { 2740 ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len); 2741 if (unlikely(ret < 0)) 2742 goto out_only_mutex; 2743 if (!ret) { 2744 /* zero the front end of the last page */ 2745 if (tail_start + tail_len < ino_size) { 2746 truncated_block = true; 2747 ret = btrfs_truncate_block(BTRFS_I(inode), 2748 tail_start + tail_len - 1, 2749 orig_start, orig_end); 2750 if (ret) 2751 goto out_only_mutex; 2752 } 2753 } 2754 } 2755 2756 if (lockend < lockstart) { 2757 ret = 0; 2758 goto out_only_mutex; 2759 } 2760 2761 btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); 2762 2763 path = btrfs_alloc_path(); 2764 if (!path) { 2765 ret = -ENOMEM; 2766 goto out; 2767 } 2768 2769 ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart, 2770 lockend, NULL, &trans); 2771 btrfs_free_path(path); 2772 if (ret) 2773 goto out; 2774 2775 ASSERT(trans != NULL); 2776 inode_inc_iversion(inode); 2777 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); 2778 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 2779 updated_inode = true; 2780 btrfs_end_transaction(trans); 2781 btrfs_btree_balance_dirty(fs_info); 2782 out: 2783 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2784 &cached_state); 2785 out_only_mutex: 2786 if (!updated_inode && truncated_block && !ret) { 2787 /* 2788 * If we only end up zeroing part of a page, we still need to 2789 * update the inode item, so that all the time fields are 2790 * updated as well as the necessary btrfs inode in memory fields 2791 * for detecting, at fsync time, if the inode isn't yet in the 2792 * log tree or it's there but not up to date. 
2793 */ 2794 struct timespec64 now = inode_set_ctime_current(inode); 2795 2796 inode_inc_iversion(inode); 2797 inode_set_mtime_to_ts(inode, now); 2798 trans = btrfs_start_transaction(root, 1); 2799 if (IS_ERR(trans)) { 2800 ret = PTR_ERR(trans); 2801 } else { 2802 int ret2; 2803 2804 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 2805 ret2 = btrfs_end_transaction(trans); 2806 if (!ret) 2807 ret = ret2; 2808 } 2809 } 2810 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 2811 return ret; 2812 } 2813 2814 /* Helper structure to record which range is already reserved */ 2815 struct falloc_range { 2816 struct list_head list; 2817 u64 start; 2818 u64 len; 2819 }; 2820 2821 /* 2822 * Helper function to add falloc range 2823 * 2824 * Caller should have locked the larger range of extent containing 2825 * [start, len) 2826 */ 2827 static int add_falloc_range(struct list_head *head, u64 start, u64 len) 2828 { 2829 struct falloc_range *range = NULL; 2830 2831 if (!list_empty(head)) { 2832 /* 2833 * As fallocate iterates by bytenr order, we only need to check 2834 * the last range. 2835 */ 2836 range = list_last_entry(head, struct falloc_range, list); 2837 if (range->start + range->len == start) { 2838 range->len += len; 2839 return 0; 2840 } 2841 } 2842 2843 range = kmalloc(sizeof(*range), GFP_KERNEL); 2844 if (!range) 2845 return -ENOMEM; 2846 range->start = start; 2847 range->len = len; 2848 list_add_tail(&range->list, head); 2849 return 0; 2850 } 2851 2852 static int btrfs_fallocate_update_isize(struct inode *inode, 2853 const u64 end, 2854 const int mode) 2855 { 2856 struct btrfs_trans_handle *trans; 2857 struct btrfs_root *root = BTRFS_I(inode)->root; 2858 int ret; 2859 int ret2; 2860 2861 if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) 2862 return 0; 2863 2864 trans = btrfs_start_transaction(root, 1); 2865 if (IS_ERR(trans)) 2866 return PTR_ERR(trans); 2867 2868 inode_set_ctime_current(inode); 2869 i_size_write(inode, end); 2870 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 2871 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 2872 ret2 = btrfs_end_transaction(trans); 2873 2874 return ret ? 
ret : ret2; 2875 } 2876 2877 enum { 2878 RANGE_BOUNDARY_WRITTEN_EXTENT, 2879 RANGE_BOUNDARY_PREALLOC_EXTENT, 2880 RANGE_BOUNDARY_HOLE, 2881 }; 2882 2883 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, 2884 u64 offset) 2885 { 2886 const u64 sectorsize = inode->root->fs_info->sectorsize; 2887 struct extent_map *em; 2888 int ret; 2889 2890 offset = round_down(offset, sectorsize); 2891 em = btrfs_get_extent(inode, NULL, offset, sectorsize); 2892 if (IS_ERR(em)) 2893 return PTR_ERR(em); 2894 2895 if (em->disk_bytenr == EXTENT_MAP_HOLE) 2896 ret = RANGE_BOUNDARY_HOLE; 2897 else if (em->flags & EXTENT_FLAG_PREALLOC) 2898 ret = RANGE_BOUNDARY_PREALLOC_EXTENT; 2899 else 2900 ret = RANGE_BOUNDARY_WRITTEN_EXTENT; 2901 2902 btrfs_free_extent_map(em); 2903 return ret; 2904 } 2905 2906 static int btrfs_zero_range(struct inode *inode, 2907 loff_t offset, 2908 loff_t len, 2909 const int mode) 2910 { 2911 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2912 struct extent_map *em; 2913 struct extent_changeset *data_reserved = NULL; 2914 int ret; 2915 u64 alloc_hint = 0; 2916 const u64 sectorsize = fs_info->sectorsize; 2917 const u64 orig_start = offset; 2918 const u64 orig_end = offset + len - 1; 2919 u64 alloc_start = round_down(offset, sectorsize); 2920 u64 alloc_end = round_up(offset + len, sectorsize); 2921 u64 bytes_to_reserve = 0; 2922 bool space_reserved = false; 2923 2924 em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, 2925 alloc_end - alloc_start); 2926 if (IS_ERR(em)) { 2927 ret = PTR_ERR(em); 2928 goto out; 2929 } 2930 2931 /* 2932 * Avoid hole punching and extent allocation for some cases. More cases 2933 * could be considered, but these are unlikely common and we keep things 2934 * as simple as possible for now. Also, intentionally, if the target 2935 * range contains one or more prealloc extents together with regular 2936 * extents and holes, we drop all the existing extents and allocate a 2937 * new prealloc extent, so that we get a larger contiguous disk extent. 2938 */ 2939 if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) { 2940 const u64 em_end = em->start + em->len; 2941 2942 if (em_end >= offset + len) { 2943 /* 2944 * The whole range is already a prealloc extent, 2945 * do nothing except updating the inode's i_size if 2946 * needed. 2947 */ 2948 btrfs_free_extent_map(em); 2949 ret = btrfs_fallocate_update_isize(inode, offset + len, 2950 mode); 2951 goto out; 2952 } 2953 /* 2954 * Part of the range is already a prealloc extent, so operate 2955 * only on the remaining part of the range. 
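 * Since file extents, including prealloc ones, are sectorsize aligned,
 * em_end is aligned too, which is what the ASSERT on alloc_start below
 * relies on.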
2956 */ 2957 alloc_start = em_end; 2958 ASSERT(IS_ALIGNED(alloc_start, sectorsize)); 2959 len = offset + len - alloc_start; 2960 offset = alloc_start; 2961 alloc_hint = btrfs_extent_map_block_start(em) + em->len; 2962 } 2963 btrfs_free_extent_map(em); 2964 2965 if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == 2966 BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { 2967 em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize); 2968 if (IS_ERR(em)) { 2969 ret = PTR_ERR(em); 2970 goto out; 2971 } 2972 2973 if (em->flags & EXTENT_FLAG_PREALLOC) { 2974 btrfs_free_extent_map(em); 2975 ret = btrfs_fallocate_update_isize(inode, offset + len, 2976 mode); 2977 goto out; 2978 } 2979 if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) { 2980 btrfs_free_extent_map(em); 2981 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, 2982 orig_start, orig_end); 2983 if (!ret) 2984 ret = btrfs_fallocate_update_isize(inode, 2985 offset + len, 2986 mode); 2987 return ret; 2988 } 2989 btrfs_free_extent_map(em); 2990 alloc_start = round_down(offset, sectorsize); 2991 alloc_end = alloc_start + sectorsize; 2992 goto reserve_space; 2993 } 2994 2995 alloc_start = round_up(offset, sectorsize); 2996 alloc_end = round_down(offset + len, sectorsize); 2997 2998 /* 2999 * For unaligned ranges, check the pages at the boundaries, they might 3000 * map to an extent, in which case we need to partially zero them, or 3001 * they might map to a hole, in which case we need our allocation range 3002 * to cover them. 3003 */ 3004 if (!IS_ALIGNED(offset, sectorsize)) { 3005 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), 3006 offset); 3007 if (ret < 0) 3008 goto out; 3009 if (ret == RANGE_BOUNDARY_HOLE) { 3010 alloc_start = round_down(offset, sectorsize); 3011 ret = 0; 3012 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { 3013 ret = btrfs_truncate_block(BTRFS_I(inode), offset, 3014 orig_start, orig_end); 3015 if (ret) 3016 goto out; 3017 } else { 3018 ret = 0; 3019 } 3020 } 3021 3022 if (!IS_ALIGNED(offset + len, sectorsize)) { 3023 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), 3024 offset + len); 3025 if (ret < 0) 3026 goto out; 3027 if (ret == RANGE_BOUNDARY_HOLE) { 3028 alloc_end = round_up(offset + len, sectorsize); 3029 ret = 0; 3030 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { 3031 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, 3032 orig_start, orig_end); 3033 if (ret) 3034 goto out; 3035 } else { 3036 ret = 0; 3037 } 3038 } 3039 3040 reserve_space: 3041 if (alloc_start < alloc_end) { 3042 struct extent_state *cached_state = NULL; 3043 const u64 lockstart = alloc_start; 3044 const u64 lockend = alloc_end - 1; 3045 3046 bytes_to_reserve = alloc_end - alloc_start; 3047 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), 3048 bytes_to_reserve); 3049 if (ret < 0) 3050 goto out; 3051 space_reserved = true; 3052 btrfs_punch_hole_lock_range(inode, lockstart, lockend, 3053 &cached_state); 3054 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, 3055 alloc_start, bytes_to_reserve); 3056 if (ret) { 3057 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, 3058 lockend, &cached_state); 3059 goto out; 3060 } 3061 ret = btrfs_prealloc_file_range(inode, mode, alloc_start, 3062 alloc_end - alloc_start, 3063 fs_info->sectorsize, 3064 offset + len, &alloc_hint); 3065 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 3066 &cached_state); 3067 /* btrfs_prealloc_file_range releases reserved space on error */ 3068 if (ret) { 3069 space_reserved = 
false;
3070 goto out;
3071 }
3072 }
3073 ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3074 out:
3075 if (ret && space_reserved)
3076 btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3077 alloc_start, bytes_to_reserve);
3078 extent_changeset_free(data_reserved);
3079
3080 return ret;
3081 }
3082
3083 static long btrfs_fallocate(struct file *file, int mode,
3084 loff_t offset, loff_t len)
3085 {
3086 struct inode *inode = file_inode(file);
3087 struct extent_state *cached_state = NULL;
3088 struct extent_changeset *data_reserved = NULL;
3089 struct falloc_range *range;
3090 struct falloc_range *tmp;
3091 LIST_HEAD(reserve_list);
3092 u64 cur_offset;
3093 u64 last_byte;
3094 u64 alloc_start;
3095 u64 alloc_end;
3096 u64 alloc_hint = 0;
3097 u64 locked_end;
3098 u64 actual_end = 0;
3099 u64 data_space_needed = 0;
3100 u64 data_space_reserved = 0;
3101 u64 qgroup_reserved = 0;
3102 struct extent_map *em;
3103 int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
3104 int ret;
3105
3106 /* Do not allow fallocate in ZONED mode */
3107 if (btrfs_is_zoned(inode_to_fs_info(inode)))
3108 return -EOPNOTSUPP;
3109
3110 alloc_start = round_down(offset, blocksize);
3111 alloc_end = round_up(offset + len, blocksize);
3112 cur_offset = alloc_start;
3113
3114 /* Make sure we aren't being given some crap mode */
3115 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3116 FALLOC_FL_ZERO_RANGE))
3117 return -EOPNOTSUPP;
3118
3119 if (mode & FALLOC_FL_PUNCH_HOLE)
3120 return btrfs_punch_hole(file, offset, len);
3121
3122 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3123
3124 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3125 ret = inode_newsize_ok(inode, offset + len);
3126 if (ret)
3127 goto out;
3128 }
3129
3130 ret = file_modified(file);
3131 if (ret)
3132 goto out;
3133
3134 /*
3135 * TODO: Move these two operations after we have checked
3136 * accurate reserved space, or fallocate can still fail but
3137 * with page truncated or size expanded.
3138 *
3139 * But that's a minor problem and won't do much harm BTW.
3140 */
3141 if (alloc_start > inode->i_size) {
3142 ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
3143 alloc_start);
3144 if (ret)
3145 goto out;
3146 } else if (offset + len > inode->i_size) {
3147 /*
3148 * If we are fallocating from the end of the file onward we
3149 * need to zero out the end of the block if i_size lands in the
3150 * middle of a block.
3151 */
3152 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size,
3153 inode->i_size, (u64)-1);
3154 if (ret)
3155 goto out;
3156 }
3157
3158 /*
3159 * We have locked the inode at the VFS level (in exclusive mode) and we
3160 * have locked the i_mmap_lock (in exclusive mode). Now before
3161 * locking the file range, flush all delalloc in the range and wait for
3162 * all ordered extents in the range to complete. After this we can lock
3163 * the file range and, due to the previous locking we did, we know there
3164 * can't be more delalloc or ordered extents in the range.
3165 */ 3166 ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start, 3167 alloc_end - alloc_start); 3168 if (ret) 3169 goto out; 3170 3171 if (mode & FALLOC_FL_ZERO_RANGE) { 3172 ret = btrfs_zero_range(inode, offset, len, mode); 3173 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 3174 return ret; 3175 } 3176 3177 locked_end = alloc_end - 1; 3178 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 3179 &cached_state); 3180 3181 btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); 3182 3183 /* First, check if we exceed the qgroup limit */ 3184 while (cur_offset < alloc_end) { 3185 em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset, 3186 alloc_end - cur_offset); 3187 if (IS_ERR(em)) { 3188 ret = PTR_ERR(em); 3189 break; 3190 } 3191 last_byte = min(btrfs_extent_map_end(em), alloc_end); 3192 actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len); 3193 last_byte = ALIGN(last_byte, blocksize); 3194 if (em->disk_bytenr == EXTENT_MAP_HOLE || 3195 (cur_offset >= inode->i_size && 3196 !(em->flags & EXTENT_FLAG_PREALLOC))) { 3197 const u64 range_len = last_byte - cur_offset; 3198 3199 ret = add_falloc_range(&reserve_list, cur_offset, range_len); 3200 if (ret < 0) { 3201 btrfs_free_extent_map(em); 3202 break; 3203 } 3204 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), 3205 &data_reserved, cur_offset, range_len); 3206 if (ret < 0) { 3207 btrfs_free_extent_map(em); 3208 break; 3209 } 3210 qgroup_reserved += range_len; 3211 data_space_needed += range_len; 3212 } 3213 btrfs_free_extent_map(em); 3214 cur_offset = last_byte; 3215 } 3216 3217 if (!ret && data_space_needed > 0) { 3218 /* 3219 * We are safe to reserve space here as we can't have delalloc 3220 * in the range, see above. 3221 */ 3222 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), 3223 data_space_needed); 3224 if (!ret) 3225 data_space_reserved = data_space_needed; 3226 } 3227 3228 /* 3229 * If ret is still 0, means we're OK to fallocate. 3230 * Or just cleanup the list and exit. 3231 */ 3232 list_for_each_entry_safe(range, tmp, &reserve_list, list) { 3233 if (!ret) { 3234 ret = btrfs_prealloc_file_range(inode, mode, 3235 range->start, 3236 range->len, blocksize, 3237 offset + len, &alloc_hint); 3238 /* 3239 * btrfs_prealloc_file_range() releases space even 3240 * if it returns an error. 3241 */ 3242 data_space_reserved -= range->len; 3243 qgroup_reserved -= range->len; 3244 } else if (data_space_reserved > 0) { 3245 btrfs_free_reserved_data_space(BTRFS_I(inode), 3246 data_reserved, range->start, 3247 range->len); 3248 data_space_reserved -= range->len; 3249 qgroup_reserved -= range->len; 3250 } else if (qgroup_reserved > 0) { 3251 btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, 3252 range->start, range->len, NULL); 3253 qgroup_reserved -= range->len; 3254 } 3255 list_del(&range->list); 3256 kfree(range); 3257 } 3258 if (ret < 0) 3259 goto out_unlock; 3260 3261 /* 3262 * We didn't need to allocate any more space, but we still extended the 3263 * size of the file so we need to update i_size and the inode item. 3264 */ 3265 ret = btrfs_fallocate_update_isize(inode, actual_end, mode); 3266 out_unlock: 3267 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 3268 &cached_state); 3269 out: 3270 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 3271 extent_changeset_free(data_reserved); 3272 return ret; 3273 } 3274 3275 /* 3276 * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range 3277 * that has unflushed and/or flushing delalloc. 
There might be other adjacent
3278 * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
3279 * looping while it gets adjacent subranges, merging them together.
3280 */
3281 static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
3282 struct extent_state **cached_state,
3283 bool *search_io_tree,
3284 u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3285 {
3286 u64 len = end + 1 - start;
3287 u64 delalloc_len = 0;
3288 struct btrfs_ordered_extent *oe;
3289 u64 oe_start;
3290 u64 oe_end;
3291
3292 /*
3293 * Search the io tree first for EXTENT_DELALLOC. If we find any, it
3294 * means we have delalloc (dirty pages) for which writeback has not
3295 * started yet.
3296 */
3297 if (*search_io_tree) {
3298 spin_lock(&inode->lock);
3299 if (inode->delalloc_bytes > 0) {
3300 spin_unlock(&inode->lock);
3301 *delalloc_start_ret = start;
3302 delalloc_len = btrfs_count_range_bits(&inode->io_tree,
3303 delalloc_start_ret, end,
3304 len, EXTENT_DELALLOC, 1,
3305 cached_state);
3306 } else {
3307 spin_unlock(&inode->lock);
3308 }
3309 }
3310
3311 if (delalloc_len > 0) {
3312 /*
3313 * If delalloc was found then *delalloc_start_ret has a sector size
3314 * aligned value (rounded down).
3315 */
3316 *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
3317
3318 if (*delalloc_start_ret == start) {
3319 /* Delalloc for the whole range, nothing more to do. */
3320 if (*delalloc_end_ret == end)
3321 return true;
3322 /* Else trim our search range for ordered extents. */
3323 start = *delalloc_end_ret + 1;
3324 len = end + 1 - start;
3325 }
3326 } else {
3327 /* No delalloc, future calls don't need to search again. */
3328 *search_io_tree = false;
3329 }
3330
3331 /*
3332 * Now also check if there's any ordered extent in the range.
3333 * We do this because:
3334 *
3335 * 1) When delalloc is flushed, the file range is locked, we clear the
3336 * EXTENT_DELALLOC bit from the io tree and create an extent map and
3337 * an ordered extent for the write. So we might just have been called
3338 * after delalloc is flushed and before the ordered extent completes
3339 * and inserts the new file extent item in the subvolume's btree;
3340 *
3341 * 2) We may have an ordered extent created by flushing delalloc for a
3342 * subrange that starts before the subrange we found marked with
3343 * EXTENT_DELALLOC in the io tree.
3344 *
3345 * We could also use the extent map tree to find such delalloc that is
3346 * being flushed, but using the ordered extents tree is more efficient
3347 * because it's usually much smaller as ordered extents are removed from
3348 * the tree once they complete. With the extent maps, we may have them
3349 * in the extent map tree for a very long time, and they were either
3350 * created by previous writes or loaded by read operations.
3351 */
3352 oe = btrfs_lookup_first_ordered_range(inode, start, len);
3353 if (!oe)
3354 return (delalloc_len > 0);
3355
3356 /* The ordered extent may span beyond our search range. */
3357 oe_start = max(oe->file_offset, start);
3358 oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
3359
3360 btrfs_put_ordered_extent(oe);
3361
3362 /* Don't have unflushed delalloc, return the ordered extent range. */
3363 if (delalloc_len == 0) {
3364 *delalloc_start_ret = oe_start;
3365 *delalloc_end_ret = oe_end;
3366 return true;
3367 }
3368
3369 /*
3370 * We have both unflushed delalloc (io_tree) and an ordered extent.
3371 * If the ranges are adjacent, return a combined range, otherwise
3372 * return the leftmost range.
3373 */
3374 if (oe_start < *delalloc_start_ret) {
3375 if (oe_end < *delalloc_start_ret)
3376 *delalloc_end_ret = oe_end;
3377 *delalloc_start_ret = oe_start;
3378 } else if (*delalloc_end_ret + 1 == oe_start) {
3379 *delalloc_end_ret = oe_end;
3380 }
3381
3382 return true;
3383 }
3384
3385 /*
3386 * Check if there's delalloc in a given range.
3387 *
3388 * @inode: The inode.
3389 * @start: The start offset of the range. It does not need to be
3390 * sector size aligned.
3391 * @end: The end offset (inclusive value) of the search range.
3392 * It does not need to be sector size aligned.
3393 * @cached_state: Extent state record used for speeding up delalloc
3394 * searches in the inode's io_tree. Can be NULL.
3395 * @delalloc_start_ret: Output argument, set to the start offset of the
3396 * subrange found with delalloc (may not be sector size
3397 * aligned).
3398 * @delalloc_end_ret: Output argument, set to the end offset (inclusive value)
3399 * of the subrange found with delalloc.
3400 *
3401 * Returns true if a subrange with delalloc is found within the given range, and
3402 * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3403 * end offsets of the subrange.
3404 */
3405 bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3406 struct extent_state **cached_state,
3407 u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3408 {
3409 u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3410 u64 prev_delalloc_end = 0;
3411 bool search_io_tree = true;
3412 bool ret = false;
3413
3414 while (cur_offset <= end) {
3415 u64 delalloc_start;
3416 u64 delalloc_end;
3417 bool delalloc;
3418
3419 delalloc = find_delalloc_subrange(inode, cur_offset, end,
3420 cached_state, &search_io_tree,
3421 &delalloc_start,
3422 &delalloc_end);
3423 if (!delalloc)
3424 break;
3425
3426 if (prev_delalloc_end == 0) {
3427 /* First subrange found. */
3428 *delalloc_start_ret = max(delalloc_start, start);
3429 *delalloc_end_ret = delalloc_end;
3430 ret = true;
3431 } else if (delalloc_start == prev_delalloc_end + 1) {
3432 /* Subrange adjacent to the previous one, merge them. */
3433 *delalloc_end_ret = delalloc_end;
3434 } else {
3435 /* Subrange not adjacent to the previous one, exit. */
3436 break;
3437 }
3438
3439 prev_delalloc_end = delalloc_end;
3440 cur_offset = delalloc_end + 1;
3441 cond_resched();
3442 }
3443
3444 return ret;
3445 }
3446
3447 /*
3448 * Check if there's a hole or delalloc range in a range representing a hole (or
3449 * prealloc extent) found in the inode's subvolume btree.
3450 *
3451 * @inode: The inode.
3452 * @whence: Seek mode (SEEK_DATA or SEEK_HOLE).
3453 * @start: Start offset of the hole region. It does not need to be sector
3454 * size aligned.
3455 * @end: End offset (inclusive value) of the hole region. It does not
3456 * need to be sector size aligned.
3457 * @start_ret: Return parameter, used to set the start of the subrange in the
3458 * hole that matches the search criteria (seek mode), if such
3459 * subrange is found (return value of the function is true).
3460 * The value returned here may not be sector size aligned.
3461 *
3462 * Returns true if a subrange matching the given seek mode is found, and if one
3463 * is found, it updates @start_ret with the start of the subrange.
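 * A false return means the caller has to keep going: for SEEK_DATA the
 * next extent item must be examined, while for SEEK_HOLE it means the
 * whole range is covered by delalloc.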
3464 */
3465 static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
3466 struct extent_state **cached_state,
3467 u64 start, u64 end, u64 *start_ret)
3468 {
3469 u64 delalloc_start;
3470 u64 delalloc_end;
3471 bool delalloc;
3472
3473 delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3474 &delalloc_start, &delalloc_end);
3475 if (delalloc && whence == SEEK_DATA) {
3476 *start_ret = delalloc_start;
3477 return true;
3478 }
3479
3480 if (delalloc && whence == SEEK_HOLE) {
3481 /*
3482 * We found delalloc but it starts after our start offset. So we
3483 * have a hole between our start offset and the delalloc start.
3484 */
3485 if (start < delalloc_start) {
3486 *start_ret = start;
3487 return true;
3488 }
3489 /*
3490 * Delalloc range starts at our start offset.
3491 * If the delalloc range's length is smaller than our range,
3492 * then it means we have a hole that starts where the delalloc
3493 * subrange ends.
3494 */
3495 if (delalloc_end < end) {
3496 *start_ret = delalloc_end + 1;
3497 return true;
3498 }
3499
3500 /* There's delalloc for the whole range. */
3501 return false;
3502 }
3503
3504 if (!delalloc && whence == SEEK_HOLE) {
3505 *start_ret = start;
3506 return true;
3507 }
3508
3509 /*
3510 * No delalloc in the range and we are seeking for data. The caller has
3511 * to iterate to the next extent item in the subvolume btree.
3512 */
3513 return false;
3514 }
3515
3516 static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
3517 {
3518 struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
3519 struct btrfs_file_private *private;
3520 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3521 struct extent_state *cached_state = NULL;
3522 struct extent_state **delalloc_cached_state;
3523 const loff_t i_size = i_size_read(&inode->vfs_inode);
3524 const u64 ino = btrfs_ino(inode);
3525 struct btrfs_root *root = inode->root;
3526 struct btrfs_path *path;
3527 struct btrfs_key key;
3528 u64 last_extent_end;
3529 u64 lockstart;
3530 u64 lockend;
3531 u64 start;
3532 int ret;
3533 bool found = false;
3534
3535 if (i_size == 0 || offset >= i_size)
3536 return -ENXIO;
3537
3538 /*
3539 * Quick path. If the inode has no prealloc extents and its number of
3540 * bytes used matches its i_size, then it can not have holes.
3541 */
3542 if (whence == SEEK_HOLE &&
3543 !(inode->flags & BTRFS_INODE_PREALLOC) &&
3544 inode_get_bytes(&inode->vfs_inode) == i_size)
3545 return i_size;
3546
3547 spin_lock(&inode->lock);
3548 private = file->private_data;
3549 spin_unlock(&inode->lock);
3550
3551 if (private && private->owner_task != current) {
3552 /*
3553 * Not allocated by us, don't use it as its cached state is used
3554 * by the task that allocated it and we want neither to
3555 * mess with it nor get incorrect results because it reflects an
3556 * invalid state for the current task.
3557 */
3558 private = NULL;
3559 } else if (!private) {
3560 private = kzalloc(sizeof(*private), GFP_KERNEL);
3561 /*
3562 * No worries if memory allocation failed.
3563 * The private structure is used only for speeding up multiple
3564 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
3565 * so everything will still be correct.
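 * If the allocation failed, delalloc_cached_state below simply stays
 * NULL and every call searches the io tree without a cached state.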
		 */
		if (private) {
			bool free = false;

			private->owner_task = current;

			spin_lock(&inode->lock);
			if (file->private_data)
				free = true;
			else
				file->private_data = private;
			spin_unlock(&inode->lock);

			if (free) {
				kfree(private);
				private = NULL;
			}
		}
	}

	if (private)
		delalloc_cached_state = &private->llseek_cached_state;
	else
		delalloc_cached_state = NULL;

	/*
	 * offset can be negative, in this case we start finding DATA/HOLE from
	 * the very start of the file.
	 */
	start = max_t(loff_t, 0, offset);

	lockstart = round_down(start, fs_info->sectorsize);
	lockend = round_up(i_size, fs_info->sectorsize);
	if (lockend <= lockstart)
		lockend = lockstart + fs_info->sectorsize;
	lockend--;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	last_extent_end = lockstart;

	btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret > 0 && path->slots[0] > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}

	while (start < i_size) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_file_extent_item *extent;
		u64 extent_end;
		u8 type;

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;

			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		extent_end = btrfs_file_extent_end(path);

		/*
		 * In the first iteration we may have a slot that points to an
		 * extent that ends before our start offset, so skip it.
		 */
		if (extent_end <= start) {
			path->slots[0]++;
			continue;
		}

		/* We have an implicit hole, NO_HOLES feature is likely set. */
		if (last_extent_end < key.offset) {
			u64 search_start = last_extent_end;
			u64 found_start;

			/*
			 * On the first iteration, @start matches @offset and
			 * it's within the hole.
			 */
			if (start == offset)
				search_start = offset;

			found = find_desired_extent_in_hole(inode, whence,
							    delalloc_cached_state,
							    search_start,
							    key.offset - 1,
							    &found_start);
			if (found) {
				start = found_start;
				break;
			}
			/*
			 * Didn't find data or a hole (due to delalloc) in the
			 * implicit hole range, so we need to analyze the
			 * current extent item.
			 */
		}

		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(leaf, extent);

		/*
		 * Can't access the extent's disk_bytenr field if this is an
		 * inline extent, since at that offset, it's where the extent
		 * data starts.
		 */
		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
		    (type == BTRFS_FILE_EXTENT_REG &&
		     btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
			/*
			 * Explicit hole or prealloc extent, search for delalloc.
			 * A prealloc extent is treated like a hole.
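			 *
			 * For example (hypothetical layout, for illustration
			 * only): a file with a 1M prealloc extent created by
			 * fallocate() and a 4K buffered write at offset 64K
			 * that is still delalloc: SEEK_DATA from the start of
			 * the extent lands at 64K, SEEK_HOLE at 64K lands at
			 * 68K, and the rest of the prealloc range is treated
			 * as a hole.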
			 */
			u64 search_start = key.offset;
			u64 found_start;

			/*
			 * On the first iteration, @start matches @offset and
			 * it's within the hole.
			 */
			if (start == offset)
				search_start = offset;

			found = find_desired_extent_in_hole(inode, whence,
							    delalloc_cached_state,
							    search_start,
							    extent_end - 1,
							    &found_start);
			if (found) {
				start = found_start;
				break;
			}
			/*
			 * Didn't find data or a hole (due to delalloc) in the
			 * hole or prealloc range, so we need to check the next
			 * extent item.
			 */
		} else {
			/*
			 * Found a regular or inline extent.
			 * If we are seeking for data, adjust the start offset
			 * and stop, we're done.
			 */
			if (whence == SEEK_DATA) {
				start = max_t(u64, key.offset, offset);
				found = true;
				break;
			}
			/*
			 * Else, we are seeking for a hole, check the next file
			 * extent item.
			 */
		}

		start = extent_end;
		last_extent_end = extent_end;
		path->slots[0]++;
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		cond_resched();
	}

	/* We have an implicit hole from the last extent found up to i_size. */
	if (!found && start < i_size) {
		found = find_desired_extent_in_hole(inode, whence,
						    delalloc_cached_state, start,
						    i_size - 1, &start);
		if (!found)
			start = i_size;
	}

out:
	btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
	btrfs_free_path(path);

	if (ret < 0)
		return ret;

	if (whence == SEEK_DATA && start >= i_size)
		return -ENXIO;

	return min_t(loff_t, start, i_size);
}

static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;

	switch (whence) {
	default:
		return generic_file_llseek(file, offset, whence);
	case SEEK_DATA:
	case SEEK_HOLE:
		btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
		offset = find_desired_extent(file, offset, whence);
		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
		break;
	}

	if (offset < 0)
		return offset;

	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

static int btrfs_file_open(struct inode *inode, struct file *filp)
{
	int ret;

	filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;

	ret = fsverity_file_open(inode, filp);
	if (ret)
		return ret;
	return generic_file_open(inode, filp);
}

static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret = 0;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = btrfs_direct_read(iocb, to);
		if (ret < 0 || !iov_iter_count(to) ||
		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
			return ret;
	}

	return filemap_read(iocb, to, ret);
}
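
/*
 * Note on btrfs_file_read_iter() above (an illustrative summary, not a change
 * in behaviour): a direct read that stops early without an error falls back to
 * filemap_read() for the remainder, passing the bytes already read via 'ret'
 * so the final return value covers the whole request. For example, if a 128K
 * O_DIRECT read only completes 64K directly, filemap_read() is called with
 * ret == 64K and reads the remaining 64K through the page cache. The fallback
 * is skipped on error, when the iov_iter has been fully consumed, or when the
 * file position has reached i_size.
 */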

const struct file_operations btrfs_file_operations = {
	.llseek		= btrfs_file_llseek,
	.read_iter	= btrfs_file_read_iter,
	.splice_read	= filemap_splice_read,
	.write_iter	= btrfs_file_write_iter,
	.splice_write	= iter_file_splice_write,
	.mmap_prepare	= btrfs_file_mmap_prepare,
	.open		= btrfs_file_open,
	.release	= btrfs_release_file,
	.get_unmapped_area = thp_get_unmapped_area,
	.fsync		= btrfs_sync_file,
	.fallocate	= btrfs_fallocate,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.remap_file_range = btrfs_remap_file_range,
	.uring_cmd	= btrfs_uring_cmd,
	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};

int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	int ret;

	/*
	 * So with compression we will find and lock a dirty page and clear the
	 * first one as dirty, set up an async extent, and immediately return
	 * with the entire range locked but with none of the pages actually
	 * marked for writeback. So we can't just filemap_write_and_wait_range()
	 * and expect it to work since it will just kick off a thread to do the
	 * actual work. So we need to call filemap_fdatawrite_range _again_
	 * since it will wait on the page lock, which won't be unlocked until
	 * after the pages have been marked as writeback and so we're good to go
	 * from there. We have to do this otherwise we'll miss the ordered
	 * extents and that results in badness. Please Josef, do not think you
	 * know better and pull this out at some point in the future, it is
	 * right and you are wrong.
	 */
	ret = filemap_fdatawrite_range(mapping, start, end);
	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
		ret = filemap_fdatawrite_range(mapping, start, end);

	return ret;
}
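
/*
 * Illustrative caller pattern (a sketch for this listing, not code used
 * anywhere in the kernel): kick off writeback for a byte range and then wait
 * for it, which is roughly what the fsync and ordered-extent paths build on
 * top of this helper. The wait step below uses filemap_fdatawait_range() for
 * brevity; real callers typically wait on btrfs ordered extents instead, so
 * treat this only as an outline of the two-phase write-out.
 *
 *	static int example_write_and_wait(struct btrfs_inode *inode,
 *					  loff_t start, loff_t end)
 *	{
 *		int ret;
 *
 *		ret = btrfs_fdatawrite_range(inode, start, end);
 *		if (ret)
 *			return ret;
 *
 *		return filemap_fdatawait_range(inode->vfs_inode.i_mapping,
 *					       start, end);
 *	}
 */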