// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include "ctree.h"
#include "direct-io.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "tree-log.h"
#include "locking.h"
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
#include "reflink.h"
#include "subpage.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "file-item.h"
#include "ioctl.h"
#include "file.h"
#include "super.h"
#include "print-tree.h"

/*
 * Unlock folio after btrfs_file_write() is done with it.
 */
static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
			     u64 pos, u64 copied)
{
	u64 block_start = round_down(pos, fs_info->sectorsize);
	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;

	ASSERT(block_len <= U32_MAX);
	/*
	 * Folio checked is some magic around finding folios that have been
	 * modified without going through btrfs_dirty_folio(). Clear it here.
	 * There should be no need to mark the folio accessed, as
	 * prepare_one_folio() should have already marked it accessed via
	 * find_or_create_page().
	 */
	btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
	folio_unlock(folio);
	folio_put(folio);
}

/*
 * After copy_folio_from_iter_atomic(), update the following things for delalloc:
 * - Mark newly dirtied folio as DELALLOC in the io tree.
 *   Used to advise which range is to be written back.
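 *   The range also gets the EXTENT_NORESERVE bit when @noreserve is true,
 *   i.e. for NOCOW writes for which no data space was reserved.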
 * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
 * - Update inode size for past EOF write
 */
int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
		      size_t write_bytes, struct extent_state **cached, bool noreserve)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret = 0;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(&inode->vfs_inode);
	unsigned int extra_bits = 0;

	if (write_bytes == 0)
		return 0;

	if (noreserve)
		extra_bits |= EXTENT_NORESERVE;

	start_pos = round_down(pos, fs_info->sectorsize);
	num_bytes = round_up(write_bytes + pos - start_pos,
			     fs_info->sectorsize);
	ASSERT(num_bytes <= U32_MAX);
	ASSERT(folio_pos(folio) <= pos &&
	       folio_next_pos(folio) >= pos + write_bytes);

	end_of_last_block = start_pos + num_bytes - 1;

	/*
	 * The pages may have already been dirty, clear out old accounting so
	 * we can set things up properly
	 */
	btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			       cached);

	ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					extra_bits, cached);
	if (ret)
		return ret;

	btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
	btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
	btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);

	/*
	 * we've only changed i_size in ram, and we haven't updated
	 * the disk i_size. There is no need to log the inode
	 * at this time.
	 */
	if (end_pos > isize)
		i_size_write(&inode->vfs_inode, end_pos);
	return 0;
}

/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end. hint_block is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split. Anything entirely inside the range
 * is deleted from the tree.
 *
 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 * to deal with that. We set the field 'bytes_found' of the arguments structure
 * with the number of allocated bytes found in the target range, so that the
 * caller can update the inode's number of bytes in an atomic way when
 * replacing extents in a range to avoid races with stat(2).
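 *
 * For example, dropping the range [4K, 16K) from a file extent covering
 * [0, 32K) leaves two file extent items, [0, 4K) and [16K, 32K), both
 * referencing the same underlying extent on disk.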
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_inode *inode,
		       struct btrfs_drop_extents_args *args)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(inode);
	u64 search_start = args->start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	u64 last_end = args->start;
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;
	int modify_tree = -1;
	int update_refs;
	int found = 0;
	struct btrfs_path *path = args->path;

	args->bytes_found = 0;
	args->extent_inserted = false;

	/* Must always have a path if ->replace_extent is true */
	ASSERT(!(args->replace_extent && !args->path));

	if (!path) {
		path = btrfs_alloc_path();
		if (!path) {
			ret = -ENOMEM;
			goto out;
		}
	}

	if (args->drop_cache)
		btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);

	if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
		modify_tree = 0;

	update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       search_start, modify_tree);
		if (ret < 0)
			break;
		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

		if (key.objectid > ino)
			break;
		if (WARN_ON_ONCE(key.objectid < ino) ||
		    key.type < BTRFS_EXTENT_DATA_KEY) {
			ASSERT(del_nr == 0);
			path->slots[0]++;
			goto next_slot;
		}
		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_ram_bytes(leaf, fi);
		} else {
			/* can't happen */
			BUG();
		}

		/*
		 * Don't skip extent items representing 0 byte lengths. They
		 * used to be created (bug) if while punching holes we hit
		 * -ENOSPC condition. So if we find one here, just ensure we
		 * delete it, otherwise we would insert a new file extent item
		 * with the same key (offset) as that 0 bytes length file
		 * extent item in the call to setup_items_for_insert() later
		 * in this function.
		 */
		if (extent_end == key.offset && extent_end >= search_start) {
			last_end = extent_end;
			goto delete_extent_item;
		}

		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		found = 1;
		search_start = max(key.offset, args->start);
		if (recow || !modify_tree) {
			modify_tree = -1;
			btrfs_release_path(path);
			continue;
		}

		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 */
		if (args->start > key.offset && args->end < extent_end) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(path);
				continue;
			}
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += args->start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->start);

			if (update_refs && disk_bytenr > 0) {
				struct btrfs_ref ref = {
					.action = BTRFS_ADD_DELAYED_REF,
					.bytenr = disk_bytenr,
					.num_bytes = num_bytes,
					.parent = 0,
					.owning_root = btrfs_root_id(root),
					.ref_root = btrfs_root_id(root),
				};
				btrfs_init_data_ref(&ref, new_key.objectid,
						    args->start - extent_offset,
						    0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (unlikely(ret)) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
			}
			key.offset = args->start;
		}
		/*
		 * From here on out we will have actually dropped something, so
		 * last_end can be updated.
		 */
		last_end = extent_end;

		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 */
		if (args->start <= key.offset && args->end < extent_end) {
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->end;
			btrfs_set_item_key_safe(trans, path, &new_key);

			extent_offset += args->end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->end);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += args->end - key.offset;
			break;
		}

		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 */
		if (args->start > key.offset && args->end >= extent_end) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += extent_end - args->start;
			if (args->end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 */
		if (args->start <= key.offset && args->end >= extent_end) {
delete_extent_item:
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				if (WARN_ON(del_slot + del_nr != path->slots[0])) {
					btrfs_print_leaf(leaf);
					ret = -EINVAL;
					break;
				}
				del_nr++;
			}

			if (update_refs &&
			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
				args->bytes_found += extent_end - key.offset;
				extent_end = ALIGN(extent_end,
						   fs_info->sectorsize);
			} else if (update_refs && disk_bytenr > 0) {
				struct btrfs_ref ref = {
					.action = BTRFS_DROP_DELAYED_REF,
					.bytenr = disk_bytenr,
					.num_bytes = num_bytes,
					.parent = 0,
					.owning_root = btrfs_root_id(root),
					.ref_root = btrfs_root_id(root),
				};
				btrfs_init_data_ref(&ref, key.objectid,
						    key.offset - extent_offset,
						    0, false);
				ret = btrfs_free_extent(trans, &ref);
				if (unlikely(ret)) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
				args->bytes_found += extent_end - key.offset;
			}

			if (args->end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
			if (unlikely(ret)) {
				btrfs_abort_transaction(trans, ret);
				break;
			}

			del_nr = 0;
			del_slot = 0;

			btrfs_release_path(path);
			continue;
		}

		BUG();
	}

	if (!ret && del_nr > 0) {
		/*
		 * Set path->slots[0] to first slot, so that after the delete
		 * if items are moved off from our leaf to its immediate left or
		 * right neighbor leaves, we end up with a correct and adjusted
		 * path->slots[0] for our insertion (if args->replace_extent).
		 */
		path->slots[0] = del_slot;
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	leaf = path->nodes[0];
	/*
	 * If btrfs_del_items() was called, it might have deleted a leaf, in
	 * which case it unlocked our path, so check path->locks[0] matches a
	 * write lock.
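	 * Only with the path still write locked can we safely reuse the leaf's
	 * free space below to set up the slot for the new file extent item
	 * when args->replace_extent is set.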
	 */
	if (!ret && args->replace_extent &&
	    path->locks[0] == BTRFS_WRITE_LOCK &&
	    btrfs_leaf_free_space(leaf) >=
	    sizeof(struct btrfs_item) + args->extent_item_size) {

		key.objectid = ino;
		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = args->start;
		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
			struct btrfs_key slot_key;

			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
				path->slots[0]++;
		}
		btrfs_setup_item_for_insert(trans, root, path, &key,
					    args->extent_item_size);
		args->extent_inserted = true;
	}

	if (!args->path)
		btrfs_free_path(path);
	else if (!args->extent_inserted)
		btrfs_release_path(path);
out:
	args->drop_end = found ? min(args->end, last_end) : args->end;

	return ret;
}

static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid,
			     u64 bytenr, u64 orig_offset, u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return false;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return false;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return false;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return false;

	*start = key.offset;
	*end = extent_end;
	return true;
}

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
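 *
 * For example, marking [64K, 96K) written inside a pre-allocated extent
 * covering [0, 128K) leaves three extents: pre-allocated [0, 64K), regular
 * [64K, 96K) and pre-allocated [96K, 128K).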
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	recow = 0;
	split = start;
	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if (unlikely(key.offset > start || extent_end < end)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			goto out;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			path->slots[0]++;
			new_key.offset = start;
			btrfs_set_item_key_safe(trans, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			goto out;
		}
	}

	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(path);
			goto again;
		}
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						split - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = 0;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_root_id(root);
		btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (split == start) {
			key.offset = start;
		} else {
			if (unlikely(start != key.offset)) {
				ret = -EINVAL;
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			path->slots[0]--;
			extent_end = end;
		}
		recow = 1;
	}

	other_start = end;
	other_end = 0;

	ref.action = BTRFS_DROP_DELAYED_REF;
	ref.bytenr = bytenr;
	ref.num_bytes = num_bytes;
	ref.parent = 0;
	ref.owning_root = btrfs_root_id(root);
	ref.ref_root = btrfs_root_id(root);
	btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
	other_start = 0;
	other_end = start;
	if (extent_mergeable(leaf, path->slots[0] - 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
	if (del_nr == 0) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	} else {
		fi = btrfs_item_ptr(leaf, del_slot - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
out:
	return ret;
}

/*
 * On error return an unlocked folio and the error value
 * On success return a locked folio and 0
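 *
 * -EAGAIN is returned if the folio was released from the inode's mapping
 * while it was being read, in which case the caller must get a new folio
 * and retry.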
 */
static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
				  u64 len)
{
	u64 clamp_start = max_t(u64, pos, folio_pos(folio));
	u64 clamp_end = min_t(u64, pos + len, folio_next_pos(folio));
	const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
	int ret = 0;

	if (folio_test_uptodate(folio))
		return 0;

	if (IS_ALIGNED(clamp_start, blocksize) &&
	    IS_ALIGNED(clamp_end, blocksize))
		return 0;

	ret = btrfs_read_folio(NULL, folio);
	if (ret)
		return ret;
	folio_lock(folio);
	if (unlikely(!folio_test_uptodate(folio))) {
		folio_unlock(folio);
		return -EIO;
	}

	/*
	 * Since btrfs_read_folio() will unlock the folio before it returns,
	 * there is a window where btrfs_release_folio() can be called to
	 * release the page. Here we check both inode mapping and page
	 * private to make sure the page was not released.
	 *
	 * The private flag check is essential for subpage as we need to store
	 * extra bitmap using folio private.
	 */
	if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
		folio_unlock(folio);
		return -EAGAIN;
	}
	return 0;
}

static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
{
	gfp_t gfp;

	gfp = btrfs_alloc_write_mask(inode->i_mapping);
	if (nowait) {
		gfp &= ~__GFP_DIRECT_RECLAIM;
		gfp |= GFP_NOWAIT;
	}

	return gfp;
}

/*
 * Get folio into the page cache and lock it.
 */
static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
				      loff_t pos, size_t write_bytes,
				      bool nowait)
{
	const pgoff_t index = pos >> PAGE_SHIFT;
	gfp_t mask = get_prepare_gfp_flags(inode, nowait);
	fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) |
			  fgf_set_order(write_bytes);
	struct folio *folio;
	int ret = 0;

again:
	folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	ret = set_folio_extent_mapped(folio);
	if (ret < 0) {
		folio_unlock(folio);
		folio_put(folio);
		return ret;
	}
	ret = prepare_uptodate_folio(inode, folio, pos, write_bytes);
	if (ret) {
		/* The folio is already unlocked. */
		folio_put(folio);
		if (!nowait && ret == -EAGAIN) {
			ret = 0;
			goto again;
		}
		return ret;
	}
	*folio_ret = folio;
	return 0;
}

/*
 * Locks the extent and properly waits for data=ordered extents to finish
 * before allowing the folios to be modified if needed.
 *
 * Return:
 * 1 - the extent is locked
 * 0 - the extent is not locked, and everything is OK
 * -EAGAIN - need to prepare the folios again
 */
static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
				loff_t pos, size_t write_bytes,
				u64 *lockstart, u64 *lockend, bool nowait,
				struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 start_pos;
	u64 last_pos;
	int ret = 0;

	start_pos = round_down(pos, fs_info->sectorsize);
	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;

	if (start_pos < inode->vfs_inode.i_size) {
		struct btrfs_ordered_extent *ordered;

		if (nowait) {
			if (!btrfs_try_lock_extent(&inode->io_tree, start_pos,
						   last_pos, cached_state)) {
				folio_unlock(folio);
				folio_put(folio);
				return -EAGAIN;
			}
		} else {
			btrfs_lock_extent(&inode->io_tree, start_pos, last_pos,
					  cached_state);
		}

		ordered = btrfs_lookup_ordered_range(inode, start_pos,
						     last_pos - start_pos + 1);
		if (ordered &&
		    ordered->file_offset + ordered->num_bytes > start_pos &&
		    ordered->file_offset <= last_pos) {
			btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos,
					    cached_state);
			folio_unlock(folio);
			folio_put(folio);
			btrfs_start_ordered_extent(ordered);
			btrfs_put_ordered_extent(ordered);
			return -EAGAIN;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		*lockstart = start_pos;
		*lockend = last_pos;
		ret = 1;
	}

	/*
	 * We should be called after prepare_one_folio() which should have locked
	 * all pages in the range.
	 */
	WARN_ON(!folio_test_locked(folio));

	return ret;
}

/*
 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
 *
 * @pos:	 File offset.
 * @write_bytes: The length to write, will be updated to the nocow writeable
 *		 range.
 * @nowait:	 Indicate if we can block or not (non-blocking IO context).
 *
 * This function will flush ordered extents in the range to ensure proper
 * nocow checks.
 *
 * Return:
 * > 0		If we can nocow, and updates @write_bytes.
 * 0		If we can't do a nocow write.
 * -EAGAIN	If we can't do a nocow write because snapshotting of the inode's
 *		root is in progress or because we are in a non-blocking IO
 *		context and need to block (@nowait is true).
 * < 0		If an error happened.
 *
 * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
 */
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
			   size_t *write_bytes, bool nowait)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *root = inode->root;
	struct extent_state *cached_state = NULL;
	u64 lockstart, lockend;
	u64 cur_offset;
	int ret = 0;

	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
		return 0;

	if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
		return -EAGAIN;

	lockstart = round_down(pos, fs_info->sectorsize);
	lockend = round_up(pos + *write_bytes,
			   fs_info->sectorsize) - 1;

	if (nowait) {
		if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
						  &cached_state)) {
			btrfs_drew_write_unlock(&root->snapshot_lock);
			return -EAGAIN;
		}
	} else {
		btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
						   &cached_state);
	}

	cur_offset = lockstart;
	while (cur_offset < lockend) {
		u64 num_bytes = lockend - cur_offset + 1;

		ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait);
		if (ret <= 0) {
			/*
			 * If cur_offset == lockstart it means we haven't found
			 * any extent against which we can NOCOW, so unlock the
			 * snapshot lock.
			 */
			if (cur_offset == lockstart)
				btrfs_drew_write_unlock(&root->snapshot_lock);
			break;
		}
		cur_offset += num_bytes;
	}

	btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);

	/*
	 * cur_offset > lockstart means there's at least a partial range we can
	 * NOCOW, and that range can cover one or more extents.
	 */
	if (cur_offset > lockstart) {
		*write_bytes = min_t(size_t, *write_bytes, cur_offset - pos);
		return 1;
	}

	return ret;
}

void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
{
	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}

int btrfs_write_check(struct kiocb *iocb, size_t count)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	loff_t pos = iocb->ki_pos;
	int ret;
	loff_t oldsize;

	/*
	 * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
	 * prealloc flags, as without those flags we always have to COW. We will
	 * later check if we can really COW into the target range (using
	 * can_nocow_extent() at btrfs_get_blocks_direct_write()).
	 */
	if ((iocb->ki_flags & IOCB_NOWAIT) &&
	    !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
		return -EAGAIN;

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	/*
	 * We reserve space for updating the inode when we reserve space for the
	 * extent we are going to write, so we will enospc out there. We don't
	 * need to start yet another transaction to update the inode as we will
	 * update the inode when we finish writing whatever data we write.
	 */
	if (!IS_NOCMTIME(inode)) {
		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
		inode_inc_iversion(inode);
	}

	oldsize = i_size_read(inode);
	if (pos > oldsize) {
		/* Expand hole size to cover write data, preventing empty gap */
		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);

		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
		if (ret)
			return ret;
	}

	return 0;
}

static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved,
			  u64 start, u64 len, bool only_release_metadata)
{
	if (len == 0)
		return;

	if (only_release_metadata) {
		btrfs_check_nocow_unlock(inode);
		btrfs_delalloc_release_metadata(inode, len, true);
	} else {
		const struct btrfs_fs_info *fs_info = inode->root->fs_info;

		btrfs_delalloc_release_space(inode, data_reserved,
					     round_down(start, fs_info->sectorsize),
					     len, true);
	}
}

/*
 * Reserve data and metadata space for this buffered write range.
 *
 * Return >0 for the number of bytes reserved, which is always block aligned.
 * Return <0 for error.
 */
static ssize_t reserve_space(struct btrfs_inode *inode,
			     struct extent_changeset **data_reserved,
			     u64 start, size_t *len, bool nowait,
			     bool *only_release_metadata)
{
	const struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const unsigned int block_offset = (start & (fs_info->sectorsize - 1));
	size_t reserve_bytes;
	int ret;

	ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait);
	if (ret < 0) {
		int can_nocow;

		if (nowait && (ret == -ENOSPC || ret == -EAGAIN))
			return -EAGAIN;

		/*
		 * If we don't have to COW at the offset, reserve metadata only.
		 * write_bytes may get smaller than requested here.
		 */
		can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait);
		if (can_nocow < 0)
			ret = can_nocow;
		if (can_nocow > 0)
			ret = 0;
		if (ret)
			return ret;
		*only_release_metadata = true;
	}

	reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize);
	WARN_ON(reserve_bytes == 0);
	ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes,
					      reserve_bytes, nowait);
	if (ret) {
		if (!*only_release_metadata)
			btrfs_free_reserved_data_space(inode, *data_reserved,
						       start, *len);
		else
			btrfs_check_nocow_unlock(inode);

		if (nowait && ret == -ENOSPC)
			ret = -EAGAIN;
		return ret;
	}
	return reserve_bytes;
}

/* Shrink the reserved data and metadata space from @reserved_len to @new_len. */
static void shrink_reserved_space(struct btrfs_inode *inode,
				  struct extent_changeset *data_reserved,
				  u64 reserved_start, u64 reserved_len,
				  u64 new_len, bool only_release_metadata)
{
	const u64 diff = reserved_len - new_len;

	ASSERT(new_len <= reserved_len);
	btrfs_delalloc_shrink_extents(inode, reserved_len, new_len);
	if (only_release_metadata)
		btrfs_delalloc_release_metadata(inode, diff, true);
	else
		btrfs_delalloc_release_space(inode, data_reserved,
					     reserved_start + new_len, diff, true);
}

/* Calculate the maximum amount of bytes we can write into one folio. */
static size_t calc_write_bytes(const struct btrfs_inode *inode,
			       const struct iov_iter *iter, u64 start)
{
	const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping);

	return min(max_folio_size - (start & (max_folio_size - 1)),
		   iov_iter_count(iter));
}

/*
 * Do the heavy-lifting work to copy one range into one folio of the page cache.
 *
 * Return > 0 in case we copied all bytes or just some of them.
 * Return 0 if no bytes were copied, in which case the caller should retry.
 * Return <0 on error.
 */
static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
			  struct extent_changeset **data_reserved, u64 start,
			  bool nowait)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_state *cached_state = NULL;
	size_t write_bytes = calc_write_bytes(inode, iter, start);
	size_t copied;
	const u64 reserved_start = round_down(start, fs_info->sectorsize);
	u64 reserved_len;
	struct folio *folio = NULL;
	int extents_locked;
	u64 lockstart;
	u64 lockend;
	bool only_release_metadata = false;
	const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
	int ret;

	/*
	 * Fault all pages before locking them in prepare_one_folio() to avoid
	 * recursive lock.
	 */
	if (unlikely(fault_in_iov_iter_readable(iter, write_bytes)))
		return -EFAULT;
	extent_changeset_release(*data_reserved);
	ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait,
			    &only_release_metadata);
	if (ret < 0)
		return ret;
	reserved_len = ret;
	/* Write range must be inside the reserved range. */
	ASSERT(reserved_start <= start);
	ASSERT(start + write_bytes <= reserved_start + reserved_len);

again:
	ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
						    bdp_flags);
	if (ret) {
		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}

	ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false);
	if (ret) {
		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}

	/*
	 * The reserved range goes beyond the current folio, shrink the reserved
	 * space to the folio boundary.
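	 * This can happen when the folio we got is smaller than the order we
	 * asked for, e.g. when a large folio allocation failed.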
	 */
	if (reserved_start + reserved_len > folio_next_pos(folio)) {
		const u64 last_block = folio_next_pos(folio);

		shrink_reserved_space(inode, *data_reserved, reserved_start,
				      reserved_len, last_block - reserved_start,
				      only_release_metadata);
		write_bytes = last_block - start;
		reserved_len = last_block - reserved_start;
	}

	extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start,
							 write_bytes, &lockstart,
							 &lockend, nowait,
							 &cached_state);
	if (extents_locked < 0) {
		if (!nowait && extents_locked == -EAGAIN)
			goto again;

		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		ret = extents_locked;
		return ret;
	}

	copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start),
					     write_bytes, iter);
	flush_dcache_folio(folio);

	if (unlikely(copied < write_bytes)) {
		u64 last_block;

		/*
		 * The original write range doesn't need an uptodate folio as
		 * the range is block aligned. But now a short copy happened.
		 * We cannot handle it without an uptodate folio.
		 *
		 * So just revert the range and we will retry.
		 */
		if (!folio_test_uptodate(folio)) {
			iov_iter_revert(iter, copied);
			copied = 0;
		}

		/* No copied bytes, unlock, release reserved space and exit. */
		if (copied == 0) {
			if (extents_locked)
				btrfs_unlock_extent(&inode->io_tree, lockstart, lockend,
						    &cached_state);
			else
				btrfs_free_extent_state(cached_state);
			btrfs_delalloc_release_extents(inode, reserved_len);
			release_space(inode, *data_reserved, reserved_start, reserved_len,
				      only_release_metadata);
			btrfs_drop_folio(fs_info, folio, start, copied);
			return 0;
		}

		/* Release the reserved space beyond the last block. */
		last_block = round_up(start + copied, fs_info->sectorsize);

		shrink_reserved_space(inode, *data_reserved, reserved_start,
				      reserved_len, last_block - reserved_start,
				      only_release_metadata);
		reserved_len = last_block - reserved_start;
	}

	ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state,
				only_release_metadata);
	/*
	 * If we have not locked the extent range, because the range's start
	 * offset is >= i_size, we might still have a non-NULL cached extent
	 * state, acquired while marking the extent range as delalloc through
	 * btrfs_dirty_folio(). Therefore free any possible cached extent state
	 * to avoid a memory leak.
	 */
	if (extents_locked)
		btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
	else
		btrfs_free_extent_state(cached_state);

	btrfs_delalloc_release_extents(inode, reserved_len);
	if (ret) {
		btrfs_drop_folio(fs_info, folio, start, copied);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}
	if (only_release_metadata)
		btrfs_check_nocow_unlock(inode);

	btrfs_drop_folio(fs_info, folio, start, copied);
	return copied;
}

ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	loff_t pos;
	struct inode *inode = file_inode(file);
	struct extent_changeset *data_reserved = NULL;
	size_t num_written = 0;
	ssize_t ret;
	loff_t old_isize;
	unsigned int ilock_flags = 0;
	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);

	if (nowait)
		ilock_flags |= BTRFS_ILOCK_TRY;

	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
	if (ret < 0)
		return ret;

	/*
	 * We can only trust the isize with inode lock held, or it can race with
	 * other buffered writes and cause incorrect call of
	 * pagecache_isize_extended() to overwrite existing data.
	 */
	old_isize = i_size_read(inode);

	ret = generic_write_checks(iocb, iter);
	if (ret <= 0)
		goto out;

	ret = btrfs_write_check(iocb, ret);
	if (ret < 0)
		goto out;

	pos = iocb->ki_pos;
	while (iov_iter_count(iter) > 0) {
		ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait);
		if (ret < 0)
			break;
		pos += ret;
		num_written += ret;
		cond_resched();
	}

	extent_changeset_free(data_reserved);
	if (num_written > 0) {
		pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
		iocb->ki_pos += num_written;
	}
out:
	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
	return num_written ? num_written : ret;
}

static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
				   const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	loff_t count;
	ssize_t ret;

	btrfs_inode_lock(BTRFS_I(inode), 0);
	count = encoded->len;
	ret = generic_write_checks_count(iocb, &count);
	if (ret == 0 && count != encoded->len) {
		/*
		 * The write got truncated by generic_write_checks_count(). We
		 * can't do a partial encoded write.
		 */
		ret = -EFBIG;
	}
	if (ret || encoded->len == 0)
		goto out;

	ret = btrfs_write_check(iocb, encoded->len);
	if (ret < 0)
		goto out;

	ret = btrfs_do_encoded_write(iocb, from, encoded);
out:
	btrfs_inode_unlock(BTRFS_I(inode), 0);
	return ret;
}

ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
			    const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct file *file = iocb->ki_filp;
	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
	ssize_t num_written, num_sync;

	/*
	 * If the fs flips readonly due to some impossible error, although we
	 * have opened a file as writable, we have to stop this write operation
	 * to ensure consistency.
	 */
	if (BTRFS_FS_ERROR(inode->root->fs_info))
		return -EROFS;

	if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
		return -EOPNOTSUPP;

	if (encoded) {
		num_written = btrfs_encoded_write(iocb, from, encoded);
		num_sync = encoded->len;
	} else if (iocb->ki_flags & IOCB_DIRECT) {
		num_written = btrfs_direct_write(iocb, from);
		num_sync = num_written;
	} else {
		num_written = btrfs_buffered_write(iocb, from);
		num_sync = num_written;
	}

	btrfs_set_inode_last_sub_trans(inode);

	if (num_sync > 0) {
		num_sync = generic_write_sync(iocb, num_sync);
		if (num_sync < 0)
			num_written = num_sync;
	}

	return num_written;
}

static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	return btrfs_do_write_iter(iocb, from, NULL);
}

int btrfs_release_file(struct inode *inode, struct file *filp)
{
	struct btrfs_file_private *private = filp->private_data;

	if (private) {
		kfree(private->filldir_buf);
		btrfs_free_extent_state(private->llseek_cached_state);
		kfree(private);
		filp->private_data = NULL;
	}

	/*
	 * Set by setattr when we are about to truncate a file from a non-zero
	 * size to a zero size. This tries to flush down new bytes that may
	 * have been written if the application were using truncate to replace
	 * a file in place.
	 */
	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
			       &BTRFS_I(inode)->runtime_flags))
		filemap_flush(inode->i_mapping);
	return 0;
}

static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
{
	int ret;
	struct blk_plug plug;

	/*
	 * This is only called in fsync, which would do synchronous writes, so
	 * a plug can merge adjacent IOs as much as possible. Esp. in case of
	 * multiple disks using raid profile, a large IO can be split to
	 * several segments of stripe length (currently 64K).
	 */
	blk_start_plug(&plug);
	ret = btrfs_fdatawrite_range(inode, start, end);
	blk_finish_plug(&plug);

	return ret;
}

static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
{
	struct btrfs_inode *inode = ctx->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
	    list_empty(&ctx->ordered_extents))
		return true;

	/*
	 * If we are doing a fast fsync we can not bail out if the inode's
	 * last_trans is <= the last committed transaction, because we only
	 * update the last_trans of the inode during ordered extent completion,
	 * and for a fast fsync we don't wait for that, we only wait for the
	 * writeback to complete.
	 */
	if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
	     list_empty(&ctx->ordered_extents)))
		return true;

	return false;
}

/*
 * fsync call for both files and directories. This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates in
 * the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit. This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file_dentry(file);
	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_log_ctx ctx;
	int ret = 0, err;
	u64 len;
	bool full_sync;
	bool skip_ilock = false;

	if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
		skip_ilock = true;
		current->journal_info = NULL;
		btrfs_assert_inode_locked(inode);
	}

	trace_btrfs_sync_file(file, datasync);

	btrfs_init_log_ctx(&ctx, inode);

	/*
	 * Always set the range to a full range, otherwise we can get into
	 * several problems, from missing file extent items to represent holes
	 * when not using the NO_HOLES feature, to log tree corruption due to
	 * races between hole detection during logging and completion of ordered
	 * extents outside the range, to missing checksums due to ordered extents
	 * for which we flushed only a subset of their pages.
	 */
	start = 0;
	end = LLONG_MAX;
	len = (u64)LLONG_MAX + 1;

	/*
	 * We write the dirty pages in the range and wait until they complete
	 * outside of the ->i_mutex, so that multiple tasks can flush dirty
	 * pages concurrently and improve performance. See
	 * btrfs_wait_ordered_range() for an explanation of the ASYNC check.
	 */
	ret = start_ordered_ops(inode, start, end);
	if (ret)
		goto out;

	if (skip_ilock)
		down_write(&inode->i_mmap_lock);
	else
		btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);

	atomic_inc(&root->log_batch);

	/*
	 * Before we acquired the inode's lock and the mmap lock, someone may
	 * have dirtied more pages in the target range. We need to make sure
	 * that writeback for any such pages does not start while we are logging
	 * the inode, because if it does, any of the following might happen when
	 * we are not doing a full inode sync:
	 *
	 * 1) We log an extent after its writeback finishes but before its
	 *    checksums are added to the csum tree, leading to -EIO errors
	 *    when attempting to read the extent after a log replay.
	 *
	 * 2) We can end up logging an extent before its writeback finishes.
	 *    Therefore after the log replay we will have a file extent item
	 *    pointing to an unwritten extent (and no data checksums as well).
	 *
	 * So trigger writeback for any eventual new dirty pages and then we
	 * wait for all ordered extents to complete below.
	 */
	ret = start_ordered_ops(inode, start, end);
	if (ret) {
		if (skip_ilock)
			up_write(&inode->i_mmap_lock);
		else
			btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
		goto out;
	}

	/*
	 * Always check for the full sync flag while holding the inode's lock,
	 * to avoid races with other tasks. The flag must be either set all the
	 * time during logging or always off all the time while logging.
	 * We check the flag here after starting delalloc above, because when
	 * running delalloc the full sync flag may be set if we need to drop
	 * extra extent map ranges due to temporary memory allocation failures.
	 */
	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);

	/*
	 * We have to do this here to avoid the priority inversion of waiting on
	 * IO of a lower priority task while holding a transaction open.
	 *
	 * For a full fsync we wait for the ordered extents to complete while
	 * for a fast fsync we wait just for writeback to complete, and then
	 * attach the ordered extents to the transaction so that a transaction
	 * commit waits for their completion, to avoid data loss if we fsync,
	 * the current transaction commits before the ordered extents complete
	 * and a power failure happens right after that.
	 *
	 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
	 * logical address recorded in the ordered extent may change. We need
	 * to wait for the IO to stabilize the logical address.
	 */
	if (full_sync || btrfs_is_zoned(fs_info)) {
		ret = btrfs_wait_ordered_range(inode, start, len);
		clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
	} else {
		/*
		 * Get our ordered extents as soon as possible to avoid doing
		 * checksum lookups in the csum tree, and use instead the
		 * checksums attached to the ordered extents.
		 */
		btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
		ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
		if (ret)
			goto out_release_extents;

		/*
		 * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
		 * starting and waiting for writeback, because for buffered IO
		 * it may have been set during the end IO callback
		 * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
		 * case an error happened and we need to wait for ordered
		 * extents to complete so that any extent maps that point to
		 * unwritten locations are dropped and we don't log them.
		 */
		if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
			ret = btrfs_wait_ordered_range(inode, start, len);
	}

	if (ret)
		goto out_release_extents;

	atomic_inc(&root->log_batch);

	if (skip_inode_logging(&ctx)) {
		/*
		 * We've had everything committed since the last time we were
		 * modified so clear this flag in case it was set for whatever
		 * reason, it's no longer relevant.
		 */
		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
		/*
		 * An ordered extent might have started before and completed
		 * already with io errors, in which case the inode was not
		 * updated and we end up here. So check the inode's mapping
		 * for any errors that might have happened since we last
		 * called fsync.
		 */
		ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
		goto out_release_extents;
	}

	btrfs_init_log_ctx_scratch_eb(&ctx);

	/*
	 * We use start here because we will need to wait on the IO to complete
	 * in btrfs_sync_log, which could require joining a transaction (for
	 * example checking cross references in the nocow path). If we use join
	 * here we could get into a situation where we're waiting on IO to
	 * happen that is blocked on a transaction trying to commit. With start
	 * we inc the extwriter counter, so we wait for all extwriters to exit
	 * before we start blocking joiners. This comment is to keep somebody
	 * from thinking they are super smart and changing this to
	 * btrfs_join_transaction *cough*Josef*cough*.
	 */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_release_extents;
	}
	trans->in_fsync = true;

	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
	/*
	 * Scratch eb no longer needed, release before syncing log or commit
	 * transaction, to avoid holding unnecessary memory during such long
	 * operations.
	 */
	if (ctx.scratch_eb) {
		free_extent_buffer(ctx.scratch_eb);
		ctx.scratch_eb = NULL;
	}
	btrfs_release_log_ctx_extents(&ctx);
	if (ret < 0) {
		/* Fallthrough and commit/free transaction. */
		ret = BTRFS_LOG_FORCE_COMMIT;
	}

	/*
	 * we've logged all the items and now have a consistent
	 * version of the file in the log. It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
	if (skip_ilock)
		up_write(&inode->i_mmap_lock);
	else
		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);

	if (ret == BTRFS_NO_LOG_SYNC) {
		ret = btrfs_end_transaction(trans);
		goto out;
	}

	/* We successfully logged the inode, attempt to sync the log. */
	if (!ret) {
		ret = btrfs_sync_log(trans, root, &ctx);
		if (!ret) {
			ret = btrfs_end_transaction(trans);
			goto out;
		}
	}

	/*
	 * At this point we need to commit the transaction because we had
	 * btrfs_need_log_full_commit() or some other error.
	 *
	 * If we didn't do a full sync we have to stop the trans handle, wait on
	 * the ordered extents, start it again and commit the transaction. If
	 * we attempt to wait on the ordered extents here we could deadlock with
	 * something like fallocate() that is holding the extent lock trying to
	 * start a transaction while some other thread is trying to commit the
	 * transaction while we (fsync) are currently holding the transaction
	 * open.
	 */
	if (!full_sync) {
		ret = btrfs_end_transaction(trans);
		if (ret)
			goto out;
		ret = btrfs_wait_ordered_range(inode, start, len);
		if (ret)
			goto out;

		/*
		 * This is safe to use here because we're only interested in
		 * making sure the transaction that had the ordered extents is
		 * committed. We aren't waiting on anything past this point,
		 * we're purely getting the transaction and committing it.
		 */
		trans = btrfs_attach_transaction_barrier(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);

			/*
			 * We committed the transaction and there's no currently
			 * running transaction, this means everything we care
			 * about made it to disk and we are done.
			 */
			if (ret == -ENOENT)
				ret = 0;
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	free_extent_buffer(ctx.scratch_eb);
	ASSERT(list_empty(&ctx.list));
	ASSERT(list_empty(&ctx.conflict_inodes));
	err = file_check_and_advance_wb_err(file);
	if (!ret)
		ret = err;
	return ret > 0 ? -EIO : ret;
-EIO : ret; 1828 1829 out_release_extents: 1830 btrfs_release_log_ctx_extents(&ctx); 1831 if (skip_ilock) 1832 up_write(&inode->i_mmap_lock); 1833 else 1834 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 1835 goto out; 1836 } 1837 1838 /* 1839 * btrfs_page_mkwrite() is not allowed to change the file size as it gets 1840 * called from a page fault handler when a page is first dirtied. Hence we must 1841 * be careful to check for EOF conditions here. We set the page up correctly 1842 * for a written page which means we get ENOSPC checking when writing into 1843 * holes and correct delalloc and unwritten extent mapping on filesystems that 1844 * support these features. 1845 * 1846 * We are not allowed to take the i_mutex here so we have to play games to 1847 * protect against truncate races as the page could now be beyond EOF. Because 1848 * truncate_setsize() writes the inode size before removing pages, once we have 1849 * the page lock we can determine safely if the page is beyond EOF. If it is not 1850 * beyond EOF, then the page is guaranteed safe against truncation until we 1851 * unlock the page. 1852 */ 1853 static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) 1854 { 1855 struct page *page = vmf->page; 1856 struct folio *folio = page_folio(page); 1857 struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file)); 1858 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1859 struct extent_io_tree *io_tree = &inode->io_tree; 1860 struct btrfs_ordered_extent *ordered; 1861 struct extent_state *cached_state = NULL; 1862 struct extent_changeset *data_reserved = NULL; 1863 unsigned long zero_start; 1864 loff_t size; 1865 size_t fsize = folio_size(folio); 1866 int ret; 1867 bool only_release_metadata = false; 1868 u64 reserved_space; 1869 u64 page_start; 1870 u64 page_end; 1871 u64 end; 1872 1873 reserved_space = fsize; 1874 1875 sb_start_pagefault(inode->vfs_inode.i_sb); 1876 page_start = folio_pos(folio); 1877 page_end = page_start + folio_size(folio) - 1; 1878 end = page_end; 1879 1880 /* 1881 * Reserving delalloc space after obtaining the page lock can lead to 1882 * deadlock. For example, if a dirty page is locked by this function 1883 * and the call to btrfs_delalloc_reserve_space() ends up triggering 1884 * dirty page write out, then the btrfs_writepages() function could 1885 * end up waiting indefinitely to get a lock on the page currently 1886 * being processed by btrfs_page_mkwrite() function. 1887 */ 1888 ret = btrfs_check_data_free_space(inode, &data_reserved, page_start, 1889 reserved_space, false); 1890 if (ret < 0) { 1891 size_t write_bytes = reserved_space; 1892 1893 if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0) 1894 goto out_noreserve; 1895 1896 only_release_metadata = true; 1897 1898 /* 1899 * Can't write the whole range, there may be shared extents or 1900 * holes in the range, bail out with @only_release_metadata set 1901 * to true so that we unlock the nocow lock before returning the 1902 * error. 
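 * Otherwise we can NOCOW the whole range: no data space needs to be
 * reserved since the write goes to already allocated extents, so only
 * metadata is reserved below and the range is later marked with
 * EXTENT_NORESERVE so the delalloc accounting knows no data space was
 * reserved for it.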
1903 */ 1904 if (write_bytes < reserved_space) 1905 goto out_noreserve; 1906 } 1907 ret = btrfs_delalloc_reserve_metadata(inode, reserved_space, 1908 reserved_space, false); 1909 if (ret < 0) { 1910 if (!only_release_metadata) 1911 btrfs_free_reserved_data_space(inode, data_reserved, 1912 page_start, reserved_space); 1913 goto out_noreserve; 1914 } 1915 1916 ret = file_update_time(vmf->vma->vm_file); 1917 if (ret < 0) 1918 goto out; 1919 again: 1920 down_read(&inode->i_mmap_lock); 1921 folio_lock(folio); 1922 size = i_size_read(&inode->vfs_inode); 1923 1924 if ((folio->mapping != inode->vfs_inode.i_mapping) || 1925 (page_start >= size)) { 1926 /* Page got truncated out from underneath us. */ 1927 goto out_unlock; 1928 } 1929 folio_wait_writeback(folio); 1930 1931 btrfs_lock_extent(io_tree, page_start, page_end, &cached_state); 1932 ret = set_folio_extent_mapped(folio); 1933 if (ret < 0) { 1934 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); 1935 goto out_unlock; 1936 } 1937 1938 /* 1939 * We can't set the delalloc bits if there are pending ordered 1940 * extents. Drop our locks and wait for them to finish. 1941 */ 1942 ordered = btrfs_lookup_ordered_range(inode, page_start, fsize); 1943 if (ordered) { 1944 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); 1945 folio_unlock(folio); 1946 up_read(&inode->i_mmap_lock); 1947 btrfs_start_ordered_extent(ordered); 1948 btrfs_put_ordered_extent(ordered); 1949 goto again; 1950 } 1951 1952 if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) { 1953 reserved_space = round_up(size - page_start, fs_info->sectorsize); 1954 if (reserved_space < fsize) { 1955 const u64 to_free = fsize - reserved_space; 1956 1957 end = page_start + reserved_space - 1; 1958 if (only_release_metadata) 1959 btrfs_delalloc_release_metadata(inode, to_free, true); 1960 else 1961 btrfs_delalloc_release_space(inode, data_reserved, 1962 end + 1, to_free, true); 1963 } 1964 } 1965 1966 /* 1967 * page_mkwrite gets called when the page is firstly dirtied after it's 1968 * faulted in, but write(2) could also dirty a page and set delalloc 1969 * bits, thus in this case for space account reason, we still need to 1970 * clear any delalloc bits within this page range since we have to 1971 * reserve data&meta space before lock_page() (see above comments). 1972 */ 1973 btrfs_clear_extent_bit(io_tree, page_start, end, 1974 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 1975 EXTENT_DEFRAG, &cached_state); 1976 1977 ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state); 1978 if (ret < 0) { 1979 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); 1980 goto out_unlock; 1981 } 1982 1983 /* Page is wholly or partially inside EOF. 
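If the folio straddles EOF, the part beyond EOF is zeroed below so that stale data is not left in the page cache.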
*/ 1984 if (page_start + folio_size(folio) > size) 1985 zero_start = offset_in_folio(folio, size); 1986 else 1987 zero_start = fsize; 1988 1989 if (zero_start != fsize) 1990 folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); 1991 1992 btrfs_folio_clear_checked(fs_info, folio, page_start, fsize); 1993 btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); 1994 btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); 1995 1996 btrfs_set_inode_last_sub_trans(inode); 1997 1998 if (only_release_metadata) 1999 btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE, 2000 &cached_state); 2001 2002 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); 2003 up_read(&inode->i_mmap_lock); 2004 2005 btrfs_delalloc_release_extents(inode, fsize); 2006 if (only_release_metadata) 2007 btrfs_check_nocow_unlock(inode); 2008 sb_end_pagefault(inode->vfs_inode.i_sb); 2009 extent_changeset_free(data_reserved); 2010 return VM_FAULT_LOCKED; 2011 2012 out_unlock: 2013 folio_unlock(folio); 2014 up_read(&inode->i_mmap_lock); 2015 out: 2016 btrfs_delalloc_release_extents(inode, fsize); 2017 if (only_release_metadata) 2018 btrfs_delalloc_release_metadata(inode, reserved_space, true); 2019 else 2020 btrfs_delalloc_release_space(inode, data_reserved, page_start, 2021 reserved_space, true); 2022 extent_changeset_free(data_reserved); 2023 out_noreserve: 2024 if (only_release_metadata) 2025 btrfs_check_nocow_unlock(inode); 2026 2027 sb_end_pagefault(inode->vfs_inode.i_sb); 2028 2029 if (ret < 0) 2030 return vmf_error(ret); 2031 2032 /* Make the VM retry the fault. */ 2033 return VM_FAULT_NOPAGE; 2034 } 2035 2036 static const struct vm_operations_struct btrfs_file_vm_ops = { 2037 .fault = filemap_fault, 2038 .map_pages = filemap_map_pages, 2039 .page_mkwrite = btrfs_page_mkwrite, 2040 }; 2041 2042 static int btrfs_file_mmap_prepare(struct vm_area_desc *desc) 2043 { 2044 struct file *filp = desc->file; 2045 struct address_space *mapping = filp->f_mapping; 2046 2047 if (!mapping->a_ops->read_folio) 2048 return -ENOEXEC; 2049 2050 file_accessed(filp); 2051 desc->vm_ops = &btrfs_file_vm_ops; 2052 2053 return 0; 2054 } 2055 2056 static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, 2057 int slot, u64 start, u64 end) 2058 { 2059 struct btrfs_file_extent_item *fi; 2060 struct btrfs_key key; 2061 2062 if (slot < 0 || slot >= btrfs_header_nritems(leaf)) 2063 return false; 2064 2065 btrfs_item_key_to_cpu(leaf, &key, slot); 2066 if (key.objectid != btrfs_ino(inode) || 2067 key.type != BTRFS_EXTENT_DATA_KEY) 2068 return false; 2069 2070 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 2071 2072 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) 2073 return false; 2074 2075 if (btrfs_file_extent_disk_bytenr(leaf, fi)) 2076 return false; 2077 2078 if (key.offset == end) 2079 return true; 2080 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) 2081 return true; 2082 return false; 2083 } 2084 2085 static int fill_holes(struct btrfs_trans_handle *trans, 2086 struct btrfs_inode *inode, 2087 struct btrfs_path *path, u64 offset, u64 end) 2088 { 2089 struct btrfs_fs_info *fs_info = trans->fs_info; 2090 struct btrfs_root *root = inode->root; 2091 struct extent_buffer *leaf; 2092 struct btrfs_file_extent_item *fi; 2093 struct extent_map *hole_em; 2094 struct btrfs_key key; 2095 int ret; 2096 2097 if (btrfs_fs_incompat(fs_info, NO_HOLES)) 2098 goto out; 2099 2100 key.objectid = btrfs_ino(inode); 2101 
key.type = BTRFS_EXTENT_DATA_KEY; 2102 key.offset = offset; 2103 2104 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2105 if (ret <= 0) { 2106 /* 2107 * We should have dropped this offset, so if we find it then 2108 * something has gone horribly wrong. 2109 */ 2110 if (ret == 0) 2111 ret = -EINVAL; 2112 return ret; 2113 } 2114 2115 leaf = path->nodes[0]; 2116 if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) { 2117 u64 num_bytes; 2118 2119 path->slots[0]--; 2120 fi = btrfs_item_ptr(leaf, path->slots[0], 2121 struct btrfs_file_extent_item); 2122 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + 2123 end - offset; 2124 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2125 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 2126 btrfs_set_file_extent_offset(leaf, fi, 0); 2127 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 2128 goto out; 2129 } 2130 2131 if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) { 2132 u64 num_bytes; 2133 2134 key.offset = offset; 2135 btrfs_set_item_key_safe(trans, path, &key); 2136 fi = btrfs_item_ptr(leaf, path->slots[0], 2137 struct btrfs_file_extent_item); 2138 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - 2139 offset; 2140 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2141 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 2142 btrfs_set_file_extent_offset(leaf, fi, 0); 2143 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 2144 goto out; 2145 } 2146 btrfs_release_path(path); 2147 2148 ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, 2149 end - offset); 2150 if (ret) 2151 return ret; 2152 2153 out: 2154 btrfs_release_path(path); 2155 2156 hole_em = btrfs_alloc_extent_map(); 2157 if (!hole_em) { 2158 btrfs_drop_extent_map_range(inode, offset, end - 1, false); 2159 btrfs_set_inode_full_sync(inode); 2160 } else { 2161 hole_em->start = offset; 2162 hole_em->len = end - offset; 2163 hole_em->ram_bytes = hole_em->len; 2164 2165 hole_em->disk_bytenr = EXTENT_MAP_HOLE; 2166 hole_em->disk_num_bytes = 0; 2167 hole_em->generation = trans->transid; 2168 2169 ret = btrfs_replace_extent_map_range(inode, hole_em, true); 2170 btrfs_free_extent_map(hole_em); 2171 if (ret) 2172 btrfs_set_inode_full_sync(inode); 2173 } 2174 2175 return 0; 2176 } 2177 2178 /* 2179 * Find a hole extent on given inode and change start/len to the end of hole 2180 * extent.(hole/vacuum extent whose em->start <= start && 2181 * em->start + em->len > start) 2182 * When a hole extent is found, return 1 and modify start/len. 2183 */ 2184 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) 2185 { 2186 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2187 struct extent_map *em; 2188 int ret = 0; 2189 2190 em = btrfs_get_extent(inode, NULL, 2191 round_down(*start, fs_info->sectorsize), 2192 round_up(*len, fs_info->sectorsize)); 2193 if (IS_ERR(em)) 2194 return PTR_ERR(em); 2195 2196 /* Hole or vacuum extent(only exists in no-hole mode) */ 2197 if (em->disk_bytenr == EXTENT_MAP_HOLE) { 2198 ret = 1; 2199 *len = em->start + em->len > *start + *len ? 2200 0 : *start + *len - em->start - em->len; 2201 *start = em->start + em->len; 2202 } 2203 btrfs_free_extent_map(em); 2204 return ret; 2205 } 2206 2207 /* 2208 * Check if there is no folio in the range. 
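 * Returns true if a folio is found that lies completely inside the range,
 * false otherwise.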
2209 * 2210 * We cannot utilize filemap_range_has_page() in a filemap with large folios 2211 * as we can hit the following false positive: 2212 *
2213 * start end 2214 * | | 2215 * |//|//|//|//| | | | | | | | |//|//| 2216 * \ / \ / 2217 * Folio A Folio B 2218 *
2219 * Large folios A and B cover the start and end indexes. 2220 * In that case filemap_range_has_page() will always return true, but the above 2221 * case is fine for btrfs_punch_hole_lock_range() usage. 2222 *
2223 * So here we only ensure that no other folio is in the range, excluding the 2224 * head/tail large folio. 2225 */ 2226 static bool check_range_has_page(struct inode *inode, u64 start, u64 end) 2227 { 2228 struct folio_batch fbatch; 2229 bool ret = false; 2230 /* 2231 * For the subpage case, if the range is not at a page boundary, we could 2232 * have pages at the leading/trailing part of the range. 2233 * This could lead to an infinite loop since filemap_range_has_page() 2234 * will always return true. 2235 * So here we need to do extra page alignment for 2236 * filemap_range_has_page(). 2237 *
2238 * And do not decrease page_lockend right now, as it can be 0. 2239 */ 2240 const u64 page_lockstart = round_up(start, PAGE_SIZE); 2241 const u64 page_lockend = round_down(end + 1, PAGE_SIZE); 2242 const pgoff_t start_index = page_lockstart >> PAGE_SHIFT; 2243 const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT; 2244 pgoff_t tmp = start_index; 2245 int found_folios; 2246
2247 /* The same page or adjacent pages. */ 2248 if (page_lockend <= page_lockstart) 2249 return false; 2250 2251 folio_batch_init(&fbatch); 2252 found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch); 2253 for (int i = 0; i < found_folios; i++) { 2254 struct folio *folio = fbatch.folios[i]; 2255
2256 /* A large folio begins before the start. Not a target. */ 2257 if (folio->index < start_index) 2258 continue; 2259 /* A large folio extends beyond the end. Not a target. */ 2260 if (folio_next_index(folio) > end_index) 2261 continue; 2262 /* A folio doesn't cover the head/tail index. Found a target. */ 2263 ret = true; 2264 break; 2265 } 2266 folio_batch_release(&fbatch); 2267 return ret; 2268 } 2269
2270 static void btrfs_punch_hole_lock_range(struct inode *inode, 2271 const u64 lockstart, const u64 lockend, 2272 struct extent_state **cached_state) 2273 { 2274 while (1) { 2275 truncate_pagecache_range(inode, lockstart, lockend); 2276 2277 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2278 cached_state); 2279 /* 2280 * We can't have ordered extents in the range, nor dirty/writeback 2281 * pages, because we have locked the inode's VFS lock in exclusive 2282 * mode, we have locked the inode's i_mmap_lock in exclusive mode, 2283 * we have flushed all delalloc in the range and we have waited 2284 * for any ordered extents in the range to complete. 2285 * We can race with anyone reading pages from this range, so after 2286 * locking the range check if we have pages in the range, and if 2287 * we do, unlock the range and retry.
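 * The truncation at the top of the loop is redone on every retry, so any
 * folio that was faulted in after the previous attempt is dropped again
 * before we re-check.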
2288 */ 2289 if (!check_range_has_page(inode, lockstart, lockend)) 2290 break; 2291 2292 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2293 cached_state); 2294 } 2295 2296 btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); 2297 } 2298 2299 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, 2300 struct btrfs_inode *inode, 2301 struct btrfs_path *path, 2302 struct btrfs_replace_extent_info *extent_info, 2303 const u64 replace_len, 2304 const u64 bytes_to_drop) 2305 { 2306 struct btrfs_fs_info *fs_info = trans->fs_info; 2307 struct btrfs_root *root = inode->root; 2308 struct btrfs_file_extent_item *extent; 2309 struct extent_buffer *leaf; 2310 struct btrfs_key key; 2311 int slot; 2312 int ret; 2313 2314 if (replace_len == 0) 2315 return 0; 2316 2317 if (extent_info->disk_offset == 0 && 2318 btrfs_fs_incompat(fs_info, NO_HOLES)) { 2319 btrfs_update_inode_bytes(inode, 0, bytes_to_drop); 2320 return 0; 2321 } 2322 2323 key.objectid = btrfs_ino(inode); 2324 key.type = BTRFS_EXTENT_DATA_KEY; 2325 key.offset = extent_info->file_offset; 2326 ret = btrfs_insert_empty_item(trans, root, path, &key, 2327 sizeof(struct btrfs_file_extent_item)); 2328 if (ret) 2329 return ret; 2330 leaf = path->nodes[0]; 2331 slot = path->slots[0]; 2332 write_extent_buffer(leaf, extent_info->extent_buf, 2333 btrfs_item_ptr_offset(leaf, slot), 2334 sizeof(struct btrfs_file_extent_item)); 2335 extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 2336 ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); 2337 btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset); 2338 btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); 2339 if (extent_info->is_new_extent) 2340 btrfs_set_file_extent_generation(leaf, extent, trans->transid); 2341 btrfs_release_path(path); 2342 2343 ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, 2344 replace_len); 2345 if (ret) 2346 return ret; 2347 2348 /* If it's a hole, nothing more needs to be done. */ 2349 if (extent_info->disk_offset == 0) { 2350 btrfs_update_inode_bytes(inode, 0, bytes_to_drop); 2351 return 0; 2352 } 2353 2354 btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop); 2355 2356 if (extent_info->is_new_extent && extent_info->insertions == 0) { 2357 key.objectid = extent_info->disk_offset; 2358 key.type = BTRFS_EXTENT_ITEM_KEY; 2359 key.offset = extent_info->disk_len; 2360 ret = btrfs_alloc_reserved_file_extent(trans, root, 2361 btrfs_ino(inode), 2362 extent_info->file_offset, 2363 extent_info->qgroup_reserved, 2364 &key); 2365 } else { 2366 struct btrfs_ref ref = { 2367 .action = BTRFS_ADD_DELAYED_REF, 2368 .bytenr = extent_info->disk_offset, 2369 .num_bytes = extent_info->disk_len, 2370 .owning_root = btrfs_root_id(root), 2371 .ref_root = btrfs_root_id(root), 2372 }; 2373 u64 ref_offset; 2374 2375 ref_offset = extent_info->file_offset - extent_info->data_offset; 2376 btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false); 2377 ret = btrfs_inc_extent_ref(trans, &ref); 2378 } 2379 2380 extent_info->insertions++; 2381 2382 return ret; 2383 } 2384 2385 /* 2386 * The respective range must have been previously locked, as well as the inode. 2387 * The end offset is inclusive (last byte of the range). 2388 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing 2389 * the file range with an extent. 
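 * On success the last transaction handle used is returned through @trans_out
 * and it is the caller's responsibility to commit or end it, typically after
 * updating the inode in the same transaction.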
2390 * When not punching a hole, we don't want to end up in a state where we dropped 2391 * extents without inserting a new one, so we must abort the transaction to avoid 2392 * a corruption. 2393 */ 2394 int btrfs_replace_file_extents(struct btrfs_inode *inode, 2395 struct btrfs_path *path, const u64 start, 2396 const u64 end, 2397 struct btrfs_replace_extent_info *extent_info, 2398 struct btrfs_trans_handle **trans_out) 2399 { 2400 struct btrfs_drop_extents_args drop_args = { 0 }; 2401 struct btrfs_root *root = inode->root; 2402 struct btrfs_fs_info *fs_info = root->fs_info; 2403 u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); 2404 u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); 2405 struct btrfs_trans_handle *trans = NULL; 2406 struct btrfs_block_rsv rsv; 2407 unsigned int rsv_count; 2408 u64 cur_offset; 2409 u64 len = end - start; 2410 int ret = 0; 2411 2412 if (end <= start) 2413 return -EINVAL; 2414 2415 btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); 2416 rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1); 2417 rsv.failfast = true; 2418 2419 /* 2420 * 1 - update the inode 2421 * 1 - removing the extents in the range 2422 * 1 - adding the hole extent if no_holes isn't set or if we are 2423 * replacing the range with a new extent 2424 */ 2425 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info) 2426 rsv_count = 3; 2427 else 2428 rsv_count = 2; 2429 2430 trans = btrfs_start_transaction(root, rsv_count); 2431 if (IS_ERR(trans)) { 2432 ret = PTR_ERR(trans); 2433 trans = NULL; 2434 goto out_release; 2435 } 2436 2437 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv, 2438 min_size, false); 2439 if (WARN_ON(ret)) 2440 goto out_trans; 2441 trans->block_rsv = &rsv; 2442 2443 cur_offset = start; 2444 drop_args.path = path; 2445 drop_args.end = end + 1; 2446 drop_args.drop_cache = true; 2447 while (cur_offset < end) { 2448 drop_args.start = cur_offset; 2449 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 2450 /* If we are punching a hole decrement the inode's byte count */ 2451 if (!extent_info) 2452 btrfs_update_inode_bytes(inode, 0, 2453 drop_args.bytes_found); 2454 if (ret != -ENOSPC) { 2455 /* 2456 * The only time we don't want to abort is if we are 2457 * attempting to clone a partial inline extent, in which 2458 * case we'll get EOPNOTSUPP. However if we aren't 2459 * clone we need to abort no matter what, because if we 2460 * got EOPNOTSUPP via prealloc then we messed up and 2461 * need to abort. 2462 */ 2463 if (unlikely(ret && 2464 (ret != -EOPNOTSUPP || 2465 (extent_info && extent_info->is_new_extent)))) 2466 btrfs_abort_transaction(trans, ret); 2467 break; 2468 } 2469 2470 trans->block_rsv = &fs_info->trans_block_rsv; 2471 2472 if (!extent_info && cur_offset < drop_args.drop_end && 2473 cur_offset < ino_size) { 2474 ret = fill_holes(trans, inode, path, cur_offset, 2475 drop_args.drop_end); 2476 if (unlikely(ret)) { 2477 /* 2478 * If we failed then we didn't insert our hole 2479 * entries for the area we dropped, so now the 2480 * fs is corrupted, so we must abort the 2481 * transaction. 2482 */ 2483 btrfs_abort_transaction(trans, ret); 2484 break; 2485 } 2486 } else if (!extent_info && cur_offset < drop_args.drop_end) { 2487 /* 2488 * We are past the i_size here, but since we didn't 2489 * insert holes we need to clear the mapped area so we 2490 * know to not set disk_i_size in this area until a new 2491 * file extent is inserted here. 
2492 */ 2493 ret = btrfs_inode_clear_file_extent_range(inode, 2494 cur_offset, 2495 drop_args.drop_end - cur_offset); 2496 if (unlikely(ret)) { 2497 /* 2498 * We couldn't clear our area, so we could 2499 * presumably adjust up and corrupt the fs, so 2500 * we need to abort. 2501 */ 2502 btrfs_abort_transaction(trans, ret); 2503 break; 2504 } 2505 } 2506
2507 if (extent_info && 2508 drop_args.drop_end > extent_info->file_offset) { 2509 u64 replace_len = drop_args.drop_end - 2510 extent_info->file_offset; 2511 2512 ret = btrfs_insert_replace_extent(trans, inode, path, 2513 extent_info, replace_len, 2514 drop_args.bytes_found); 2515 if (unlikely(ret)) { 2516 btrfs_abort_transaction(trans, ret); 2517 break; 2518 } 2519 extent_info->data_len -= replace_len; 2520 extent_info->data_offset += replace_len; 2521 extent_info->file_offset += replace_len; 2522 } 2523
2524 /* 2525 * We are releasing our handle on the transaction, balance the 2526 * dirty pages of the btree inode and flush delayed items, and 2527 * then get a new transaction handle, which may now point to a 2528 * new transaction in case someone else may have committed the 2529 * transaction we used to replace/drop file extent items. So 2530 * bump the inode's iversion and update mtime and ctime except 2531 * if we are called from a dedupe context. This is because a 2532 * power failure/crash may happen after the transaction is 2533 * committed and before we finish replacing/dropping all the 2534 * file extent items we need. 2535 */ 2536 inode_inc_iversion(&inode->vfs_inode); 2537
2538 if (!extent_info || extent_info->update_times) 2539 inode_set_mtime_to_ts(&inode->vfs_inode, 2540 inode_set_ctime_current(&inode->vfs_inode)); 2541 2542 ret = btrfs_update_inode(trans, inode); 2543 if (ret) 2544 break; 2545 2546 btrfs_end_transaction(trans); 2547 btrfs_btree_balance_dirty(fs_info); 2548 2549 trans = btrfs_start_transaction(root, rsv_count); 2550 if (IS_ERR(trans)) { 2551 ret = PTR_ERR(trans); 2552 trans = NULL; 2553 break; 2554 } 2555
2556 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, 2557 &rsv, min_size, false); 2558 if (WARN_ON(ret)) 2559 break; 2560 trans->block_rsv = &rsv; 2561 2562 cur_offset = drop_args.drop_end; 2563 len = end - cur_offset; 2564 if (!extent_info && len) { 2565 ret = find_first_non_hole(inode, &cur_offset, &len); 2566 if (unlikely(ret < 0)) 2567 break; 2568 if (ret && !len) { 2569 ret = 0; 2570 break; 2571 } 2572 } 2573 } 2574
2575 /* 2576 * If we were cloning, force the next fsync to be a full one since we 2577 * replaced (or just dropped in the case of cloning holes when 2578 * NO_HOLES is enabled) file extent items and did not set up new extent 2579 * maps for the replacement extents (or holes). 2580 */ 2581 if (extent_info && !extent_info->is_new_extent) 2582 btrfs_set_inode_full_sync(inode); 2583 2584 if (ret) 2585 goto out_trans; 2586
2587 trans->block_rsv = &fs_info->trans_block_rsv; 2588 /* 2589 * If we are using the NO_HOLES feature we might have had already a 2590 * hole that overlaps a part of the region [lockstart, lockend] and 2591 * ends at (or beyond) lockend. Since we have no file extent items to 2592 * represent holes, drop_end can be less than lockend and so we must 2593 * make sure we have an extent map representing the existing hole (the 2594 * call to __btrfs_drop_extents() might have dropped the existing extent 2595 * map representing the existing hole), otherwise the fast fsync path 2596 * will not record the existence of the hole region 2597 * [existing_hole_start, lockend].
2598 */ 2599 if (drop_args.drop_end <= end) 2600 drop_args.drop_end = end + 1; 2601 /* 2602 * Don't insert file hole extent item if it's for a range beyond eof 2603 * (because it's useless) or if it represents a 0 bytes range (when 2604 * cur_offset == drop_end). 2605 */ 2606 if (!extent_info && cur_offset < ino_size && 2607 cur_offset < drop_args.drop_end) { 2608 ret = fill_holes(trans, inode, path, cur_offset, 2609 drop_args.drop_end); 2610 if (unlikely(ret)) { 2611 /* Same comment as above. */ 2612 btrfs_abort_transaction(trans, ret); 2613 goto out_trans; 2614 } 2615 } else if (!extent_info && cur_offset < drop_args.drop_end) { 2616 /* See the comment in the loop above for the reasoning here. */ 2617 ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, 2618 drop_args.drop_end - cur_offset); 2619 if (unlikely(ret)) { 2620 btrfs_abort_transaction(trans, ret); 2621 goto out_trans; 2622 } 2623 2624 } 2625 if (extent_info) { 2626 ret = btrfs_insert_replace_extent(trans, inode, path, 2627 extent_info, extent_info->data_len, 2628 drop_args.bytes_found); 2629 if (unlikely(ret)) { 2630 btrfs_abort_transaction(trans, ret); 2631 goto out_trans; 2632 } 2633 } 2634 2635 out_trans: 2636 if (!trans) 2637 goto out_release; 2638 2639 trans->block_rsv = &fs_info->trans_block_rsv; 2640 if (ret) 2641 btrfs_end_transaction(trans); 2642 else 2643 *trans_out = trans; 2644 out_release: 2645 btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); 2646 return ret; 2647 } 2648 2649 static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) 2650 { 2651 struct inode *inode = file_inode(file); 2652 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 2653 struct btrfs_root *root = BTRFS_I(inode)->root; 2654 struct extent_state *cached_state = NULL; 2655 struct btrfs_path *path; 2656 struct btrfs_trans_handle *trans = NULL; 2657 u64 lockstart; 2658 u64 lockend; 2659 u64 tail_start; 2660 u64 tail_len; 2661 const u64 orig_start = offset; 2662 const u64 orig_end = offset + len - 1; 2663 int ret = 0; 2664 bool same_block; 2665 u64 ino_size; 2666 bool truncated_block = false; 2667 bool updated_inode = false; 2668 2669 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 2670 2671 ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len); 2672 if (ret) 2673 goto out_only_mutex; 2674 2675 ino_size = round_up(inode->i_size, fs_info->sectorsize); 2676 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); 2677 if (ret < 0) 2678 goto out_only_mutex; 2679 if (ret && !len) { 2680 /* Already in a large hole */ 2681 ret = 0; 2682 goto out_only_mutex; 2683 } 2684 2685 ret = file_modified(file); 2686 if (ret) 2687 goto out_only_mutex; 2688 2689 lockstart = round_up(offset, fs_info->sectorsize); 2690 lockend = round_down(offset + len, fs_info->sectorsize) - 1; 2691 same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) 2692 == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); 2693 /* 2694 * Only do this if we are in the same block and we aren't doing the 2695 * entire block. 
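 * In that case zeroing the partial block is all that is needed, no file
 * extent items have to be dropped or replaced.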
2696 */ 2697 if (same_block && len < fs_info->sectorsize) { 2698 if (offset < ino_size) { 2699 truncated_block = true; 2700 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, 2701 orig_start, orig_end); 2702 } else { 2703 ret = 0; 2704 } 2705 goto out_only_mutex; 2706 } 2707 2708 /* zero back part of the first block */ 2709 if (offset < ino_size) { 2710 truncated_block = true; 2711 ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end); 2712 if (ret) { 2713 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 2714 return ret; 2715 } 2716 } 2717 2718 /* Check the aligned pages after the first unaligned page, 2719 * if offset != orig_start, which means the first unaligned page 2720 * including several following pages are already in holes, 2721 * the extra check can be skipped */ 2722 if (offset == orig_start) { 2723 /* after truncate page, check hole again */ 2724 len = offset + len - lockstart; 2725 offset = lockstart; 2726 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); 2727 if (ret < 0) 2728 goto out_only_mutex; 2729 if (ret && !len) { 2730 ret = 0; 2731 goto out_only_mutex; 2732 } 2733 lockstart = offset; 2734 } 2735 2736 /* Check the tail unaligned part is in a hole */ 2737 tail_start = lockend + 1; 2738 tail_len = offset + len - tail_start; 2739 if (tail_len) { 2740 ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len); 2741 if (unlikely(ret < 0)) 2742 goto out_only_mutex; 2743 if (!ret) { 2744 /* zero the front end of the last page */ 2745 if (tail_start + tail_len < ino_size) { 2746 truncated_block = true; 2747 ret = btrfs_truncate_block(BTRFS_I(inode), 2748 tail_start + tail_len - 1, 2749 orig_start, orig_end); 2750 if (ret) 2751 goto out_only_mutex; 2752 } 2753 } 2754 } 2755 2756 if (lockend < lockstart) { 2757 ret = 0; 2758 goto out_only_mutex; 2759 } 2760 2761 btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); 2762 2763 path = btrfs_alloc_path(); 2764 if (!path) { 2765 ret = -ENOMEM; 2766 goto out; 2767 } 2768 2769 ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart, 2770 lockend, NULL, &trans); 2771 btrfs_free_path(path); 2772 if (ret) 2773 goto out; 2774 2775 ASSERT(trans != NULL); 2776 inode_inc_iversion(inode); 2777 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); 2778 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 2779 updated_inode = true; 2780 btrfs_end_transaction(trans); 2781 btrfs_btree_balance_dirty(fs_info); 2782 out: 2783 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2784 &cached_state); 2785 out_only_mutex: 2786 if (!updated_inode && truncated_block && !ret) { 2787 /* 2788 * If we only end up zeroing part of a page, we still need to 2789 * update the inode item, so that all the time fields are 2790 * updated as well as the necessary btrfs inode in memory fields 2791 * for detecting, at fsync time, if the inode isn't yet in the 2792 * log tree or it's there but not up to date. 
2793 */ 2794 struct timespec64 now = inode_set_ctime_current(inode); 2795 2796 inode_inc_iversion(inode); 2797 inode_set_mtime_to_ts(inode, now); 2798 trans = btrfs_start_transaction(root, 1); 2799 if (IS_ERR(trans)) { 2800 ret = PTR_ERR(trans); 2801 } else { 2802 int ret2; 2803 2804 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 2805 ret2 = btrfs_end_transaction(trans); 2806 if (!ret) 2807 ret = ret2; 2808 } 2809 } 2810 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 2811 return ret; 2812 } 2813 2814 /* Helper structure to record which range is already reserved */ 2815 struct falloc_range { 2816 struct list_head list; 2817 u64 start; 2818 u64 len; 2819 }; 2820 2821 /* 2822 * Helper function to add falloc range 2823 * 2824 * Caller should have locked the larger range of extent containing 2825 * [start, len) 2826 */ 2827 static int add_falloc_range(struct list_head *head, u64 start, u64 len) 2828 { 2829 struct falloc_range *range = NULL; 2830 2831 if (!list_empty(head)) { 2832 /* 2833 * As fallocate iterates by bytenr order, we only need to check 2834 * the last range. 2835 */ 2836 range = list_last_entry(head, struct falloc_range, list); 2837 if (range->start + range->len == start) { 2838 range->len += len; 2839 return 0; 2840 } 2841 } 2842 2843 range = kmalloc(sizeof(*range), GFP_KERNEL); 2844 if (!range) 2845 return -ENOMEM; 2846 range->start = start; 2847 range->len = len; 2848 list_add_tail(&range->list, head); 2849 return 0; 2850 } 2851 2852 static int btrfs_fallocate_update_isize(struct inode *inode, 2853 const u64 end, 2854 const int mode) 2855 { 2856 struct btrfs_trans_handle *trans; 2857 struct btrfs_root *root = BTRFS_I(inode)->root; 2858 u64 range_start; 2859 u64 range_end; 2860 int ret; 2861 int ret2; 2862 2863 if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) 2864 return 0; 2865 2866 range_start = round_down(i_size_read(inode), root->fs_info->sectorsize); 2867 range_end = round_up(end, root->fs_info->sectorsize); 2868 2869 ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), range_start, 2870 range_end - range_start); 2871 if (ret) 2872 return ret; 2873 2874 trans = btrfs_start_transaction(root, 1); 2875 if (IS_ERR(trans)) 2876 return PTR_ERR(trans); 2877 2878 inode_set_ctime_current(inode); 2879 i_size_write(inode, end); 2880 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 2881 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 2882 ret2 = btrfs_end_transaction(trans); 2883 2884 return ret ? 
ret : ret2; 2885 } 2886 2887 enum { 2888 RANGE_BOUNDARY_WRITTEN_EXTENT, 2889 RANGE_BOUNDARY_PREALLOC_EXTENT, 2890 RANGE_BOUNDARY_HOLE, 2891 }; 2892 2893 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, 2894 u64 offset) 2895 { 2896 const u64 sectorsize = inode->root->fs_info->sectorsize; 2897 struct extent_map *em; 2898 int ret; 2899 2900 offset = round_down(offset, sectorsize); 2901 em = btrfs_get_extent(inode, NULL, offset, sectorsize); 2902 if (IS_ERR(em)) 2903 return PTR_ERR(em); 2904 2905 if (em->disk_bytenr == EXTENT_MAP_HOLE) 2906 ret = RANGE_BOUNDARY_HOLE; 2907 else if (em->flags & EXTENT_FLAG_PREALLOC) 2908 ret = RANGE_BOUNDARY_PREALLOC_EXTENT; 2909 else 2910 ret = RANGE_BOUNDARY_WRITTEN_EXTENT; 2911 2912 btrfs_free_extent_map(em); 2913 return ret; 2914 } 2915 2916 static int btrfs_zero_range(struct inode *inode, 2917 loff_t offset, 2918 loff_t len, 2919 const int mode) 2920 { 2921 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2922 struct extent_map *em; 2923 struct extent_changeset *data_reserved = NULL; 2924 int ret; 2925 u64 alloc_hint = 0; 2926 const u64 sectorsize = fs_info->sectorsize; 2927 const u64 orig_start = offset; 2928 const u64 orig_end = offset + len - 1; 2929 u64 alloc_start = round_down(offset, sectorsize); 2930 u64 alloc_end = round_up(offset + len, sectorsize); 2931 u64 bytes_to_reserve = 0; 2932 bool space_reserved = false; 2933 2934 em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, 2935 alloc_end - alloc_start); 2936 if (IS_ERR(em)) { 2937 ret = PTR_ERR(em); 2938 goto out; 2939 } 2940 2941 /* 2942 * Avoid hole punching and extent allocation for some cases. More cases 2943 * could be considered, but these are unlikely common and we keep things 2944 * as simple as possible for now. Also, intentionally, if the target 2945 * range contains one or more prealloc extents together with regular 2946 * extents and holes, we drop all the existing extents and allocate a 2947 * new prealloc extent, so that we get a larger contiguous disk extent. 2948 */ 2949 if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) { 2950 const u64 em_end = em->start + em->len; 2951 2952 if (em_end >= offset + len) { 2953 /* 2954 * The whole range is already a prealloc extent, 2955 * do nothing except updating the inode's i_size if 2956 * needed. 2957 */ 2958 btrfs_free_extent_map(em); 2959 ret = btrfs_fallocate_update_isize(inode, offset + len, 2960 mode); 2961 goto out; 2962 } 2963 /* 2964 * Part of the range is already a prealloc extent, so operate 2965 * only on the remaining part of the range. 
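 * The allocation hint is set to the disk block right after the existing
 * prealloc extent, so that the new allocation is, if possible, contiguous
 * with it on disk.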
2966 */ 2967 alloc_start = em_end; 2968 ASSERT(IS_ALIGNED(alloc_start, sectorsize)); 2969 len = offset + len - alloc_start; 2970 offset = alloc_start; 2971 alloc_hint = btrfs_extent_map_block_start(em) + em->len; 2972 } 2973 btrfs_free_extent_map(em); 2974 2975 if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == 2976 BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { 2977 em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize); 2978 if (IS_ERR(em)) { 2979 ret = PTR_ERR(em); 2980 goto out; 2981 } 2982 2983 if (em->flags & EXTENT_FLAG_PREALLOC) { 2984 btrfs_free_extent_map(em); 2985 ret = btrfs_fallocate_update_isize(inode, offset + len, 2986 mode); 2987 goto out; 2988 } 2989 if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) { 2990 btrfs_free_extent_map(em); 2991 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, 2992 orig_start, orig_end); 2993 if (!ret) 2994 ret = btrfs_fallocate_update_isize(inode, 2995 offset + len, 2996 mode); 2997 return ret; 2998 } 2999 btrfs_free_extent_map(em); 3000 alloc_start = round_down(offset, sectorsize); 3001 alloc_end = alloc_start + sectorsize; 3002 goto reserve_space; 3003 } 3004 3005 alloc_start = round_up(offset, sectorsize); 3006 alloc_end = round_down(offset + len, sectorsize); 3007 3008 /* 3009 * For unaligned ranges, check the pages at the boundaries, they might 3010 * map to an extent, in which case we need to partially zero them, or 3011 * they might map to a hole, in which case we need our allocation range 3012 * to cover them. 3013 */ 3014 if (!IS_ALIGNED(offset, sectorsize)) { 3015 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), 3016 offset); 3017 if (ret < 0) 3018 goto out; 3019 if (ret == RANGE_BOUNDARY_HOLE) { 3020 alloc_start = round_down(offset, sectorsize); 3021 ret = 0; 3022 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { 3023 ret = btrfs_truncate_block(BTRFS_I(inode), offset, 3024 orig_start, orig_end); 3025 if (ret) 3026 goto out; 3027 } else { 3028 ret = 0; 3029 } 3030 } 3031 3032 if (!IS_ALIGNED(offset + len, sectorsize)) { 3033 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), 3034 offset + len); 3035 if (ret < 0) 3036 goto out; 3037 if (ret == RANGE_BOUNDARY_HOLE) { 3038 alloc_end = round_up(offset + len, sectorsize); 3039 ret = 0; 3040 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { 3041 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, 3042 orig_start, orig_end); 3043 if (ret) 3044 goto out; 3045 } else { 3046 ret = 0; 3047 } 3048 } 3049 3050 reserve_space: 3051 if (alloc_start < alloc_end) { 3052 struct extent_state *cached_state = NULL; 3053 const u64 lockstart = alloc_start; 3054 const u64 lockend = alloc_end - 1; 3055 3056 bytes_to_reserve = alloc_end - alloc_start; 3057 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), 3058 bytes_to_reserve); 3059 if (ret < 0) 3060 goto out; 3061 space_reserved = true; 3062 btrfs_punch_hole_lock_range(inode, lockstart, lockend, 3063 &cached_state); 3064 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, 3065 alloc_start, bytes_to_reserve); 3066 if (ret) { 3067 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, 3068 lockend, &cached_state); 3069 goto out; 3070 } 3071 ret = btrfs_prealloc_file_range(inode, mode, alloc_start, 3072 alloc_end - alloc_start, 3073 fs_info->sectorsize, 3074 offset + len, &alloc_hint); 3075 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 3076 &cached_state); 3077 /* btrfs_prealloc_file_range releases reserved space on error */ 3078 if (ret) { 3079 space_reserved = 
false; 3080 goto out; 3081 } 3082 } 3083 ret = btrfs_fallocate_update_isize(inode, offset + len, mode); 3084 out: 3085 if (ret && space_reserved) 3086 btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, 3087 alloc_start, bytes_to_reserve); 3088 extent_changeset_free(data_reserved); 3089 3090 return ret; 3091 } 3092
3093 static long btrfs_fallocate(struct file *file, int mode, 3094 loff_t offset, loff_t len) 3095 { 3096 struct inode *inode = file_inode(file); 3097 struct extent_state *cached_state = NULL; 3098 struct extent_changeset *data_reserved = NULL; 3099 struct falloc_range *range; 3100 struct falloc_range *tmp; 3101 LIST_HEAD(reserve_list); 3102 u64 cur_offset; 3103 u64 last_byte; 3104 u64 alloc_start; 3105 u64 alloc_end; 3106 u64 alloc_hint = 0; 3107 u64 locked_end; 3108 u64 actual_end = 0; 3109 u64 data_space_needed = 0; 3110 u64 data_space_reserved = 0; 3111 u64 qgroup_reserved = 0; 3112 struct extent_map *em; 3113 int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; 3114 int ret; 3115
3116 /* Do not allow fallocate in ZONED mode */ 3117 if (btrfs_is_zoned(inode_to_fs_info(inode))) 3118 return -EOPNOTSUPP; 3119 3120 alloc_start = round_down(offset, blocksize); 3121 alloc_end = round_up(offset + len, blocksize); 3122 cur_offset = alloc_start; 3123 3124 /* Make sure we aren't being given some crap mode */ 3125 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 3126 FALLOC_FL_ZERO_RANGE)) 3127 return -EOPNOTSUPP; 3128 3129 if (mode & FALLOC_FL_PUNCH_HOLE) 3130 return btrfs_punch_hole(file, offset, len); 3131 3132 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 3133
3134 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { 3135 ret = inode_newsize_ok(inode, offset + len); 3136 if (ret) 3137 goto out; 3138 } 3139 3140 ret = file_modified(file); 3141 if (ret) 3142 goto out; 3143
3144 /* 3145 * TODO: Move these two operations after we have checked 3146 * accurate reserved space, or fallocate can still fail but 3147 * with page truncated or size expanded. 3148 * 3149 * But that's a minor problem and won't do much harm BTW. 3150 */ 3151 if (alloc_start > inode->i_size) { 3152 ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode), 3153 alloc_start); 3154 if (ret) 3155 goto out; 3156 } else if (offset + len > inode->i_size) { 3157 /* 3158 * If we are fallocating from the end of the file onward we 3159 * need to zero out the end of the block if i_size lands in the 3160 * middle of a block. 3161 */ 3162 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 3163 inode->i_size, (u64)-1); 3164 if (ret) 3165 goto out; 3166 } 3167
3168 /* 3169 * We have locked the inode at the VFS level (in exclusive mode) and we 3170 * have locked the i_mmap_lock (in exclusive mode). Now before 3171 * locking the file range, flush all delalloc in the range and wait for 3172 * all ordered extents in the range to complete. After this we can lock 3173 * the file range and, due to the previous locking we did, we know there 3174 * can't be more delalloc or ordered extents in the range.
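 * The btrfs_assert_inode_range_clean() call done below, after locking the
 * file range, sanity checks that expectation.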
3175 */ 3176 ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start, 3177 alloc_end - alloc_start); 3178 if (ret) 3179 goto out; 3180 3181 if (mode & FALLOC_FL_ZERO_RANGE) { 3182 ret = btrfs_zero_range(inode, offset, len, mode); 3183 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 3184 return ret; 3185 } 3186 3187 locked_end = alloc_end - 1; 3188 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 3189 &cached_state); 3190 3191 btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); 3192 3193 /* First, check if we exceed the qgroup limit */ 3194 while (cur_offset < alloc_end) { 3195 em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset, 3196 alloc_end - cur_offset); 3197 if (IS_ERR(em)) { 3198 ret = PTR_ERR(em); 3199 break; 3200 } 3201 last_byte = min(btrfs_extent_map_end(em), alloc_end); 3202 actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len); 3203 last_byte = ALIGN(last_byte, blocksize); 3204 if (em->disk_bytenr == EXTENT_MAP_HOLE || 3205 (cur_offset >= inode->i_size && 3206 !(em->flags & EXTENT_FLAG_PREALLOC))) { 3207 const u64 range_len = last_byte - cur_offset; 3208 3209 ret = add_falloc_range(&reserve_list, cur_offset, range_len); 3210 if (ret < 0) { 3211 btrfs_free_extent_map(em); 3212 break; 3213 } 3214 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), 3215 &data_reserved, cur_offset, range_len); 3216 if (ret < 0) { 3217 btrfs_free_extent_map(em); 3218 break; 3219 } 3220 qgroup_reserved += range_len; 3221 data_space_needed += range_len; 3222 } 3223 btrfs_free_extent_map(em); 3224 cur_offset = last_byte; 3225 } 3226 3227 if (!ret && data_space_needed > 0) { 3228 /* 3229 * We are safe to reserve space here as we can't have delalloc 3230 * in the range, see above. 3231 */ 3232 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), 3233 data_space_needed); 3234 if (!ret) 3235 data_space_reserved = data_space_needed; 3236 } 3237 3238 /* 3239 * If ret is still 0, means we're OK to fallocate. 3240 * Or just cleanup the list and exit. 3241 */ 3242 list_for_each_entry_safe(range, tmp, &reserve_list, list) { 3243 if (!ret) { 3244 ret = btrfs_prealloc_file_range(inode, mode, 3245 range->start, 3246 range->len, blocksize, 3247 offset + len, &alloc_hint); 3248 /* 3249 * btrfs_prealloc_file_range() releases space even 3250 * if it returns an error. 3251 */ 3252 data_space_reserved -= range->len; 3253 qgroup_reserved -= range->len; 3254 } else if (data_space_reserved > 0) { 3255 btrfs_free_reserved_data_space(BTRFS_I(inode), 3256 data_reserved, range->start, 3257 range->len); 3258 data_space_reserved -= range->len; 3259 qgroup_reserved -= range->len; 3260 } else if (qgroup_reserved > 0) { 3261 btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, 3262 range->start, range->len, NULL); 3263 qgroup_reserved -= range->len; 3264 } 3265 list_del(&range->list); 3266 kfree(range); 3267 } 3268 if (ret < 0) 3269 goto out_unlock; 3270 3271 /* 3272 * We didn't need to allocate any more space, but we still extended the 3273 * size of the file so we need to update i_size and the inode item. 3274 */ 3275 ret = btrfs_fallocate_update_isize(inode, actual_end, mode); 3276 out_unlock: 3277 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 3278 &cached_state); 3279 out: 3280 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 3281 extent_changeset_free(data_reserved); 3282 return ret; 3283 } 3284 3285 /* 3286 * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range 3287 * that has unflushed and/or flushing delalloc. 
There might be other adjacent 3288 * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps 3289 * looping while it gets adjacent subranges, and merging them together. 3290 */ 3291 static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, 3292 struct extent_state **cached_state, 3293 bool *search_io_tree, 3294 u64 *delalloc_start_ret, u64 *delalloc_end_ret) 3295 { 3296 u64 len = end + 1 - start; 3297 u64 delalloc_len = 0; 3298 struct btrfs_ordered_extent *oe; 3299 u64 oe_start; 3300 u64 oe_end; 3301 3302 /* 3303 * Search the io tree first for EXTENT_DELALLOC. If we find any, it 3304 * means we have delalloc (dirty pages) for which writeback has not 3305 * started yet. 3306 */ 3307 if (*search_io_tree) { 3308 spin_lock(&inode->lock); 3309 if (inode->delalloc_bytes > 0) { 3310 spin_unlock(&inode->lock); 3311 *delalloc_start_ret = start; 3312 delalloc_len = btrfs_count_range_bits(&inode->io_tree, 3313 delalloc_start_ret, end, 3314 len, EXTENT_DELALLOC, 1, 3315 cached_state); 3316 } else { 3317 spin_unlock(&inode->lock); 3318 } 3319 } 3320 3321 if (delalloc_len > 0) { 3322 /* 3323 * If delalloc was found then *delalloc_start_ret has a sector size 3324 * aligned value (rounded down). 3325 */ 3326 *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; 3327 3328 if (*delalloc_start_ret == start) { 3329 /* Delalloc for the whole range, nothing more to do. */ 3330 if (*delalloc_end_ret == end) 3331 return true; 3332 /* Else trim our search range for ordered extents. */ 3333 start = *delalloc_end_ret + 1; 3334 len = end + 1 - start; 3335 } 3336 } else { 3337 /* No delalloc, future calls don't need to search again. */ 3338 *search_io_tree = false; 3339 } 3340 3341 /* 3342 * Now also check if there's any ordered extent in the range. 3343 * We do this because: 3344 * 3345 * 1) When delalloc is flushed, the file range is locked, we clear the 3346 * EXTENT_DELALLOC bit from the io tree and create an extent map and 3347 * an ordered extent for the write. So we might just have been called 3348 * after delalloc is flushed and before the ordered extent completes 3349 * and inserts the new file extent item in the subvolume's btree; 3350 * 3351 * 2) We may have an ordered extent created by flushing delalloc for a 3352 * subrange that starts before the subrange we found marked with 3353 * EXTENT_DELALLOC in the io tree. 3354 * 3355 * We could also use the extent map tree to find such delalloc that is 3356 * being flushed, but using the ordered extents tree is more efficient 3357 * because it's usually much smaller as ordered extents are removed from 3358 * the tree once they complete. With the extent maps, we may have them 3359 * in the extent map tree for a very long time, and they were either 3360 * created by previous writes or loaded by read operations. 3361 */ 3362 oe = btrfs_lookup_first_ordered_range(inode, start, len); 3363 if (!oe) 3364 return (delalloc_len > 0); 3365 3366 /* The ordered extent may span beyond our search range. */ 3367 oe_start = max(oe->file_offset, start); 3368 oe_end = min(oe->file_offset + oe->num_bytes - 1, end); 3369 3370 btrfs_put_ordered_extent(oe); 3371 3372 /* Don't have unflushed delalloc, return the ordered extent range. */ 3373 if (delalloc_len == 0) { 3374 *delalloc_start_ret = oe_start; 3375 *delalloc_end_ret = oe_end; 3376 return true; 3377 } 3378 3379 /* 3380 * We have both unflushed delalloc (io_tree) and an ordered extent. 
3381 * If the ranges are adjacent, return a combined range, otherwise 3382 * return the leftmost range. 3383 */ 3384 if (oe_start < *delalloc_start_ret) { 3385 if (oe_end < *delalloc_start_ret) 3386 *delalloc_end_ret = oe_end; 3387 *delalloc_start_ret = oe_start; 3388 } else if (*delalloc_end_ret + 1 == oe_start) { 3389 *delalloc_end_ret = oe_end; 3390 } 3391 3392 return true; 3393 } 3394
3395 /* 3396 * Check if there's delalloc in a given range. 3397 * 3398 * @inode: The inode. 3399 * @start: The start offset of the range. It does not need to be 3400 * sector size aligned. 3401 * @end: The end offset (inclusive value) of the search range. 3402 * It does not need to be sector size aligned. 3403 * @cached_state: Extent state record used for speeding up delalloc 3404 * searches in the inode's io_tree. Can be NULL. 3405 * @delalloc_start_ret: Output argument, set to the start offset of the 3406 * subrange found with delalloc (may not be sector size 3407 * aligned). 3408 * @delalloc_end_ret: Output argument, set to the end offset (inclusive value) 3409 * of the subrange found with delalloc. 3410 *
3411 * Returns true if a subrange with delalloc is found within the given range, and 3412 * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and 3413 * end offsets of the subrange. 3414 */ 3415 bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, 3416 struct extent_state **cached_state, 3417 u64 *delalloc_start_ret, u64 *delalloc_end_ret) 3418 { 3419 u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); 3420 u64 prev_delalloc_end = 0; 3421 bool search_io_tree = true; 3422 bool ret = false; 3423
3424 while (cur_offset <= end) { 3425 u64 delalloc_start; 3426 u64 delalloc_end; 3427 bool delalloc; 3428 3429 delalloc = find_delalloc_subrange(inode, cur_offset, end, 3430 cached_state, &search_io_tree, 3431 &delalloc_start, 3432 &delalloc_end); 3433 if (!delalloc) 3434 break; 3435
3436 if (prev_delalloc_end == 0) { 3437 /* First subrange found. */ 3438 *delalloc_start_ret = max(delalloc_start, start); 3439 *delalloc_end_ret = delalloc_end; 3440 ret = true; 3441 } else if (delalloc_start == prev_delalloc_end + 1) { 3442 /* Subrange adjacent to the previous one, merge them. */ 3443 *delalloc_end_ret = delalloc_end; 3444 } else { 3445 /* Subrange not adjacent to the previous one, exit. */ 3446 break; 3447 } 3448
3449 prev_delalloc_end = delalloc_end; 3450 cur_offset = delalloc_end + 1; 3451 cond_resched(); 3452 } 3453 3454 return ret; 3455 } 3456
3457 /* 3458 * Check if there's a hole or delalloc range in a range representing a hole (or 3459 * prealloc extent) found in the inode's subvolume btree. 3460 * 3461 * @inode: The inode. 3462 * @whence: Seek mode (SEEK_DATA or SEEK_HOLE). 3463 * @start: Start offset of the hole region. It does not need to be sector 3464 * size aligned. 3465 * @end: End offset (inclusive value) of the hole region. It does not 3466 * need to be sector size aligned. 3467 * @start_ret: Return parameter, used to set the start of the subrange in the 3468 * hole that matches the search criteria (seek mode), if such 3469 * subrange is found (return value of the function is true). 3470 * The value returned here may not be sector size aligned. 3471 *
3472 * Returns true if a subrange matching the given seek mode is found, and if one 3473 * is found, it updates @start_ret with the start of the subrange.
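 * Note that "delalloc" here includes both unflushed dirty ranges and ordered
 * extents that have not completed yet, see btrfs_find_delalloc_in_range().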
3474 */ 3475 static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, 3476 struct extent_state **cached_state, 3477 u64 start, u64 end, u64 *start_ret) 3478 { 3479 u64 delalloc_start; 3480 u64 delalloc_end; 3481 bool delalloc; 3482
3483 delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state, 3484 &delalloc_start, &delalloc_end); 3485 if (delalloc && whence == SEEK_DATA) { 3486 *start_ret = delalloc_start; 3487 return true; 3488 } 3489
3490 if (delalloc && whence == SEEK_HOLE) { 3491 /* 3492 * We found delalloc but it starts after our start offset. So we 3493 * have a hole between our start offset and the delalloc start. 3494 */ 3495 if (start < delalloc_start) { 3496 *start_ret = start; 3497 return true; 3498 } 3499 /* 3500 * Delalloc range starts at our start offset. 3501 * If the delalloc range's length is smaller than our range, 3502 * then it means we have a hole that starts where the delalloc 3503 * subrange ends. 3504 */ 3505 if (delalloc_end < end) { 3506 *start_ret = delalloc_end + 1; 3507 return true; 3508 } 3509
3510 /* There's delalloc for the whole range. */ 3511 return false; 3512 } 3513 3514 if (!delalloc && whence == SEEK_HOLE) { 3515 *start_ret = start; 3516 return true; 3517 } 3518
3519 /* 3520 * No delalloc in the range and we are seeking for data. The caller has 3521 * to iterate to the next extent item in the subvolume btree. 3522 */ 3523 return false; 3524 } 3525
3526 static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) 3527 { 3528 struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host); 3529 struct btrfs_file_private *private; 3530 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3531 struct extent_state *cached_state = NULL; 3532 struct extent_state **delalloc_cached_state; 3533 const loff_t i_size = i_size_read(&inode->vfs_inode); 3534 const u64 ino = btrfs_ino(inode); 3535 struct btrfs_root *root = inode->root; 3536 struct btrfs_path *path; 3537 struct btrfs_key key; 3538 u64 last_extent_end; 3539 u64 lockstart; 3540 u64 lockend; 3541 u64 start; 3542 int ret; 3543 bool found = false; 3544
3545 if (i_size == 0 || offset >= i_size) 3546 return -ENXIO; 3547 3548 /* 3549 * Quick path. If the inode has no prealloc extents and its number of 3550 * bytes used matches its i_size, then it cannot have holes. 3551 */ 3552 if (whence == SEEK_HOLE && 3553 !(inode->flags & BTRFS_INODE_PREALLOC) && 3554 inode_get_bytes(&inode->vfs_inode) == i_size) 3555 return i_size; 3556
3557 spin_lock(&inode->lock); 3558 private = file->private_data; 3559 spin_unlock(&inode->lock); 3560 3561 if (private && private->owner_task != current) { 3562 /* 3563 * Not allocated by us, don't use it as its cached state is used 3564 * by the task that allocated it and we want neither to 3565 * mess with it nor to get incorrect results because it reflects an 3566 * invalid state for the current task. 3567 */ 3568 private = NULL; 3569 } else if (!private) { 3570 private = kzalloc(sizeof(*private), GFP_KERNEL); 3571 /* 3572 * No worries if memory allocation failed. 3573 * The private structure is used only for speeding up multiple 3574 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc, 3575 * so everything will still be correct.
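 * The cached state is also only reused by the task that allocated the
 * private structure (see the owner_task check above), so concurrent lseek
 * callers do not step on each other's cached state.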
static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
{
	struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
	struct btrfs_file_private *private;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_state *cached_state = NULL;
	struct extent_state **delalloc_cached_state;
	const loff_t i_size = i_size_read(&inode->vfs_inode);
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	u64 last_extent_end;
	u64 lockstart;
	u64 lockend;
	u64 start;
	int ret;
	bool found = false;

	if (i_size == 0 || offset >= i_size)
		return -ENXIO;

	/*
	 * Quick path. If the inode has no prealloc extents and its number of
	 * bytes used matches its i_size, then it cannot have holes.
	 */
	if (whence == SEEK_HOLE &&
	    !(inode->flags & BTRFS_INODE_PREALLOC) &&
	    inode_get_bytes(&inode->vfs_inode) == i_size)
		return i_size;

	spin_lock(&inode->lock);
	private = file->private_data;
	spin_unlock(&inode->lock);

	if (private && private->owner_task != current) {
		/*
		 * Not allocated by us, so don't use it: its cached state is
		 * used by the task that allocated it, and we want neither to
		 * mess with it nor to get incorrect results, because it
		 * reflects a state that is not valid for the current task.
		 */
		private = NULL;
	} else if (!private) {
		private = kzalloc(sizeof(*private), GFP_KERNEL);
		/*
		 * No worries if the memory allocation failed.
		 * The private structure is used only for speeding up multiple
		 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
		 * so everything will still be correct.
		 */
		if (private) {
			bool free = false;

			private->owner_task = current;

			spin_lock(&inode->lock);
			if (file->private_data)
				free = true;
			else
				file->private_data = private;
			spin_unlock(&inode->lock);

			if (free) {
				kfree(private);
				private = NULL;
			}
		}
	}

	if (private)
		delalloc_cached_state = &private->llseek_cached_state;
	else
		delalloc_cached_state = NULL;

	/*
	 * The offset can be negative; in this case we start finding DATA/HOLE
	 * from the very start of the file.
	 */
	start = max_t(loff_t, 0, offset);

	lockstart = round_down(start, fs_info->sectorsize);
	lockend = round_up(i_size, fs_info->sectorsize);
	if (lockend <= lockstart)
		lockend = lockstart + fs_info->sectorsize;
	lockend--;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	last_extent_end = lockstart;

	btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret > 0 && path->slots[0] > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}
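	/*
	 * The loop below walks the inode's file extent items in key order.
	 * Gaps between items (implicit holes, possible with the NO_HOLES
	 * feature), explicit holes (regular extents with a zero disk_bytenr)
	 * and prealloc extents are all treated as hole regions and passed to
	 * find_desired_extent_in_hole(), so that delalloc in those regions
	 * can still satisfy a SEEK_DATA request. Regular and inline extents
	 * end a SEEK_DATA search immediately, while a SEEK_HOLE search moves
	 * on to the next item.
	 */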
	while (start < i_size) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_file_extent_item *extent;
		u64 extent_end;
		u8 type;

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;

			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		extent_end = btrfs_file_extent_end(path);

		/*
		 * In the first iteration we may have a slot that points to an
		 * extent that ends before our start offset, so skip it.
		 */
		if (extent_end <= start) {
			path->slots[0]++;
			continue;
		}

		/* We have an implicit hole, the NO_HOLES feature is likely set. */
		if (last_extent_end < key.offset) {
			u64 search_start = last_extent_end;
			u64 found_start;

			/*
			 * First iteration, @start matches @offset and it's
			 * within the hole.
			 */
			if (start == offset)
				search_start = offset;

			found = find_desired_extent_in_hole(inode, whence,
							    delalloc_cached_state,
							    search_start,
							    key.offset - 1,
							    &found_start);
			if (found) {
				start = found_start;
				break;
			}
			/*
			 * Didn't find data or a hole (due to delalloc) in the
			 * implicit hole range, so we need to analyze the
			 * extent item.
			 */
		}

		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(leaf, extent);

		/*
		 * Can't access the extent's disk_bytenr field if this is an
		 * inline extent, since at that offset, it's where the extent
		 * data starts.
		 */
		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
		    (type == BTRFS_FILE_EXTENT_REG &&
		     btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
			/*
			 * Explicit hole or prealloc extent, search for delalloc.
			 * A prealloc extent is treated like a hole.
			 */
			u64 search_start = key.offset;
			u64 found_start;

			/*
			 * First iteration, @start matches @offset and it's
			 * within the hole.
			 */
			if (start == offset)
				search_start = offset;

			found = find_desired_extent_in_hole(inode, whence,
							    delalloc_cached_state,
							    search_start,
							    extent_end - 1,
							    &found_start);
			if (found) {
				start = found_start;
				break;
			}
			/*
			 * Didn't find data or a hole (due to delalloc) in the
			 * hole (or prealloc) range, so we need to analyze the
			 * next extent item.
			 */
		} else {
			/*
			 * Found a regular or inline extent.
			 * If we are seeking for data, adjust the start offset
			 * and stop, we're done.
			 */
			if (whence == SEEK_DATA) {
				start = max_t(u64, key.offset, offset);
				found = true;
				break;
			}
			/*
			 * Else, we are seeking for a hole, check the next file
			 * extent item.
			 */
		}

		start = extent_end;
		last_extent_end = extent_end;
		path->slots[0]++;
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		cond_resched();
	}

	/* We have an implicit hole from the last extent found up to i_size. */
	if (!found && start < i_size) {
		found = find_desired_extent_in_hole(inode, whence,
						    delalloc_cached_state, start,
						    i_size - 1, &start);
		if (!found)
			start = i_size;
	}

out:
	btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
	btrfs_free_path(path);

	if (ret < 0)
		return ret;

	if (whence == SEEK_DATA && start >= i_size)
		return -ENXIO;

	return min_t(loff_t, start, i_size);
}

static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;

	switch (whence) {
	default:
		return generic_file_llseek(file, offset, whence);
	case SEEK_DATA:
	case SEEK_HOLE:
		btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
		offset = find_desired_extent(file, offset, whence);
		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
		break;
	}

	if (offset < 0)
		return offset;

	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
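/*
 * A minimal userspace sketch of the SEEK_DATA/SEEK_HOLE semantics implemented
 * by btrfs_file_llseek(): lseek() returns the first offset with data (or with
 * a hole) at or after the given offset, and SEEK_DATA fails with ENXIO when
 * there is no data at or after it. The file path is made up and the snippet
 * is guarded out because it is not kernel code.
 */
#if 0
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/sparse-file", O_RDONLY);
	off_t data, hole;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* First data at or after offset 0 (delalloc counts as data). */
	data = lseek(fd, 0, SEEK_DATA);
	if (data < 0)
		perror("SEEK_DATA");	/* ENXIO: empty file or no data at all */
	else
		printf("first data at %lld\n", (long long)data);

	/* First hole at or after offset 0 (i_size acts as an implicit hole). */
	hole = lseek(fd, 0, SEEK_HOLE);
	if (hole >= 0)
		printf("first hole at %lld\n", (long long)hole);

	close(fd);
	return 0;
}
#endif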
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
	int ret;

	filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;

	ret = fsverity_file_open(inode, filp);
	if (ret)
		return ret;
	return generic_file_open(inode, filp);
}

static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret = 0;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = btrfs_direct_read(iocb, to);
		if (ret < 0 || !iov_iter_count(to) ||
		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
			return ret;
	}

	return filemap_read(iocb, to, ret);
}

const struct file_operations btrfs_file_operations = {
	.llseek		= btrfs_file_llseek,
	.read_iter	= btrfs_file_read_iter,
	.splice_read	= filemap_splice_read,
	.write_iter	= btrfs_file_write_iter,
	.splice_write	= iter_file_splice_write,
	.mmap_prepare	= btrfs_file_mmap_prepare,
	.open		= btrfs_file_open,
	.release	= btrfs_release_file,
	.get_unmapped_area = thp_get_unmapped_area,
	.fsync		= btrfs_sync_file,
	.fallocate	= btrfs_fallocate,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.remap_file_range = btrfs_remap_file_range,
	.uring_cmd	= btrfs_uring_cmd,
	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};

int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	int ret;

	/*
	 * With compression we will find and lock a dirty page, clear the first
	 * one as dirty, set up an async extent, and immediately return with
	 * the entire range locked but with none of the pages actually marked
	 * as writeback. So we can't just filemap_write_and_wait_range() and
	 * expect it to work, since that will just kick off a thread to do the
	 * actual work. Instead we need to call filemap_fdatawrite_range()
	 * _again_, since it will wait on the page lock, which won't be
	 * unlocked until after the pages have been marked as writeback, and
	 * from there we're good to go. We have to do this, otherwise we'll
	 * miss the ordered extents and that results in badness. Please Josef,
	 * do not think you know better and pull this out at some point in the
	 * future, it is right and you are wrong.
	 */
	ret = filemap_fdatawrite_range(mapping, start, end);
	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
		ret = filemap_fdatawrite_range(mapping, start, end);

	return ret;
}
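/*
 * A minimal usage sketch for btrfs_fdatawrite_range(): a hypothetical helper
 * (name made up, not part of btrfs) that starts writeback for a range and
 * then waits for the pages to finish writeback. Real btrfs callers typically
 * wait on the ordered extents instead of (or in addition to) the plain page
 * writeback wait used here.
 */
static int __maybe_unused btrfs_flush_range_sketch(struct btrfs_inode *inode,
						   loff_t start, loff_t end)
{
	int ret;

	/* Kicks writeback off twice if async (compressed) extents are pending. */
	ret = btrfs_fdatawrite_range(inode, start, end);
	if (ret)
		return ret;

	/* Wait for page writeback in the range to complete. */
	return filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
}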