// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include "ctree.h"
#include "direct-io.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "tree-log.h"
#include "locking.h"
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
#include "reflink.h"
#include "subpage.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "file-item.h"
#include "ioctl.h"
#include "file.h"
#include "super.h"
#include "print-tree.h"

/*
 * Unlock folio after btrfs_file_write() is done with it.
 */
static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
			     u64 pos, u64 copied)
{
	u64 block_start = round_down(pos, fs_info->sectorsize);
	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;

	ASSERT(block_len <= U32_MAX);
	/*
	 * Folio checked is some magic around finding folios that have been
	 * modified without going through btrfs_dirty_folio(). Clear it here.
	 * There should be no need to mark the pages accessed, as
	 * prepare_one_folio() should have marked them accessed via
	 * find_or_create_page().
	 */
	btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
	folio_unlock(folio);
	folio_put(folio);
}

/*
 * After copy_folio_from_iter_atomic(), update the following things for delalloc:
 * - Mark newly dirtied folio as DELALLOC in the io tree.
 *   Used to advise which range is to be written back.
 * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
 * - Update inode size for past EOF write
 */
int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
		      size_t write_bytes, struct extent_state **cached, bool noreserve)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret = 0;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(&inode->vfs_inode);
	unsigned int extra_bits = 0;

	if (write_bytes == 0)
		return 0;

	if (noreserve)
		extra_bits |= EXTENT_NORESERVE;

	start_pos = round_down(pos, fs_info->sectorsize);
	num_bytes = round_up(write_bytes + pos - start_pos,
			     fs_info->sectorsize);
	ASSERT(num_bytes <= U32_MAX);
	ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes);

	end_of_last_block = start_pos + num_bytes - 1;

	/*
	 * The pages may have already been dirty, clear out old accounting so
	 * we can set things up properly
	 */
	btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			       cached);

	ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					extra_bits, cached);
	if (ret)
		return ret;

	btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
	btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
	btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);

	/*
	 * we've only changed i_size in ram, and we haven't updated
	 * the disk i_size. There is no need to log the inode
	 * at this time.
	 */
	if (end_pos > isize)
		i_size_write(&inode->vfs_inode, end_pos);
	return 0;
}

/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end. hint_block is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split. Anything entirely inside the range
 * is deleted from the tree.
 *
 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 * to deal with that. We set the field 'bytes_found' of the arguments structure
 * with the number of allocated bytes found in the target range, so that the
 * caller can update the inode's number of bytes in an atomic way when
 * replacing extents in a range to avoid races with stat(2).
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_inode *inode,
		       struct btrfs_drop_extents_args *args)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(inode);
	u64 search_start = args->start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	u64 last_end = args->start;
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;
	int modify_tree = -1;
	int update_refs;
	int found = 0;
	struct btrfs_path *path = args->path;

	args->bytes_found = 0;
	args->extent_inserted = false;

	/* Must always have a path if ->replace_extent is true */
	ASSERT(!(args->replace_extent && !args->path));

	if (!path) {
		path = btrfs_alloc_path();
		if (!path) {
			ret = -ENOMEM;
			goto out;
		}
	}

	if (args->drop_cache)
		btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);

	if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
		modify_tree = 0;

	update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       search_start, modify_tree);
		if (ret < 0)
			break;
		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

		if (key.objectid > ino)
			break;
		if (WARN_ON_ONCE(key.objectid < ino) ||
		    key.type < BTRFS_EXTENT_DATA_KEY) {
			ASSERT(del_nr == 0);
			path->slots[0]++;
			goto next_slot;
		}
		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_ram_bytes(leaf, fi);
		} else {
			/* can't happen */
			BUG();
		}

		/*
		 * Don't skip extent items representing 0 byte lengths. They
		 * used to be created (due to a bug) if we hit an -ENOSPC
		 * condition while punching holes. So if we find one here, just
		 * ensure we delete it, otherwise we would insert a new file
		 * extent item with the same key (offset) as that 0 bytes length
		 * file extent item in the call to setup_items_for_insert() later
		 * in this function.
		 */
		if (extent_end == key.offset && extent_end >= search_start) {
			last_end = extent_end;
			goto delete_extent_item;
		}

		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		found = 1;
		search_start = max(key.offset, args->start);
		if (recow || !modify_tree) {
			modify_tree = -1;
			btrfs_release_path(path);
			continue;
		}

		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 */
		if (args->start > key.offset && args->end < extent_end) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(path);
				continue;
			}
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += args->start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->start);

			if (update_refs && disk_bytenr > 0) {
				struct btrfs_ref ref = {
					.action = BTRFS_ADD_DELAYED_REF,
					.bytenr = disk_bytenr,
					.num_bytes = num_bytes,
					.parent = 0,
					.owning_root = btrfs_root_id(root),
					.ref_root = btrfs_root_id(root),
				};
				btrfs_init_data_ref(&ref, new_key.objectid,
						    args->start - extent_offset,
						    0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (unlikely(ret)) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
			}
			key.offset = args->start;
		}
		/*
		 * From here on out we will have actually dropped something, so
		 * last_end can be updated.
		 */
		last_end = extent_end;

		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 */
		if (args->start <= key.offset && args->end < extent_end) {
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->end;
			btrfs_set_item_key_safe(trans, path, &new_key);

			extent_offset += args->end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->end);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += args->end - key.offset;
			break;
		}

		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 */
		if (args->start > key.offset && args->end >= extent_end) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += extent_end - args->start;
			if (args->end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 */
		if (args->start <= key.offset && args->end >= extent_end) {
delete_extent_item:
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				if (WARN_ON(del_slot + del_nr != path->slots[0])) {
					btrfs_print_leaf(leaf);
					ret = -EINVAL;
					break;
				}
				del_nr++;
			}

			if (update_refs &&
			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
				args->bytes_found += extent_end - key.offset;
				extent_end = ALIGN(extent_end,
						   fs_info->sectorsize);
			} else if (update_refs && disk_bytenr > 0) {
				struct btrfs_ref ref = {
					.action = BTRFS_DROP_DELAYED_REF,
					.bytenr = disk_bytenr,
					.num_bytes = num_bytes,
					.parent = 0,
					.owning_root = btrfs_root_id(root),
					.ref_root = btrfs_root_id(root),
				};
				btrfs_init_data_ref(&ref, key.objectid,
						    key.offset - extent_offset,
						    0, false);
				ret = btrfs_free_extent(trans, &ref);
				if (unlikely(ret)) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
				args->bytes_found += extent_end - key.offset;
			}

			if (args->end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
			if (unlikely(ret)) {
				btrfs_abort_transaction(trans, ret);
				break;
			}

			del_nr = 0;
			del_slot = 0;

			btrfs_release_path(path);
			continue;
		}

		BUG();
	}

	if (!ret && del_nr > 0) {
		/*
		 * Set path->slots[0] to first slot, so that after the delete
		 * if items are moved off from our leaf to its immediate left or
		 * right neighbor leaves, we end up with a correct and adjusted
		 * path->slots[0] for our insertion (if args->replace_extent).
		 */
		path->slots[0] = del_slot;
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	leaf = path->nodes[0];
	/*
	 * If btrfs_del_items() was called, it might have deleted a leaf, in
	 * which case it unlocked our path, so check path->locks[0] matches a
	 * write lock.
	 */
	if (!ret && args->replace_extent &&
	    path->locks[0] == BTRFS_WRITE_LOCK &&
	    btrfs_leaf_free_space(leaf) >=
	    sizeof(struct btrfs_item) + args->extent_item_size) {

		key.objectid = ino;
		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = args->start;
		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
			struct btrfs_key slot_key;

			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
				path->slots[0]++;
		}
		btrfs_setup_item_for_insert(trans, root, path, &key,
					    args->extent_item_size);
		args->extent_inserted = true;
	}

	if (!args->path)
		btrfs_free_path(path);
	else if (!args->extent_inserted)
		btrfs_release_path(path);
out:
	args->drop_end = found ? min(args->end, last_end) : args->end;

	return ret;
}

static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid,
			     u64 bytenr, u64 orig_offset, u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return false;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return false;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return false;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return false;

	*start = key.offset;
	*end = extent_end;
	return true;
}

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	recow = 0;
	split = start;
	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if (unlikely(key.offset > start || extent_end < end)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			goto out;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			path->slots[0]++;
			new_key.offset = start;
			btrfs_set_item_key_safe(trans, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			goto out;
		}
	}

	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(path);
			goto again;
		}
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						split - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = 0;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_root_id(root);
		btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (split == start) {
			key.offset = start;
		} else {
			if (unlikely(start != key.offset)) {
				ret = -EINVAL;
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			path->slots[0]--;
			extent_end = end;
		}
		recow = 1;
	}

	other_start = end;
	other_end = 0;

	ref.action = BTRFS_DROP_DELAYED_REF;
	ref.bytenr = bytenr;
	ref.num_bytes = num_bytes;
	ref.parent = 0;
	ref.owning_root = btrfs_root_id(root);
	ref.ref_root = btrfs_root_id(root);
	btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
	other_start = 0;
	other_end = start;
	if (extent_mergeable(leaf, path->slots[0] - 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
	if (del_nr == 0) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	} else {
		fi = btrfs_item_ptr(leaf, del_slot - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
out:
	return ret;
}

/*
 * On error return an unlocked folio and the error value
 * On success return a locked folio and 0
 */
static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
				  u64 len)
{
	u64 clamp_start = max_t(u64, pos, folio_pos(folio));
	u64 clamp_end = min_t(u64, pos + len, folio_end(folio));
	const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
	int ret = 0;

	if (folio_test_uptodate(folio))
		return 0;

	if (IS_ALIGNED(clamp_start, blocksize) &&
	    IS_ALIGNED(clamp_end, blocksize))
		return 0;

	ret = btrfs_read_folio(NULL, folio);
	if (ret)
		return ret;
	folio_lock(folio);
	if (unlikely(!folio_test_uptodate(folio))) {
		folio_unlock(folio);
		return -EIO;
	}

	/*
	 * Since btrfs_read_folio() will unlock the folio before it returns,
	 * there is a window where btrfs_release_folio() can be called to
	 * release the page. Here we check both inode mapping and page
	 * private to make sure the page was not released.
	 *
	 * The private flag check is essential for subpage as we need to store
	 * extra bitmap using folio private.
	 */
	if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
		folio_unlock(folio);
		return -EAGAIN;
	}
	return 0;
}

static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
{
	gfp_t gfp;

	gfp = btrfs_alloc_write_mask(inode->i_mapping);
	if (nowait) {
		gfp &= ~__GFP_DIRECT_RECLAIM;
		gfp |= GFP_NOWAIT;
	}

	return gfp;
}

/*
 * Get folio into the page cache and lock it.
 */
static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
				      loff_t pos, size_t write_bytes,
				      bool nowait)
{
	const pgoff_t index = pos >> PAGE_SHIFT;
	gfp_t mask = get_prepare_gfp_flags(inode, nowait);
	fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) |
			  fgf_set_order(write_bytes);
	struct folio *folio;
	int ret = 0;

again:
	folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	ret = set_folio_extent_mapped(folio);
	if (ret < 0) {
		folio_unlock(folio);
		folio_put(folio);
		return ret;
	}
	ret = prepare_uptodate_folio(inode, folio, pos, write_bytes);
	if (ret) {
		/* The folio is already unlocked. */
		folio_put(folio);
		if (!nowait && ret == -EAGAIN) {
			ret = 0;
			goto again;
		}
		return ret;
	}
	*folio_ret = folio;
	return 0;
}

/*
 * Locks the extent and properly waits for data=ordered extents to finish
 * before allowing the folios to be modified if needed.
 *
 * Return:
 * 1 - the extent is locked
 * 0 - the extent is not locked, and everything is OK
 * -EAGAIN - need to prepare the folios again
 */
static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
				loff_t pos, size_t write_bytes,
				u64 *lockstart, u64 *lockend, bool nowait,
				struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 start_pos;
	u64 last_pos;
	int ret = 0;

	start_pos = round_down(pos, fs_info->sectorsize);
	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;

	if (start_pos < inode->vfs_inode.i_size) {
		struct btrfs_ordered_extent *ordered;

		if (nowait) {
			if (!btrfs_try_lock_extent(&inode->io_tree, start_pos,
						   last_pos, cached_state)) {
				folio_unlock(folio);
				folio_put(folio);
				return -EAGAIN;
			}
		} else {
			btrfs_lock_extent(&inode->io_tree, start_pos, last_pos,
					  cached_state);
		}

		ordered = btrfs_lookup_ordered_range(inode, start_pos,
						     last_pos - start_pos + 1);
		if (ordered &&
		    ordered->file_offset + ordered->num_bytes > start_pos &&
		    ordered->file_offset <= last_pos) {
			btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos,
					    cached_state);
			folio_unlock(folio);
			folio_put(folio);
			btrfs_start_ordered_extent(ordered);
			btrfs_put_ordered_extent(ordered);
			return -EAGAIN;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		*lockstart = start_pos;
		*lockend = last_pos;
		ret = 1;
	}

	/*
	 * We should be called after prepare_one_folio() which should have locked
	 * all pages in the range.
	 */
	WARN_ON(!folio_test_locked(folio));

	return ret;
}

/*
 * Check if we can do a nocow write into the range [@pos, @pos + @write_bytes)
 *
 * @pos:         File offset.
 * @write_bytes: The length to write, will be updated to the nocow writeable
 *               range.
 * @nowait:      Indicate if we can block or not (non-blocking IO context).
 *
 * This function will flush ordered extents in the range to ensure proper
 * nocow checks.
 *
 * Return:
 * > 0      If we can nocow, and updates @write_bytes.
 * 0        If we can't do a nocow write.
 * -EAGAIN  If we can't do a nocow write because snapshotting of the inode's
 *          root is in progress or because we are in a non-blocking IO
 *          context and need to block (@nowait is true).
 * < 0      If an error happened.
 *
 * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
 */
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
			   size_t *write_bytes, bool nowait)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *root = inode->root;
	struct extent_state *cached_state = NULL;
	u64 lockstart, lockend;
	u64 cur_offset;
	int ret = 0;

	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
		return 0;

	if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
		return -EAGAIN;

	lockstart = round_down(pos, fs_info->sectorsize);
	lockend = round_up(pos + *write_bytes,
			   fs_info->sectorsize) - 1;

	if (nowait) {
		if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
						  &cached_state)) {
			btrfs_drew_write_unlock(&root->snapshot_lock);
			return -EAGAIN;
		}
	} else {
		btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
						   &cached_state);
	}

	cur_offset = lockstart;
	while (cur_offset < lockend) {
		u64 num_bytes = lockend - cur_offset + 1;

		ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait);
		if (ret <= 0) {
			/*
			 * If cur_offset == lockstart it means we haven't found
			 * any extent against which we can NOCOW, so unlock the
			 * snapshot lock.
			 */
			if (cur_offset == lockstart)
				btrfs_drew_write_unlock(&root->snapshot_lock);
			break;
		}
		cur_offset += num_bytes;
	}

	btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);

	/*
	 * cur_offset > lockstart means there's at least a partial range we can
	 * NOCOW, and that range can cover one or more extents.
	 */
	if (cur_offset > lockstart) {
		*write_bytes = min_t(size_t, *write_bytes, cur_offset - pos);
		return 1;
	}

	return ret;
}

void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
{
	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}

int btrfs_write_check(struct kiocb *iocb, size_t count)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	loff_t pos = iocb->ki_pos;
	int ret;
	loff_t oldsize;

	/*
	 * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
	 * prealloc flags, as without those flags we always have to COW. We will
	 * later check if we can really COW into the target range (using
	 * can_nocow_extent() at btrfs_get_blocks_direct_write()).
	 */
	if ((iocb->ki_flags & IOCB_NOWAIT) &&
	    !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
		return -EAGAIN;

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	/*
	 * We reserve space for updating the inode when we reserve space for the
	 * extent we are going to write, so we will enospc out there. We don't
	 * need to start yet another transaction to update the inode as we will
	 * update the inode when we finish writing whatever data we write.
	 */
	if (!IS_NOCMTIME(inode)) {
		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
		inode_inc_iversion(inode);
	}

	oldsize = i_size_read(inode);
	if (pos > oldsize) {
		/* Expand hole size to cover write data, preventing empty gap */
		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);

		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
		if (ret)
			return ret;
	}

	return 0;
}

static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved,
			  u64 start, u64 len, bool only_release_metadata)
{
	if (len == 0)
		return;

	if (only_release_metadata) {
		btrfs_check_nocow_unlock(inode);
		btrfs_delalloc_release_metadata(inode, len, true);
	} else {
		const struct btrfs_fs_info *fs_info = inode->root->fs_info;

		btrfs_delalloc_release_space(inode, data_reserved,
					     round_down(start, fs_info->sectorsize),
					     len, true);
	}
}

/*
 * Reserve data and metadata space for this buffered write range.
 *
 * Return >0 for the number of bytes reserved, which is always block aligned.
 * Return <0 for error.
 */
static ssize_t reserve_space(struct btrfs_inode *inode,
			     struct extent_changeset **data_reserved,
			     u64 start, size_t *len, bool nowait,
			     bool *only_release_metadata)
{
	const struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const unsigned int block_offset = (start & (fs_info->sectorsize - 1));
	size_t reserve_bytes;
	int ret;

	ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait);
	if (ret < 0) {
		int can_nocow;

		if (nowait && (ret == -ENOSPC || ret == -EAGAIN))
			return -EAGAIN;

		/*
		 * If we don't have to COW at the offset, reserve metadata only.
		 * write_bytes may get smaller than requested here.
		 */
		can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait);
		if (can_nocow < 0)
			ret = can_nocow;
		if (can_nocow > 0)
			ret = 0;
		if (ret)
			return ret;
		*only_release_metadata = true;
	}

	reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize);
	WARN_ON(reserve_bytes == 0);
	ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes,
					      reserve_bytes, nowait);
	if (ret) {
		if (!*only_release_metadata)
			btrfs_free_reserved_data_space(inode, *data_reserved,
						       start, *len);
		else
			btrfs_check_nocow_unlock(inode);

		if (nowait && ret == -ENOSPC)
			ret = -EAGAIN;
		return ret;
	}
	return reserve_bytes;
}

/* Shrink the reserved data and metadata space from @reserved_len to @new_len. */
static void shrink_reserved_space(struct btrfs_inode *inode,
				  struct extent_changeset *data_reserved,
				  u64 reserved_start, u64 reserved_len,
				  u64 new_len, bool only_release_metadata)
{
	const u64 diff = reserved_len - new_len;

	ASSERT(new_len <= reserved_len);
	btrfs_delalloc_shrink_extents(inode, reserved_len, new_len);
	if (only_release_metadata)
		btrfs_delalloc_release_metadata(inode, diff, true);
	else
		btrfs_delalloc_release_space(inode, data_reserved,
					     reserved_start + new_len, diff, true);
}

/* Calculate the maximum amount of bytes we can write into one folio. */
static size_t calc_write_bytes(const struct btrfs_inode *inode,
			       const struct iov_iter *iter, u64 start)
{
	const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping);

	return min(max_folio_size - (start & (max_folio_size - 1)),
		   iov_iter_count(iter));
}

/*
 * Do the heavy-lifting work to copy one range into one folio of the page cache.
 *
 * Return > 0 in case we copied all bytes or just some of them.
 * Return 0 if no bytes were copied, in which case the caller should retry.
 * Return <0 on error.
 */
static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
			  struct extent_changeset **data_reserved, u64 start,
			  bool nowait)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_state *cached_state = NULL;
	size_t write_bytes = calc_write_bytes(inode, iter, start);
	size_t copied;
	const u64 reserved_start = round_down(start, fs_info->sectorsize);
	u64 reserved_len;
	struct folio *folio = NULL;
	int extents_locked;
	u64 lockstart;
	u64 lockend;
	bool only_release_metadata = false;
	const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
	int ret;

	/*
	 * Fault all pages before locking them in prepare_one_folio() to avoid
	 * recursive lock.
	 */
	if (unlikely(fault_in_iov_iter_readable(iter, write_bytes)))
		return -EFAULT;
	extent_changeset_release(*data_reserved);
	ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait,
			    &only_release_metadata);
	if (ret < 0)
		return ret;
	reserved_len = ret;
	/* Write range must be inside the reserved range. */
	ASSERT(reserved_start <= start);
	ASSERT(start + write_bytes <= reserved_start + reserved_len);

again:
	ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
						    bdp_flags);
	if (ret) {
		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}

	ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false);
	if (ret) {
		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}

	/*
	 * If the reserved range goes beyond the current folio, shrink the
	 * reserved space to the folio boundary.
	 */
	if (reserved_start + reserved_len > folio_end(folio)) {
		const u64 last_block = folio_end(folio);

		shrink_reserved_space(inode, *data_reserved, reserved_start,
				      reserved_len, last_block - reserved_start,
				      only_release_metadata);
		write_bytes = last_block - start;
		reserved_len = last_block - reserved_start;
	}

	extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start,
							 write_bytes, &lockstart,
							 &lockend, nowait,
							 &cached_state);
	if (extents_locked < 0) {
		if (!nowait && extents_locked == -EAGAIN)
			goto again;

		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		ret = extents_locked;
		return ret;
	}

	copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start),
					     write_bytes, iter);
	flush_dcache_folio(folio);

	if (unlikely(copied < write_bytes)) {
		u64 last_block;

		/*
		 * The original write range doesn't need an uptodate folio as
		 * the range is block aligned. But now a short copy happened.
		 * We cannot handle it without an uptodate folio.
		 *
		 * So just revert the range and we will retry.
		 */
		if (!folio_test_uptodate(folio)) {
			iov_iter_revert(iter, copied);
			copied = 0;
		}

		/* No copied bytes, unlock, release reserved space and exit. */
		if (copied == 0) {
			if (extents_locked)
				btrfs_unlock_extent(&inode->io_tree, lockstart, lockend,
						    &cached_state);
			else
				btrfs_free_extent_state(cached_state);
			btrfs_delalloc_release_extents(inode, reserved_len);
			release_space(inode, *data_reserved, reserved_start, reserved_len,
				      only_release_metadata);
			btrfs_drop_folio(fs_info, folio, start, copied);
			return 0;
		}

		/* Release the reserved space beyond the last block. */
		last_block = round_up(start + copied, fs_info->sectorsize);

		shrink_reserved_space(inode, *data_reserved, reserved_start,
				      reserved_len, last_block - reserved_start,
				      only_release_metadata);
		reserved_len = last_block - reserved_start;
	}

	ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state,
				only_release_metadata);
	/*
	 * If we have not locked the extent range, because the range's start
	 * offset is >= i_size, we might still have a non-NULL cached extent
	 * state, acquired while marking the extent range as delalloc through
	 * btrfs_dirty_folio(). Therefore free any possible cached extent state
	 * to avoid a memory leak.
	 */
	if (extents_locked)
		btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
	else
		btrfs_free_extent_state(cached_state);

	btrfs_delalloc_release_extents(inode, reserved_len);
	if (ret) {
		btrfs_drop_folio(fs_info, folio, start, copied);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}
	if (only_release_metadata)
		btrfs_check_nocow_unlock(inode);

	btrfs_drop_folio(fs_info, folio, start, copied);
	return copied;
}

ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	loff_t pos;
	struct inode *inode = file_inode(file);
	struct extent_changeset *data_reserved = NULL;
	size_t num_written = 0;
	ssize_t ret;
	loff_t old_isize;
	unsigned int ilock_flags = 0;
	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);

	if (nowait)
		ilock_flags |= BTRFS_ILOCK_TRY;

	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
	if (ret < 0)
		return ret;

	/*
	 * We can only trust the isize with inode lock held, or it can race with
	 * other buffered writes and cause incorrect call of
	 * pagecache_isize_extended() to overwrite existing data.
	 */
	old_isize = i_size_read(inode);

	ret = generic_write_checks(iocb, iter);
	if (ret <= 0)
		goto out;

	ret = btrfs_write_check(iocb, ret);
	if (ret < 0)
		goto out;

	pos = iocb->ki_pos;
	while (iov_iter_count(iter) > 0) {
		ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait);
		if (ret < 0)
			break;
		pos += ret;
		num_written += ret;
		cond_resched();
	}

	extent_changeset_free(data_reserved);
	if (num_written > 0) {
		pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
		iocb->ki_pos += num_written;
	}
out:
	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
	return num_written ? num_written : ret;
}

static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
				   const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	loff_t count;
	ssize_t ret;

	btrfs_inode_lock(BTRFS_I(inode), 0);
	count = encoded->len;
	ret = generic_write_checks_count(iocb, &count);
	if (ret == 0 && count != encoded->len) {
		/*
		 * The write got truncated by generic_write_checks_count(). We
		 * can't do a partial encoded write.
		 */
		ret = -EFBIG;
	}
	if (ret || encoded->len == 0)
		goto out;

	ret = btrfs_write_check(iocb, encoded->len);
	if (ret < 0)
		goto out;

	ret = btrfs_do_encoded_write(iocb, from, encoded);
out:
	btrfs_inode_unlock(BTRFS_I(inode), 0);
	return ret;
}

ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
			    const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct file *file = iocb->ki_filp;
	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
	ssize_t num_written, num_sync;

	/*
	 * If the fs flips readonly due to some impossible error, although we
	 * have opened a file as writable, we have to stop this write operation
	 * to ensure consistency.
	 */
	if (BTRFS_FS_ERROR(inode->root->fs_info))
		return -EROFS;

	if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
		return -EOPNOTSUPP;

	if (encoded) {
		num_written = btrfs_encoded_write(iocb, from, encoded);
		num_sync = encoded->len;
	} else if (iocb->ki_flags & IOCB_DIRECT) {
		num_written = btrfs_direct_write(iocb, from);
		num_sync = num_written;
	} else {
		num_written = btrfs_buffered_write(iocb, from);
		num_sync = num_written;
	}

	btrfs_set_inode_last_sub_trans(inode);

	if (num_sync > 0) {
		num_sync = generic_write_sync(iocb, num_sync);
		if (num_sync < 0)
			num_written = num_sync;
	}

	return num_written;
}

static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	return btrfs_do_write_iter(iocb, from, NULL);
}

int btrfs_release_file(struct inode *inode, struct file *filp)
{
	struct btrfs_file_private *private = filp->private_data;

	if (private) {
		kfree(private->filldir_buf);
		btrfs_free_extent_state(private->llseek_cached_state);
		kfree(private);
		filp->private_data = NULL;
	}

	/*
	 * Set by setattr when we are about to truncate a file from a non-zero
	 * size to a zero size. This tries to flush down new bytes that may
	 * have been written if the application were using truncate to replace
	 * a file in place.
	 */
	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
			       &BTRFS_I(inode)->runtime_flags))
		filemap_flush(inode->i_mapping);
	return 0;
}

static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
{
	int ret;
	struct blk_plug plug;

	/*
	 * This is only called in fsync, which would do synchronous writes, so
	 * a plug can merge adjacent IOs as much as possible. Esp. in case of
	 * multiple disks using raid profile, a large IO can be split to
	 * several segments of stripe length (currently 64K).
	 */
	blk_start_plug(&plug);
	ret = btrfs_fdatawrite_range(inode, start, end);
	blk_finish_plug(&plug);

	return ret;
}

static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
{
	struct btrfs_inode *inode = ctx->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
	    list_empty(&ctx->ordered_extents))
		return true;

	/*
	 * If we are doing a fast fsync we can not bail out if the inode's
	 * last_trans is <= the last committed transaction, because we only
	 * update the last_trans of the inode during ordered extent completion,
	 * and for a fast fsync we don't wait for that, we only wait for the
	 * writeback to complete.
	 */
	if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
	     list_empty(&ctx->ordered_extents)))
		return true;

	return false;
}

/*
 * fsync call for both files and directories. This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates in
 * the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit. This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file_dentry(file);
	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_log_ctx ctx;
	int ret = 0, err;
	u64 len;
	bool full_sync;
	bool skip_ilock = false;

	if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
		skip_ilock = true;
		current->journal_info = NULL;
		btrfs_assert_inode_locked(inode);
	}

	trace_btrfs_sync_file(file, datasync);

	btrfs_init_log_ctx(&ctx, inode);

	/*
	 * Always set the range to a full range, otherwise we can get into
	 * several problems, from missing file extent items to represent holes
	 * when not using the NO_HOLES feature, to log tree corruption due to
	 * races between hole detection during logging and completion of ordered
	 * extents outside the range, to missing checksums due to ordered extents
	 * for which we flushed only a subset of their pages.
	 */
	start = 0;
	end = LLONG_MAX;
	len = (u64)LLONG_MAX + 1;

	/*
	 * We write the dirty pages in the range and wait until they complete
	 * out of the ->i_mutex. If so, we can flush the dirty pages by
	 * multiple tasks and improve performance. See
	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
	 */
	ret = start_ordered_ops(inode, start, end);
	if (ret)
		goto out;

	if (skip_ilock)
		down_write(&inode->i_mmap_lock);
	else
		btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);

	atomic_inc(&root->log_batch);

	/*
	 * Before we acquired the inode's lock and the mmap lock, someone may
	 * have dirtied more pages in the target range. We need to make sure
	 * that writeback for any such pages does not start while we are logging
	 * the inode, because if it does, any of the following might happen when
	 * we are not doing a full inode sync:
	 *
	 * 1) We log an extent after its writeback finishes but before its
	 *    checksums are added to the csum tree, leading to -EIO errors
	 *    when attempting to read the extent after a log replay.
	 *
	 * 2) We can end up logging an extent before its writeback finishes.
	 *    Therefore after the log replay we will have a file extent item
	 *    pointing to an unwritten extent (and no data checksums as well).
	 *
	 * So trigger writeback for any eventual new dirty pages and then we
	 * wait for all ordered extents to complete below.
	 */
	ret = start_ordered_ops(inode, start, end);
	if (ret) {
		if (skip_ilock)
			up_write(&inode->i_mmap_lock);
		else
			btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
		goto out;
	}

	/*
	 * Always check for the full sync flag while holding the inode's lock,
	 * to avoid races with other tasks. The flag must be either set all the
	 * time during logging or always off all the time while logging.
	 * We check the flag here after starting delalloc above, because when
	 * running delalloc the full sync flag may be set if we need to drop
	 * extra extent map ranges due to temporary memory allocation failures.
	 */
	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);

	/*
	 * We have to do this here to avoid the priority inversion of waiting on
	 * IO of a lower priority task while holding a transaction open.
	 *
	 * For a full fsync we wait for the ordered extents to complete while
	 * for a fast fsync we wait just for writeback to complete, and then
	 * attach the ordered extents to the transaction so that a transaction
	 * commit waits for their completion, to avoid data loss if we fsync,
	 * the current transaction commits before the ordered extents complete
	 * and a power failure happens right after that.
	 *
	 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
	 * logical address recorded in the ordered extent may change. We need
	 * to wait for the IO to stabilize the logical address.
	 */
	if (full_sync || btrfs_is_zoned(fs_info)) {
		ret = btrfs_wait_ordered_range(inode, start, len);
		clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
	} else {
		/*
		 * Get our ordered extents as soon as possible to avoid doing
		 * checksum lookups in the csum tree, and use instead the
		 * checksums attached to the ordered extents.
		 */
		btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
		ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
		if (ret)
			goto out_release_extents;

		/*
		 * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
		 * starting and waiting for writeback, because for buffered IO
		 * it may have been set during the end IO callback
		 * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
		 * case an error happened and we need to wait for ordered
		 * extents to complete so that any extent maps that point to
		 * unwritten locations are dropped and we don't log them.
		 */
		if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
			ret = btrfs_wait_ordered_range(inode, start, len);
	}

	if (ret)
		goto out_release_extents;

	atomic_inc(&root->log_batch);

	if (skip_inode_logging(&ctx)) {
		/*
		 * We've had everything committed since the last time we were
		 * modified so clear this flag in case it was set for whatever
		 * reason, it's no longer relevant.
		 */
		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
		/*
		 * An ordered extent might have started before and completed
		 * already with io errors, in which case the inode was not
		 * updated and we end up here. So check the inode's mapping
		 * for any errors that might have happened since we last
		 * called fsync.
		 */
		ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
		goto out_release_extents;
	}

	btrfs_init_log_ctx_scratch_eb(&ctx);

	/*
	 * We use start here because we will need to wait on the IO to complete
	 * in btrfs_sync_log, which could require joining a transaction (for
	 * example checking cross references in the nocow path). If we use join
	 * here we could get into a situation where we're waiting on IO to
	 * happen that is blocked on a transaction trying to commit. With start
	 * we inc the extwriter counter, so we wait for all extwriters to exit
	 * before we start blocking joiners. This comment is to keep somebody
	 * from thinking they are super smart and changing this to
	 * btrfs_join_transaction *cough*Josef*cough*.
	 */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_release_extents;
	}
	trans->in_fsync = true;

	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
	/*
	 * Scratch eb no longer needed, release before syncing log or commit
	 * transaction, to avoid holding unnecessary memory during such long
	 * operations.
	 */
	if (ctx.scratch_eb) {
		free_extent_buffer(ctx.scratch_eb);
		ctx.scratch_eb = NULL;
	}
	btrfs_release_log_ctx_extents(&ctx);
	if (ret < 0) {
		/* Fallthrough and commit/free transaction. */
		ret = BTRFS_LOG_FORCE_COMMIT;
	}

	/* we've logged all the items and now have a consistent
	 * version of the file in the log. It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
	if (skip_ilock)
		up_write(&inode->i_mmap_lock);
	else
		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);

	if (ret == BTRFS_NO_LOG_SYNC) {
		ret = btrfs_end_transaction(trans);
		goto out;
	}

	/* We successfully logged the inode, attempt to sync the log. */
	if (!ret) {
		ret = btrfs_sync_log(trans, root, &ctx);
		if (!ret) {
			ret = btrfs_end_transaction(trans);
			goto out;
		}
	}

	/*
	 * At this point we need to commit the transaction because we had
	 * btrfs_need_log_full_commit() or some other error.
	 *
	 * If we didn't do a full sync we have to stop the trans handle, wait on
	 * the ordered extents, start it again and commit the transaction. If
	 * we attempt to wait on the ordered extents here we could deadlock with
	 * something like fallocate() that is holding the extent lock trying to
	 * start a transaction while some other thread is trying to commit the
	 * transaction while we (fsync) are currently holding the transaction
	 * open.
	 */
	if (!full_sync) {
		ret = btrfs_end_transaction(trans);
		if (ret)
			goto out;
		ret = btrfs_wait_ordered_range(inode, start, len);
		if (ret)
			goto out;

		/*
		 * This is safe to use here because we're only interested in
		 * making sure the transaction that had the ordered extents is
		 * committed. We aren't waiting on anything past this point,
		 * we're purely getting the transaction and committing it.
		 */
		trans = btrfs_attach_transaction_barrier(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);

			/*
			 * We committed the transaction and there's no currently
			 * running transaction, this means everything we care
			 * about made it to disk and we are done.
			 */
			if (ret == -ENOENT)
				ret = 0;
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	free_extent_buffer(ctx.scratch_eb);
	ASSERT(list_empty(&ctx.list));
	ASSERT(list_empty(&ctx.conflict_inodes));
	err = file_check_and_advance_wb_err(file);
	if (!ret)
		ret = err;
	return ret > 0 ? -EIO : ret;
-EIO : ret; 1827 1828 out_release_extents: 1829 btrfs_release_log_ctx_extents(&ctx); 1830 if (skip_ilock) 1831 up_write(&inode->i_mmap_lock); 1832 else 1833 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 1834 goto out; 1835 } 1836 1837 /* 1838 * btrfs_page_mkwrite() is not allowed to change the file size as it gets 1839 * called from a page fault handler when a page is first dirtied. Hence we must 1840 * be careful to check for EOF conditions here. We set the page up correctly 1841 * for a written page which means we get ENOSPC checking when writing into 1842 * holes and correct delalloc and unwritten extent mapping on filesystems that 1843 * support these features. 1844 * 1845 * We are not allowed to take the i_mutex here so we have to play games to 1846 * protect against truncate races as the page could now be beyond EOF. Because 1847 * truncate_setsize() writes the inode size before removing pages, once we have 1848 * the page lock we can determine safely if the page is beyond EOF. If it is not 1849 * beyond EOF, then the page is guaranteed safe against truncation until we 1850 * unlock the page. 1851 */ 1852 static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) 1853 { 1854 struct page *page = vmf->page; 1855 struct folio *folio = page_folio(page); 1856 struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file)); 1857 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1858 struct extent_io_tree *io_tree = &inode->io_tree; 1859 struct btrfs_ordered_extent *ordered; 1860 struct extent_state *cached_state = NULL; 1861 struct extent_changeset *data_reserved = NULL; 1862 unsigned long zero_start; 1863 loff_t size; 1864 size_t fsize = folio_size(folio); 1865 int ret; 1866 bool only_release_metadata = false; 1867 u64 reserved_space; 1868 u64 page_start; 1869 u64 page_end; 1870 u64 end; 1871 1872 reserved_space = fsize; 1873 1874 sb_start_pagefault(inode->vfs_inode.i_sb); 1875 page_start = folio_pos(folio); 1876 page_end = page_start + folio_size(folio) - 1; 1877 end = page_end; 1878 1879 /* 1880 * Reserving delalloc space after obtaining the page lock can lead to 1881 * deadlock. For example, if a dirty page is locked by this function 1882 * and the call to btrfs_delalloc_reserve_space() ends up triggering 1883 * dirty page write out, then the btrfs_writepages() function could 1884 * end up waiting indefinitely to get a lock on the page currently 1885 * being processed by btrfs_page_mkwrite() function. 1886 */ 1887 ret = btrfs_check_data_free_space(inode, &data_reserved, page_start, 1888 reserved_space, false); 1889 if (ret < 0) { 1890 size_t write_bytes = reserved_space; 1891 1892 if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0) 1893 goto out_noreserve; 1894 1895 only_release_metadata = true; 1896 1897 /* 1898 * Can't write the whole range, there may be shared extents or 1899 * holes in the range, bail out with @only_release_metadata set 1900 * to true so that we unlock the nocow lock before returning the 1901 * error. 
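 * Note that btrfs_check_nocow_lock() may have trimmed @write_bytes down to
 * the portion of the range that can be written without COW, which is what
 * the check below detects.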
1902 */ 1903 if (write_bytes < reserved_space) 1904 goto out_noreserve; 1905 } 1906 ret = btrfs_delalloc_reserve_metadata(inode, reserved_space, 1907 reserved_space, false); 1908 if (ret < 0) { 1909 if (!only_release_metadata) 1910 btrfs_free_reserved_data_space(inode, data_reserved, 1911 page_start, reserved_space); 1912 goto out_noreserve; 1913 } 1914 1915 ret = file_update_time(vmf->vma->vm_file); 1916 if (ret < 0) 1917 goto out; 1918 again: 1919 down_read(&inode->i_mmap_lock); 1920 folio_lock(folio); 1921 size = i_size_read(&inode->vfs_inode); 1922 1923 if ((folio->mapping != inode->vfs_inode.i_mapping) || 1924 (page_start >= size)) { 1925 /* Page got truncated out from underneath us. */ 1926 goto out_unlock; 1927 } 1928 folio_wait_writeback(folio); 1929 1930 btrfs_lock_extent(io_tree, page_start, page_end, &cached_state); 1931 ret = set_folio_extent_mapped(folio); 1932 if (ret < 0) { 1933 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); 1934 goto out_unlock; 1935 } 1936 1937 /* 1938 * We can't set the delalloc bits if there are pending ordered 1939 * extents. Drop our locks and wait for them to finish. 1940 */ 1941 ordered = btrfs_lookup_ordered_range(inode, page_start, fsize); 1942 if (ordered) { 1943 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); 1944 folio_unlock(folio); 1945 up_read(&inode->i_mmap_lock); 1946 btrfs_start_ordered_extent(ordered); 1947 btrfs_put_ordered_extent(ordered); 1948 goto again; 1949 } 1950 1951 if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) { 1952 reserved_space = round_up(size - page_start, fs_info->sectorsize); 1953 if (reserved_space < fsize) { 1954 const u64 to_free = fsize - reserved_space; 1955 1956 end = page_start + reserved_space - 1; 1957 if (only_release_metadata) 1958 btrfs_delalloc_release_metadata(inode, to_free, true); 1959 else 1960 btrfs_delalloc_release_space(inode, data_reserved, 1961 end + 1, to_free, true); 1962 } 1963 } 1964 1965 /* 1966 * page_mkwrite gets called when the page is firstly dirtied after it's 1967 * faulted in, but write(2) could also dirty a page and set delalloc 1968 * bits, thus in this case for space account reason, we still need to 1969 * clear any delalloc bits within this page range since we have to 1970 * reserve data&meta space before lock_page() (see above comments). 1971 */ 1972 btrfs_clear_extent_bit(io_tree, page_start, end, 1973 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 1974 EXTENT_DEFRAG, &cached_state); 1975 1976 ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state); 1977 if (ret < 0) { 1978 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); 1979 goto out_unlock; 1980 } 1981 1982 /* Page is wholly or partially inside EOF. 
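 * Zero the part of the folio beyond EOF (if any) so we never expose stale
 * data past i_size.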
*/ 1983 if (page_start + folio_size(folio) > size) 1984 zero_start = offset_in_folio(folio, size); 1985 else 1986 zero_start = fsize; 1987 1988 if (zero_start != fsize) 1989 folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); 1990 1991 btrfs_folio_clear_checked(fs_info, folio, page_start, fsize); 1992 btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); 1993 btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); 1994 1995 btrfs_set_inode_last_sub_trans(inode); 1996 1997 if (only_release_metadata) 1998 btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE, 1999 &cached_state); 2000 2001 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); 2002 up_read(&inode->i_mmap_lock); 2003 2004 btrfs_delalloc_release_extents(inode, fsize); 2005 if (only_release_metadata) 2006 btrfs_check_nocow_unlock(inode); 2007 sb_end_pagefault(inode->vfs_inode.i_sb); 2008 extent_changeset_free(data_reserved); 2009 return VM_FAULT_LOCKED; 2010 2011 out_unlock: 2012 folio_unlock(folio); 2013 up_read(&inode->i_mmap_lock); 2014 out: 2015 btrfs_delalloc_release_extents(inode, fsize); 2016 if (only_release_metadata) 2017 btrfs_delalloc_release_metadata(inode, reserved_space, true); 2018 else 2019 btrfs_delalloc_release_space(inode, data_reserved, page_start, 2020 reserved_space, true); 2021 extent_changeset_free(data_reserved); 2022 out_noreserve: 2023 if (only_release_metadata) 2024 btrfs_check_nocow_unlock(inode); 2025 2026 sb_end_pagefault(inode->vfs_inode.i_sb); 2027 2028 if (ret < 0) 2029 return vmf_error(ret); 2030 2031 /* Make the VM retry the fault. */ 2032 return VM_FAULT_NOPAGE; 2033 } 2034 2035 static const struct vm_operations_struct btrfs_file_vm_ops = { 2036 .fault = filemap_fault, 2037 .map_pages = filemap_map_pages, 2038 .page_mkwrite = btrfs_page_mkwrite, 2039 }; 2040 2041 static int btrfs_file_mmap_prepare(struct vm_area_desc *desc) 2042 { 2043 struct file *filp = desc->file; 2044 struct address_space *mapping = filp->f_mapping; 2045 2046 if (!mapping->a_ops->read_folio) 2047 return -ENOEXEC; 2048 2049 file_accessed(filp); 2050 desc->vm_ops = &btrfs_file_vm_ops; 2051 2052 return 0; 2053 } 2054 2055 static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, 2056 int slot, u64 start, u64 end) 2057 { 2058 struct btrfs_file_extent_item *fi; 2059 struct btrfs_key key; 2060 2061 if (slot < 0 || slot >= btrfs_header_nritems(leaf)) 2062 return false; 2063 2064 btrfs_item_key_to_cpu(leaf, &key, slot); 2065 if (key.objectid != btrfs_ino(inode) || 2066 key.type != BTRFS_EXTENT_DATA_KEY) 2067 return false; 2068 2069 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 2070 2071 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) 2072 return false; 2073 2074 if (btrfs_file_extent_disk_bytenr(leaf, fi)) 2075 return false; 2076 2077 if (key.offset == end) 2078 return true; 2079 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) 2080 return true; 2081 return false; 2082 } 2083 2084 static int fill_holes(struct btrfs_trans_handle *trans, 2085 struct btrfs_inode *inode, 2086 struct btrfs_path *path, u64 offset, u64 end) 2087 { 2088 struct btrfs_fs_info *fs_info = trans->fs_info; 2089 struct btrfs_root *root = inode->root; 2090 struct extent_buffer *leaf; 2091 struct btrfs_file_extent_item *fi; 2092 struct extent_map *hole_em; 2093 struct btrfs_key key; 2094 int ret; 2095 2096 if (btrfs_fs_incompat(fs_info, NO_HOLES)) 2097 goto out; 2098 2099 key.objectid = btrfs_ino(inode); 2100 
key.type = BTRFS_EXTENT_DATA_KEY; 2101 key.offset = offset; 2102 2103 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2104 if (ret <= 0) { 2105 /* 2106 * We should have dropped this offset, so if we find it then 2107 * something has gone horribly wrong. 2108 */ 2109 if (ret == 0) 2110 ret = -EINVAL; 2111 return ret; 2112 } 2113 2114 leaf = path->nodes[0]; 2115 if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) { 2116 u64 num_bytes; 2117 2118 path->slots[0]--; 2119 fi = btrfs_item_ptr(leaf, path->slots[0], 2120 struct btrfs_file_extent_item); 2121 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + 2122 end - offset; 2123 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2124 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 2125 btrfs_set_file_extent_offset(leaf, fi, 0); 2126 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 2127 goto out; 2128 } 2129 2130 if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) { 2131 u64 num_bytes; 2132 2133 key.offset = offset; 2134 btrfs_set_item_key_safe(trans, path, &key); 2135 fi = btrfs_item_ptr(leaf, path->slots[0], 2136 struct btrfs_file_extent_item); 2137 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - 2138 offset; 2139 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2140 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 2141 btrfs_set_file_extent_offset(leaf, fi, 0); 2142 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 2143 goto out; 2144 } 2145 btrfs_release_path(path); 2146 2147 ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, 2148 end - offset); 2149 if (ret) 2150 return ret; 2151 2152 out: 2153 btrfs_release_path(path); 2154 2155 hole_em = btrfs_alloc_extent_map(); 2156 if (!hole_em) { 2157 btrfs_drop_extent_map_range(inode, offset, end - 1, false); 2158 btrfs_set_inode_full_sync(inode); 2159 } else { 2160 hole_em->start = offset; 2161 hole_em->len = end - offset; 2162 hole_em->ram_bytes = hole_em->len; 2163 2164 hole_em->disk_bytenr = EXTENT_MAP_HOLE; 2165 hole_em->disk_num_bytes = 0; 2166 hole_em->generation = trans->transid; 2167 2168 ret = btrfs_replace_extent_map_range(inode, hole_em, true); 2169 btrfs_free_extent_map(hole_em); 2170 if (ret) 2171 btrfs_set_inode_full_sync(inode); 2172 } 2173 2174 return 0; 2175 } 2176 2177 /* 2178 * Find a hole extent on given inode and change start/len to the end of hole 2179 * extent.(hole/vacuum extent whose em->start <= start && 2180 * em->start + em->len > start) 2181 * When a hole extent is found, return 1 and modify start/len. 2182 */ 2183 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) 2184 { 2185 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2186 struct extent_map *em; 2187 int ret = 0; 2188 2189 em = btrfs_get_extent(inode, NULL, 2190 round_down(*start, fs_info->sectorsize), 2191 round_up(*len, fs_info->sectorsize)); 2192 if (IS_ERR(em)) 2193 return PTR_ERR(em); 2194 2195 /* Hole or vacuum extent(only exists in no-hole mode) */ 2196 if (em->disk_bytenr == EXTENT_MAP_HOLE) { 2197 ret = 1; 2198 *len = em->start + em->len > *start + *len ? 2199 0 : *start + *len - em->start - em->len; 2200 *start = em->start + em->len; 2201 } 2202 btrfs_free_extent_map(em); 2203 return ret; 2204 } 2205 2206 /* 2207 * Check if there is no folio in the range. 
2208 * 2209 * We cannot utilize filemap_range_has_page() in a filemap with large folios 2210 * as we can hit the following false positive: 2211 * 2212 * start end 2213 * | | 2214 * |//|//|//|//| | | | | | | | |//|//| 2215 * \ / \ / 2216 * Folio A Folio B 2217 * 2218 * Large folios A and B cover the start and end indexes. 2219 * In that case filemap_range_has_page() will always return true, but the above 2220 * case is fine for btrfs_punch_hole_lock_range() usage. 2221 * 2222 * So here we only ensure that no other folio is in the range, excluding the 2223 * head/tail large folios. 2224 */ 2225 static bool check_range_has_page(struct inode *inode, u64 start, u64 end) 2226 { 2227 struct folio_batch fbatch; 2228 bool ret = false; 2229 /* 2230 * For the subpage case, if the range is not at a page boundary, we could 2231 * have pages at the leading/trailing part of the range. 2232 * This could lead to an infinite loop since filemap_range_has_page() 2233 * will always return true. 2234 * So here we need to do extra page alignment for 2235 * filemap_range_has_page(). 2236 * 2237 * And do not decrease page_lockend right now, as it can be 0. 2238 */ 2239 const u64 page_lockstart = round_up(start, PAGE_SIZE); 2240 const u64 page_lockend = round_down(end + 1, PAGE_SIZE); 2241 const pgoff_t start_index = page_lockstart >> PAGE_SHIFT; 2242 const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT; 2243 pgoff_t tmp = start_index; 2244 int found_folios; 2245 2246 /* The same page or adjacent pages. */ 2247 if (page_lockend <= page_lockstart) 2248 return false; 2249 2250 folio_batch_init(&fbatch); 2251 found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch); 2252 for (int i = 0; i < found_folios; i++) { 2253 struct folio *folio = fbatch.folios[i]; 2254 2255 /* A large folio begins before the start. Not a target. */ 2256 if (folio->index < start_index) 2257 continue; 2258 /* A large folio extends beyond the end. Not a target. */ 2259 if (folio_next_index(folio) > end_index) 2260 continue; 2261 /* A folio doesn't cover the head/tail index. Found a target. */ 2262 ret = true; 2263 break; 2264 } 2265 folio_batch_release(&fbatch); 2266 return ret; 2267 } 2268 2269 static void btrfs_punch_hole_lock_range(struct inode *inode, 2270 const u64 lockstart, const u64 lockend, 2271 struct extent_state **cached_state) 2272 { 2273 while (1) { 2274 truncate_pagecache_range(inode, lockstart, lockend); 2275 2276 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2277 cached_state); 2278 /* 2279 * We can't have ordered extents in the range, nor dirty/writeback 2280 * pages, because we have locked the inode's VFS lock in exclusive 2281 * mode, we have locked the inode's i_mmap_lock in exclusive mode, 2282 * we have flushed all delalloc in the range and we have waited 2283 * for any ordered extents in the range to complete. 2284 * We can race with anyone reading pages from this range, so after 2285 * locking the range check if we have pages in the range, and if 2286 * we do, unlock the range and retry.
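 * Note that each loop iteration calls truncate_pagecache_range() again, so
 * any folio that raced in is dropped before we retake the extent lock.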
2287 */ 2288 if (!check_range_has_page(inode, lockstart, lockend)) 2289 break; 2290 2291 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2292 cached_state); 2293 } 2294 2295 btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); 2296 } 2297 2298 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, 2299 struct btrfs_inode *inode, 2300 struct btrfs_path *path, 2301 struct btrfs_replace_extent_info *extent_info, 2302 const u64 replace_len, 2303 const u64 bytes_to_drop) 2304 { 2305 struct btrfs_fs_info *fs_info = trans->fs_info; 2306 struct btrfs_root *root = inode->root; 2307 struct btrfs_file_extent_item *extent; 2308 struct extent_buffer *leaf; 2309 struct btrfs_key key; 2310 int slot; 2311 int ret; 2312 2313 if (replace_len == 0) 2314 return 0; 2315 2316 if (extent_info->disk_offset == 0 && 2317 btrfs_fs_incompat(fs_info, NO_HOLES)) { 2318 btrfs_update_inode_bytes(inode, 0, bytes_to_drop); 2319 return 0; 2320 } 2321 2322 key.objectid = btrfs_ino(inode); 2323 key.type = BTRFS_EXTENT_DATA_KEY; 2324 key.offset = extent_info->file_offset; 2325 ret = btrfs_insert_empty_item(trans, root, path, &key, 2326 sizeof(struct btrfs_file_extent_item)); 2327 if (ret) 2328 return ret; 2329 leaf = path->nodes[0]; 2330 slot = path->slots[0]; 2331 write_extent_buffer(leaf, extent_info->extent_buf, 2332 btrfs_item_ptr_offset(leaf, slot), 2333 sizeof(struct btrfs_file_extent_item)); 2334 extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 2335 ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); 2336 btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset); 2337 btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); 2338 if (extent_info->is_new_extent) 2339 btrfs_set_file_extent_generation(leaf, extent, trans->transid); 2340 btrfs_release_path(path); 2341 2342 ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, 2343 replace_len); 2344 if (ret) 2345 return ret; 2346 2347 /* If it's a hole, nothing more needs to be done. */ 2348 if (extent_info->disk_offset == 0) { 2349 btrfs_update_inode_bytes(inode, 0, bytes_to_drop); 2350 return 0; 2351 } 2352 2353 btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop); 2354 2355 if (extent_info->is_new_extent && extent_info->insertions == 0) { 2356 key.objectid = extent_info->disk_offset; 2357 key.type = BTRFS_EXTENT_ITEM_KEY; 2358 key.offset = extent_info->disk_len; 2359 ret = btrfs_alloc_reserved_file_extent(trans, root, 2360 btrfs_ino(inode), 2361 extent_info->file_offset, 2362 extent_info->qgroup_reserved, 2363 &key); 2364 } else { 2365 struct btrfs_ref ref = { 2366 .action = BTRFS_ADD_DELAYED_REF, 2367 .bytenr = extent_info->disk_offset, 2368 .num_bytes = extent_info->disk_len, 2369 .owning_root = btrfs_root_id(root), 2370 .ref_root = btrfs_root_id(root), 2371 }; 2372 u64 ref_offset; 2373 2374 ref_offset = extent_info->file_offset - extent_info->data_offset; 2375 btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false); 2376 ret = btrfs_inc_extent_ref(trans, &ref); 2377 } 2378 2379 extent_info->insertions++; 2380 2381 return ret; 2382 } 2383 2384 /* 2385 * The respective range must have been previously locked, as well as the inode. 2386 * The end offset is inclusive (last byte of the range). 2387 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing 2388 * the file range with an extent. 
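 * (a non-NULL @extent_info is used, for example, when cloning a file range
 * with reflink).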
2389 * When not punching a hole, we don't want to end up in a state where we dropped 2390 * extents without inserting a new one, so we must abort the transaction to avoid 2391 * a corruption. 2392 */ 2393 int btrfs_replace_file_extents(struct btrfs_inode *inode, 2394 struct btrfs_path *path, const u64 start, 2395 const u64 end, 2396 struct btrfs_replace_extent_info *extent_info, 2397 struct btrfs_trans_handle **trans_out) 2398 { 2399 struct btrfs_drop_extents_args drop_args = { 0 }; 2400 struct btrfs_root *root = inode->root; 2401 struct btrfs_fs_info *fs_info = root->fs_info; 2402 u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); 2403 u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); 2404 struct btrfs_trans_handle *trans = NULL; 2405 struct btrfs_block_rsv rsv; 2406 unsigned int rsv_count; 2407 u64 cur_offset; 2408 u64 len = end - start; 2409 int ret = 0; 2410 2411 if (end <= start) 2412 return -EINVAL; 2413 2414 btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); 2415 rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1); 2416 rsv.failfast = true; 2417 2418 /* 2419 * 1 - update the inode 2420 * 1 - removing the extents in the range 2421 * 1 - adding the hole extent if no_holes isn't set or if we are 2422 * replacing the range with a new extent 2423 */ 2424 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info) 2425 rsv_count = 3; 2426 else 2427 rsv_count = 2; 2428 2429 trans = btrfs_start_transaction(root, rsv_count); 2430 if (IS_ERR(trans)) { 2431 ret = PTR_ERR(trans); 2432 trans = NULL; 2433 goto out_release; 2434 } 2435 2436 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv, 2437 min_size, false); 2438 if (WARN_ON(ret)) 2439 goto out_trans; 2440 trans->block_rsv = &rsv; 2441 2442 cur_offset = start; 2443 drop_args.path = path; 2444 drop_args.end = end + 1; 2445 drop_args.drop_cache = true; 2446 while (cur_offset < end) { 2447 drop_args.start = cur_offset; 2448 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 2449 /* If we are punching a hole decrement the inode's byte count */ 2450 if (!extent_info) 2451 btrfs_update_inode_bytes(inode, 0, 2452 drop_args.bytes_found); 2453 if (ret != -ENOSPC) { 2454 /* 2455 * The only time we don't want to abort is if we are 2456 * attempting to clone a partial inline extent, in which 2457 * case we'll get EOPNOTSUPP. However if we aren't 2458 * clone we need to abort no matter what, because if we 2459 * got EOPNOTSUPP via prealloc then we messed up and 2460 * need to abort. 2461 */ 2462 if (unlikely(ret && 2463 (ret != -EOPNOTSUPP || 2464 (extent_info && extent_info->is_new_extent)))) 2465 btrfs_abort_transaction(trans, ret); 2466 break; 2467 } 2468 2469 trans->block_rsv = &fs_info->trans_block_rsv; 2470 2471 if (!extent_info && cur_offset < drop_args.drop_end && 2472 cur_offset < ino_size) { 2473 ret = fill_holes(trans, inode, path, cur_offset, 2474 drop_args.drop_end); 2475 if (unlikely(ret)) { 2476 /* 2477 * If we failed then we didn't insert our hole 2478 * entries for the area we dropped, so now the 2479 * fs is corrupted, so we must abort the 2480 * transaction. 2481 */ 2482 btrfs_abort_transaction(trans, ret); 2483 break; 2484 } 2485 } else if (!extent_info && cur_offset < drop_args.drop_end) { 2486 /* 2487 * We are past the i_size here, but since we didn't 2488 * insert holes we need to clear the mapped area so we 2489 * know to not set disk_i_size in this area until a new 2490 * file extent is inserted here. 
2491 */ 2492 ret = btrfs_inode_clear_file_extent_range(inode, 2493 cur_offset, 2494 drop_args.drop_end - cur_offset); 2495 if (unlikely(ret)) { 2496 /* 2497 * We couldn't clear our area, so we could 2498 * presumably adjust up and corrupt the fs, so 2499 * we need to abort. 2500 */ 2501 btrfs_abort_transaction(trans, ret); 2502 break; 2503 } 2504 } 2505 2506 if (extent_info && 2507 drop_args.drop_end > extent_info->file_offset) { 2508 u64 replace_len = drop_args.drop_end - 2509 extent_info->file_offset; 2510 2511 ret = btrfs_insert_replace_extent(trans, inode, path, 2512 extent_info, replace_len, 2513 drop_args.bytes_found); 2514 if (unlikely(ret)) { 2515 btrfs_abort_transaction(trans, ret); 2516 break; 2517 } 2518 extent_info->data_len -= replace_len; 2519 extent_info->data_offset += replace_len; 2520 extent_info->file_offset += replace_len; 2521 } 2522 2523 /* 2524 * We are releasing our handle on the transaction, balance the 2525 * dirty pages of the btree inode and flush delayed items, and 2526 * then get a new transaction handle, which may now point to a 2527 * new transaction in case someone else may have committed the 2528 * transaction we used to replace/drop file extent items. So 2529 * bump the inode's iversion and update mtime and ctime except 2530 * if we are called from a dedupe context. This is because a 2531 * power failure/crash may happen after the transaction is 2532 * committed and before we finish replacing/dropping all the 2533 * file extent items we need. 2534 */ 2535 inode_inc_iversion(&inode->vfs_inode); 2536 2537 if (!extent_info || extent_info->update_times) 2538 inode_set_mtime_to_ts(&inode->vfs_inode, 2539 inode_set_ctime_current(&inode->vfs_inode)); 2540 2541 ret = btrfs_update_inode(trans, inode); 2542 if (ret) 2543 break; 2544 2545 btrfs_end_transaction(trans); 2546 btrfs_btree_balance_dirty(fs_info); 2547 2548 trans = btrfs_start_transaction(root, rsv_count); 2549 if (IS_ERR(trans)) { 2550 ret = PTR_ERR(trans); 2551 trans = NULL; 2552 break; 2553 } 2554 2555 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, 2556 &rsv, min_size, false); 2557 if (WARN_ON(ret)) 2558 break; 2559 trans->block_rsv = &rsv; 2560 2561 cur_offset = drop_args.drop_end; 2562 len = end - cur_offset; 2563 if (!extent_info && len) { 2564 ret = find_first_non_hole(inode, &cur_offset, &len); 2565 if (unlikely(ret < 0)) 2566 break; 2567 if (ret && !len) { 2568 ret = 0; 2569 break; 2570 } 2571 } 2572 } 2573 2574 /* 2575 * If we were cloning, force the next fsync to be a full one since we 2576 * replaced (or just dropped in the case of cloning holes when 2577 * NO_HOLES is enabled) file extent items and did not set up new extent 2578 * maps for the replacement extents (or holes). 2579 */ 2580 if (extent_info && !extent_info->is_new_extent) 2581 btrfs_set_inode_full_sync(inode); 2582 2583 if (ret) 2584 goto out_trans; 2585 2586 trans->block_rsv = &fs_info->trans_block_rsv; 2587 /* 2588 * If we are using the NO_HOLES feature we might already have had a 2589 * hole that overlaps a part of the region [lockstart, lockend] and 2590 * ends at (or beyond) lockend. Since we have no file extent items to 2591 * represent holes, drop_end can be less than lockend and so we must 2592 * make sure we have an extent map representing the existing hole (the 2593 * call to btrfs_drop_extents() might have dropped the existing extent 2594 * map representing the existing hole), otherwise the fast fsync path 2595 * will not record the existence of the hole region 2596 * [existing_hole_start, lockend].
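 * The fill_holes() call below recreates such an extent map when needed.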
2597 */ 2598 if (drop_args.drop_end <= end) 2599 drop_args.drop_end = end + 1; 2600 /* 2601 * Don't insert file hole extent item if it's for a range beyond eof 2602 * (because it's useless) or if it represents a 0 bytes range (when 2603 * cur_offset == drop_end). 2604 */ 2605 if (!extent_info && cur_offset < ino_size && 2606 cur_offset < drop_args.drop_end) { 2607 ret = fill_holes(trans, inode, path, cur_offset, 2608 drop_args.drop_end); 2609 if (unlikely(ret)) { 2610 /* Same comment as above. */ 2611 btrfs_abort_transaction(trans, ret); 2612 goto out_trans; 2613 } 2614 } else if (!extent_info && cur_offset < drop_args.drop_end) { 2615 /* See the comment in the loop above for the reasoning here. */ 2616 ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, 2617 drop_args.drop_end - cur_offset); 2618 if (unlikely(ret)) { 2619 btrfs_abort_transaction(trans, ret); 2620 goto out_trans; 2621 } 2622 2623 } 2624 if (extent_info) { 2625 ret = btrfs_insert_replace_extent(trans, inode, path, 2626 extent_info, extent_info->data_len, 2627 drop_args.bytes_found); 2628 if (unlikely(ret)) { 2629 btrfs_abort_transaction(trans, ret); 2630 goto out_trans; 2631 } 2632 } 2633 2634 out_trans: 2635 if (!trans) 2636 goto out_release; 2637 2638 trans->block_rsv = &fs_info->trans_block_rsv; 2639 if (ret) 2640 btrfs_end_transaction(trans); 2641 else 2642 *trans_out = trans; 2643 out_release: 2644 btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); 2645 return ret; 2646 } 2647 2648 static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) 2649 { 2650 struct inode *inode = file_inode(file); 2651 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 2652 struct btrfs_root *root = BTRFS_I(inode)->root; 2653 struct extent_state *cached_state = NULL; 2654 struct btrfs_path *path; 2655 struct btrfs_trans_handle *trans = NULL; 2656 u64 lockstart; 2657 u64 lockend; 2658 u64 tail_start; 2659 u64 tail_len; 2660 const u64 orig_start = offset; 2661 const u64 orig_end = offset + len - 1; 2662 int ret = 0; 2663 bool same_block; 2664 u64 ino_size; 2665 bool truncated_block = false; 2666 bool updated_inode = false; 2667 2668 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 2669 2670 ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len); 2671 if (ret) 2672 goto out_only_mutex; 2673 2674 ino_size = round_up(inode->i_size, fs_info->sectorsize); 2675 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); 2676 if (ret < 0) 2677 goto out_only_mutex; 2678 if (ret && !len) { 2679 /* Already in a large hole */ 2680 ret = 0; 2681 goto out_only_mutex; 2682 } 2683 2684 ret = file_modified(file); 2685 if (ret) 2686 goto out_only_mutex; 2687 2688 lockstart = round_up(offset, fs_info->sectorsize); 2689 lockend = round_down(offset + len, fs_info->sectorsize) - 1; 2690 same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) 2691 == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); 2692 /* 2693 * Only do this if we are in the same block and we aren't doing the 2694 * entire block. 
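 * In that case zeroing the partial block is enough and there are no whole
 * blocks whose file extent items would need to be dropped.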
2695 */ 2696 if (same_block && len < fs_info->sectorsize) { 2697 if (offset < ino_size) { 2698 truncated_block = true; 2699 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, 2700 orig_start, orig_end); 2701 } else { 2702 ret = 0; 2703 } 2704 goto out_only_mutex; 2705 } 2706 2707 /* zero back part of the first block */ 2708 if (offset < ino_size) { 2709 truncated_block = true; 2710 ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end); 2711 if (ret) { 2712 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 2713 return ret; 2714 } 2715 } 2716 2717 /* Check the aligned pages after the first unaligned page, 2718 * if offset != orig_start, which means the first unaligned page 2719 * including several following pages are already in holes, 2720 * the extra check can be skipped */ 2721 if (offset == orig_start) { 2722 /* after truncate page, check hole again */ 2723 len = offset + len - lockstart; 2724 offset = lockstart; 2725 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); 2726 if (ret < 0) 2727 goto out_only_mutex; 2728 if (ret && !len) { 2729 ret = 0; 2730 goto out_only_mutex; 2731 } 2732 lockstart = offset; 2733 } 2734 2735 /* Check the tail unaligned part is in a hole */ 2736 tail_start = lockend + 1; 2737 tail_len = offset + len - tail_start; 2738 if (tail_len) { 2739 ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len); 2740 if (unlikely(ret < 0)) 2741 goto out_only_mutex; 2742 if (!ret) { 2743 /* zero the front end of the last page */ 2744 if (tail_start + tail_len < ino_size) { 2745 truncated_block = true; 2746 ret = btrfs_truncate_block(BTRFS_I(inode), 2747 tail_start + tail_len - 1, 2748 orig_start, orig_end); 2749 if (ret) 2750 goto out_only_mutex; 2751 } 2752 } 2753 } 2754 2755 if (lockend < lockstart) { 2756 ret = 0; 2757 goto out_only_mutex; 2758 } 2759 2760 btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); 2761 2762 path = btrfs_alloc_path(); 2763 if (!path) { 2764 ret = -ENOMEM; 2765 goto out; 2766 } 2767 2768 ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart, 2769 lockend, NULL, &trans); 2770 btrfs_free_path(path); 2771 if (ret) 2772 goto out; 2773 2774 ASSERT(trans != NULL); 2775 inode_inc_iversion(inode); 2776 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); 2777 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 2778 updated_inode = true; 2779 btrfs_end_transaction(trans); 2780 btrfs_btree_balance_dirty(fs_info); 2781 out: 2782 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2783 &cached_state); 2784 out_only_mutex: 2785 if (!updated_inode && truncated_block && !ret) { 2786 /* 2787 * If we only end up zeroing part of a page, we still need to 2788 * update the inode item, so that all the time fields are 2789 * updated as well as the necessary btrfs inode in memory fields 2790 * for detecting, at fsync time, if the inode isn't yet in the 2791 * log tree or it's there but not up to date. 
2792 */ 2793 struct timespec64 now = inode_set_ctime_current(inode); 2794 2795 inode_inc_iversion(inode); 2796 inode_set_mtime_to_ts(inode, now); 2797 trans = btrfs_start_transaction(root, 1); 2798 if (IS_ERR(trans)) { 2799 ret = PTR_ERR(trans); 2800 } else { 2801 int ret2; 2802 2803 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 2804 ret2 = btrfs_end_transaction(trans); 2805 if (!ret) 2806 ret = ret2; 2807 } 2808 } 2809 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 2810 return ret; 2811 } 2812 2813 /* Helper structure to record which range is already reserved */ 2814 struct falloc_range { 2815 struct list_head list; 2816 u64 start; 2817 u64 len; 2818 }; 2819 2820 /* 2821 * Helper function to add falloc range 2822 * 2823 * Caller should have locked the larger range of extent containing 2824 * [start, len) 2825 */ 2826 static int add_falloc_range(struct list_head *head, u64 start, u64 len) 2827 { 2828 struct falloc_range *range = NULL; 2829 2830 if (!list_empty(head)) { 2831 /* 2832 * As fallocate iterates by bytenr order, we only need to check 2833 * the last range. 2834 */ 2835 range = list_last_entry(head, struct falloc_range, list); 2836 if (range->start + range->len == start) { 2837 range->len += len; 2838 return 0; 2839 } 2840 } 2841 2842 range = kmalloc(sizeof(*range), GFP_KERNEL); 2843 if (!range) 2844 return -ENOMEM; 2845 range->start = start; 2846 range->len = len; 2847 list_add_tail(&range->list, head); 2848 return 0; 2849 } 2850 2851 static int btrfs_fallocate_update_isize(struct inode *inode, 2852 const u64 end, 2853 const int mode) 2854 { 2855 struct btrfs_trans_handle *trans; 2856 struct btrfs_root *root = BTRFS_I(inode)->root; 2857 int ret; 2858 int ret2; 2859 2860 if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) 2861 return 0; 2862 2863 trans = btrfs_start_transaction(root, 1); 2864 if (IS_ERR(trans)) 2865 return PTR_ERR(trans); 2866 2867 inode_set_ctime_current(inode); 2868 i_size_write(inode, end); 2869 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 2870 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 2871 ret2 = btrfs_end_transaction(trans); 2872 2873 return ret ? 
ret : ret2; 2874 } 2875 2876 enum { 2877 RANGE_BOUNDARY_WRITTEN_EXTENT, 2878 RANGE_BOUNDARY_PREALLOC_EXTENT, 2879 RANGE_BOUNDARY_HOLE, 2880 }; 2881 2882 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, 2883 u64 offset) 2884 { 2885 const u64 sectorsize = inode->root->fs_info->sectorsize; 2886 struct extent_map *em; 2887 int ret; 2888 2889 offset = round_down(offset, sectorsize); 2890 em = btrfs_get_extent(inode, NULL, offset, sectorsize); 2891 if (IS_ERR(em)) 2892 return PTR_ERR(em); 2893 2894 if (em->disk_bytenr == EXTENT_MAP_HOLE) 2895 ret = RANGE_BOUNDARY_HOLE; 2896 else if (em->flags & EXTENT_FLAG_PREALLOC) 2897 ret = RANGE_BOUNDARY_PREALLOC_EXTENT; 2898 else 2899 ret = RANGE_BOUNDARY_WRITTEN_EXTENT; 2900 2901 btrfs_free_extent_map(em); 2902 return ret; 2903 } 2904 2905 static int btrfs_zero_range(struct inode *inode, 2906 loff_t offset, 2907 loff_t len, 2908 const int mode) 2909 { 2910 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2911 struct extent_map *em; 2912 struct extent_changeset *data_reserved = NULL; 2913 int ret; 2914 u64 alloc_hint = 0; 2915 const u64 sectorsize = fs_info->sectorsize; 2916 const u64 orig_start = offset; 2917 const u64 orig_end = offset + len - 1; 2918 u64 alloc_start = round_down(offset, sectorsize); 2919 u64 alloc_end = round_up(offset + len, sectorsize); 2920 u64 bytes_to_reserve = 0; 2921 bool space_reserved = false; 2922 2923 em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, 2924 alloc_end - alloc_start); 2925 if (IS_ERR(em)) { 2926 ret = PTR_ERR(em); 2927 goto out; 2928 } 2929 2930 /* 2931 * Avoid hole punching and extent allocation for some cases. More cases 2932 * could be considered, but these are unlikely common and we keep things 2933 * as simple as possible for now. Also, intentionally, if the target 2934 * range contains one or more prealloc extents together with regular 2935 * extents and holes, we drop all the existing extents and allocate a 2936 * new prealloc extent, so that we get a larger contiguous disk extent. 2937 */ 2938 if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) { 2939 const u64 em_end = em->start + em->len; 2940 2941 if (em_end >= offset + len) { 2942 /* 2943 * The whole range is already a prealloc extent, 2944 * do nothing except updating the inode's i_size if 2945 * needed. 2946 */ 2947 btrfs_free_extent_map(em); 2948 ret = btrfs_fallocate_update_isize(inode, offset + len, 2949 mode); 2950 goto out; 2951 } 2952 /* 2953 * Part of the range is already a prealloc extent, so operate 2954 * only on the remaining part of the range. 
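 * (the prealloc part already reads back as zeroes, so it needs no work).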
2955 */ 2956 alloc_start = em_end; 2957 ASSERT(IS_ALIGNED(alloc_start, sectorsize)); 2958 len = offset + len - alloc_start; 2959 offset = alloc_start; 2960 alloc_hint = btrfs_extent_map_block_start(em) + em->len; 2961 } 2962 btrfs_free_extent_map(em); 2963 2964 if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == 2965 BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { 2966 em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize); 2967 if (IS_ERR(em)) { 2968 ret = PTR_ERR(em); 2969 goto out; 2970 } 2971 2972 if (em->flags & EXTENT_FLAG_PREALLOC) { 2973 btrfs_free_extent_map(em); 2974 ret = btrfs_fallocate_update_isize(inode, offset + len, 2975 mode); 2976 goto out; 2977 } 2978 if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) { 2979 btrfs_free_extent_map(em); 2980 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, 2981 orig_start, orig_end); 2982 if (!ret) 2983 ret = btrfs_fallocate_update_isize(inode, 2984 offset + len, 2985 mode); 2986 return ret; 2987 } 2988 btrfs_free_extent_map(em); 2989 alloc_start = round_down(offset, sectorsize); 2990 alloc_end = alloc_start + sectorsize; 2991 goto reserve_space; 2992 } 2993 2994 alloc_start = round_up(offset, sectorsize); 2995 alloc_end = round_down(offset + len, sectorsize); 2996 2997 /* 2998 * For unaligned ranges, check the pages at the boundaries, they might 2999 * map to an extent, in which case we need to partially zero them, or 3000 * they might map to a hole, in which case we need our allocation range 3001 * to cover them. 3002 */ 3003 if (!IS_ALIGNED(offset, sectorsize)) { 3004 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), 3005 offset); 3006 if (ret < 0) 3007 goto out; 3008 if (ret == RANGE_BOUNDARY_HOLE) { 3009 alloc_start = round_down(offset, sectorsize); 3010 ret = 0; 3011 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { 3012 ret = btrfs_truncate_block(BTRFS_I(inode), offset, 3013 orig_start, orig_end); 3014 if (ret) 3015 goto out; 3016 } else { 3017 ret = 0; 3018 } 3019 } 3020 3021 if (!IS_ALIGNED(offset + len, sectorsize)) { 3022 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), 3023 offset + len); 3024 if (ret < 0) 3025 goto out; 3026 if (ret == RANGE_BOUNDARY_HOLE) { 3027 alloc_end = round_up(offset + len, sectorsize); 3028 ret = 0; 3029 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { 3030 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, 3031 orig_start, orig_end); 3032 if (ret) 3033 goto out; 3034 } else { 3035 ret = 0; 3036 } 3037 } 3038 3039 reserve_space: 3040 if (alloc_start < alloc_end) { 3041 struct extent_state *cached_state = NULL; 3042 const u64 lockstart = alloc_start; 3043 const u64 lockend = alloc_end - 1; 3044 3045 bytes_to_reserve = alloc_end - alloc_start; 3046 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), 3047 bytes_to_reserve); 3048 if (ret < 0) 3049 goto out; 3050 space_reserved = true; 3051 btrfs_punch_hole_lock_range(inode, lockstart, lockend, 3052 &cached_state); 3053 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, 3054 alloc_start, bytes_to_reserve); 3055 if (ret) { 3056 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, 3057 lockend, &cached_state); 3058 goto out; 3059 } 3060 ret = btrfs_prealloc_file_range(inode, mode, alloc_start, 3061 alloc_end - alloc_start, 3062 fs_info->sectorsize, 3063 offset + len, &alloc_hint); 3064 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 3065 &cached_state); 3066 /* btrfs_prealloc_file_range releases reserved space on error */ 3067 if (ret) { 3068 space_reserved = 
false; 3069 goto out; 3070 } 3071 } 3072 ret = btrfs_fallocate_update_isize(inode, offset + len, mode); 3073 out: 3074 if (ret && space_reserved) 3075 btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, 3076 alloc_start, bytes_to_reserve); 3077 extent_changeset_free(data_reserved); 3078 3079 return ret; 3080 } 3081 3082 static long btrfs_fallocate(struct file *file, int mode, 3083 loff_t offset, loff_t len) 3084 { 3085 struct inode *inode = file_inode(file); 3086 struct extent_state *cached_state = NULL; 3087 struct extent_changeset *data_reserved = NULL; 3088 struct falloc_range *range; 3089 struct falloc_range *tmp; 3090 LIST_HEAD(reserve_list); 3091 u64 cur_offset; 3092 u64 last_byte; 3093 u64 alloc_start; 3094 u64 alloc_end; 3095 u64 alloc_hint = 0; 3096 u64 locked_end; 3097 u64 actual_end = 0; 3098 u64 data_space_needed = 0; 3099 u64 data_space_reserved = 0; 3100 u64 qgroup_reserved = 0; 3101 struct extent_map *em; 3102 int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; 3103 int ret; 3104 3105 /* Do not allow fallocate in ZONED mode */ 3106 if (btrfs_is_zoned(inode_to_fs_info(inode))) 3107 return -EOPNOTSUPP; 3108 3109 alloc_start = round_down(offset, blocksize); 3110 alloc_end = round_up(offset + len, blocksize); 3111 cur_offset = alloc_start; 3112 3113 /* Make sure we aren't being given some crap mode */ 3114 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 3115 FALLOC_FL_ZERO_RANGE)) 3116 return -EOPNOTSUPP; 3117 3118 if (mode & FALLOC_FL_PUNCH_HOLE) 3119 return btrfs_punch_hole(file, offset, len); 3120 3121 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 3122 3123 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { 3124 ret = inode_newsize_ok(inode, offset + len); 3125 if (ret) 3126 goto out; 3127 } 3128 3129 ret = file_modified(file); 3130 if (ret) 3131 goto out; 3132 3133 /* 3134 * TODO: Move these two operations after we have checked 3135 * accurate reserved space, or fallocate can still fail but 3136 * with page truncated or size expanded. 3137 * 3138 * But that's a minor problem and won't do much harm BTW. 3139 */ 3140 if (alloc_start > inode->i_size) { 3141 ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode), 3142 alloc_start); 3143 if (ret) 3144 goto out; 3145 } else if (offset + len > inode->i_size) { 3146 /* 3147 * If we are fallocating from the end of the file onward we 3148 * need to zero out the end of the block if i_size lands in the 3149 * middle of a block. 3150 */ 3151 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 3152 inode->i_size, (u64)-1); 3153 if (ret) 3154 goto out; 3155 } 3156 3157 /* 3158 * We have locked the inode at the VFS level (in exclusive mode) and we 3159 * have locked the i_mmap_lock (in exclusive mode). Now before 3160 * locking the file range, flush all delalloc in the range and wait for 3161 * all ordered extents in the range to complete. After this we can lock 3162 * the file range and, due to the previous locking we did, we know there 3163 * can't be more delalloc or ordered extents in the range.
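 * (btrfs_assert_inode_range_clean() below sanity checks that once the range
 * is locked.)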
3164 */ 3165 ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start, 3166 alloc_end - alloc_start); 3167 if (ret) 3168 goto out; 3169 3170 if (mode & FALLOC_FL_ZERO_RANGE) { 3171 ret = btrfs_zero_range(inode, offset, len, mode); 3172 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 3173 return ret; 3174 } 3175 3176 locked_end = alloc_end - 1; 3177 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 3178 &cached_state); 3179 3180 btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); 3181 3182 /* First, check if we exceed the qgroup limit */ 3183 while (cur_offset < alloc_end) { 3184 em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset, 3185 alloc_end - cur_offset); 3186 if (IS_ERR(em)) { 3187 ret = PTR_ERR(em); 3188 break; 3189 } 3190 last_byte = min(btrfs_extent_map_end(em), alloc_end); 3191 actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len); 3192 last_byte = ALIGN(last_byte, blocksize); 3193 if (em->disk_bytenr == EXTENT_MAP_HOLE || 3194 (cur_offset >= inode->i_size && 3195 !(em->flags & EXTENT_FLAG_PREALLOC))) { 3196 const u64 range_len = last_byte - cur_offset; 3197 3198 ret = add_falloc_range(&reserve_list, cur_offset, range_len); 3199 if (ret < 0) { 3200 btrfs_free_extent_map(em); 3201 break; 3202 } 3203 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), 3204 &data_reserved, cur_offset, range_len); 3205 if (ret < 0) { 3206 btrfs_free_extent_map(em); 3207 break; 3208 } 3209 qgroup_reserved += range_len; 3210 data_space_needed += range_len; 3211 } 3212 btrfs_free_extent_map(em); 3213 cur_offset = last_byte; 3214 } 3215 3216 if (!ret && data_space_needed > 0) { 3217 /* 3218 * We are safe to reserve space here as we can't have delalloc 3219 * in the range, see above. 3220 */ 3221 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), 3222 data_space_needed); 3223 if (!ret) 3224 data_space_reserved = data_space_needed; 3225 } 3226 3227 /* 3228 * If ret is still 0, means we're OK to fallocate. 3229 * Or just cleanup the list and exit. 3230 */ 3231 list_for_each_entry_safe(range, tmp, &reserve_list, list) { 3232 if (!ret) { 3233 ret = btrfs_prealloc_file_range(inode, mode, 3234 range->start, 3235 range->len, blocksize, 3236 offset + len, &alloc_hint); 3237 /* 3238 * btrfs_prealloc_file_range() releases space even 3239 * if it returns an error. 3240 */ 3241 data_space_reserved -= range->len; 3242 qgroup_reserved -= range->len; 3243 } else if (data_space_reserved > 0) { 3244 btrfs_free_reserved_data_space(BTRFS_I(inode), 3245 data_reserved, range->start, 3246 range->len); 3247 data_space_reserved -= range->len; 3248 qgroup_reserved -= range->len; 3249 } else if (qgroup_reserved > 0) { 3250 btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, 3251 range->start, range->len, NULL); 3252 qgroup_reserved -= range->len; 3253 } 3254 list_del(&range->list); 3255 kfree(range); 3256 } 3257 if (ret < 0) 3258 goto out_unlock; 3259 3260 /* 3261 * We didn't need to allocate any more space, but we still extended the 3262 * size of the file so we need to update i_size and the inode item. 3263 */ 3264 ret = btrfs_fallocate_update_isize(inode, actual_end, mode); 3265 out_unlock: 3266 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 3267 &cached_state); 3268 out: 3269 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 3270 extent_changeset_free(data_reserved); 3271 return ret; 3272 } 3273 3274 /* 3275 * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range 3276 * that has unflushed and/or flushing delalloc. 
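 * Here, unflushed delalloc means dirty pages still tagged with EXTENT_DELALLOC
 * in the io tree, while flushing delalloc is tracked through ordered extents
 * that have not completed yet.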
There might be other adjacent 3277 * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps 3278 * looping while it gets adjacent subranges, and merging them together. 3279 */ 3280 static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, 3281 struct extent_state **cached_state, 3282 bool *search_io_tree, 3283 u64 *delalloc_start_ret, u64 *delalloc_end_ret) 3284 { 3285 u64 len = end + 1 - start; 3286 u64 delalloc_len = 0; 3287 struct btrfs_ordered_extent *oe; 3288 u64 oe_start; 3289 u64 oe_end; 3290 3291 /* 3292 * Search the io tree first for EXTENT_DELALLOC. If we find any, it 3293 * means we have delalloc (dirty pages) for which writeback has not 3294 * started yet. 3295 */ 3296 if (*search_io_tree) { 3297 spin_lock(&inode->lock); 3298 if (inode->delalloc_bytes > 0) { 3299 spin_unlock(&inode->lock); 3300 *delalloc_start_ret = start; 3301 delalloc_len = btrfs_count_range_bits(&inode->io_tree, 3302 delalloc_start_ret, end, 3303 len, EXTENT_DELALLOC, 1, 3304 cached_state); 3305 } else { 3306 spin_unlock(&inode->lock); 3307 } 3308 } 3309 3310 if (delalloc_len > 0) { 3311 /* 3312 * If delalloc was found then *delalloc_start_ret has a sector size 3313 * aligned value (rounded down). 3314 */ 3315 *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; 3316 3317 if (*delalloc_start_ret == start) { 3318 /* Delalloc for the whole range, nothing more to do. */ 3319 if (*delalloc_end_ret == end) 3320 return true; 3321 /* Else trim our search range for ordered extents. */ 3322 start = *delalloc_end_ret + 1; 3323 len = end + 1 - start; 3324 } 3325 } else { 3326 /* No delalloc, future calls don't need to search again. */ 3327 *search_io_tree = false; 3328 } 3329 3330 /* 3331 * Now also check if there's any ordered extent in the range. 3332 * We do this because: 3333 * 3334 * 1) When delalloc is flushed, the file range is locked, we clear the 3335 * EXTENT_DELALLOC bit from the io tree and create an extent map and 3336 * an ordered extent for the write. So we might just have been called 3337 * after delalloc is flushed and before the ordered extent completes 3338 * and inserts the new file extent item in the subvolume's btree; 3339 * 3340 * 2) We may have an ordered extent created by flushing delalloc for a 3341 * subrange that starts before the subrange we found marked with 3342 * EXTENT_DELALLOC in the io tree. 3343 * 3344 * We could also use the extent map tree to find such delalloc that is 3345 * being flushed, but using the ordered extents tree is more efficient 3346 * because it's usually much smaller as ordered extents are removed from 3347 * the tree once they complete. With the extent maps, we may have them 3348 * in the extent map tree for a very long time, and they were either 3349 * created by previous writes or loaded by read operations. 3350 */ 3351 oe = btrfs_lookup_first_ordered_range(inode, start, len); 3352 if (!oe) 3353 return (delalloc_len > 0); 3354 3355 /* The ordered extent may span beyond our search range. */ 3356 oe_start = max(oe->file_offset, start); 3357 oe_end = min(oe->file_offset + oe->num_bytes - 1, end); 3358 3359 btrfs_put_ordered_extent(oe); 3360 3361 /* Don't have unflushed delalloc, return the ordered extent range. */ 3362 if (delalloc_len == 0) { 3363 *delalloc_start_ret = oe_start; 3364 *delalloc_end_ret = oe_end; 3365 return true; 3366 } 3367 3368 /* 3369 * We have both unflushed delalloc (io_tree) and an ordered extent. 
3370 * If the ranges are adjacent, return a combined range, otherwise 3371 * return the leftmost range. 3372 */ 3373 if (oe_start < *delalloc_start_ret) { 3374 if (oe_end < *delalloc_start_ret) 3375 *delalloc_end_ret = oe_end; 3376 *delalloc_start_ret = oe_start; 3377 } else if (*delalloc_end_ret + 1 == oe_start) { 3378 *delalloc_end_ret = oe_end; 3379 } 3380 3381 return true; 3382 } 3383 3384 /* 3385 * Check if there's delalloc in a given range. 3386 * 3387 * @inode: The inode. 3388 * @start: The start offset of the range. It does not need to be 3389 * sector size aligned. 3390 * @end: The end offset (inclusive value) of the search range. 3391 * It does not need to be sector size aligned. 3392 * @cached_state: Extent state record used for speeding up delalloc 3393 * searches in the inode's io_tree. Can be NULL. 3394 * @delalloc_start_ret: Output argument, set to the start offset of the 3395 * subrange found with delalloc (may not be sector size 3396 * aligned). 3397 * @delalloc_end_ret: Output argument, set to the end offset (inclusive value) 3398 * of the subrange found with delalloc. 3399 * 3400 * Returns true if a subrange with delalloc is found within the given range, and 3401 * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and 3402 * end offsets of the subrange. 3403 */ 3404 bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, 3405 struct extent_state **cached_state, 3406 u64 *delalloc_start_ret, u64 *delalloc_end_ret) 3407 { 3408 u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); 3409 u64 prev_delalloc_end = 0; 3410 bool search_io_tree = true; 3411 bool ret = false; 3412 3413 while (cur_offset <= end) { 3414 u64 delalloc_start; 3415 u64 delalloc_end; 3416 bool delalloc; 3417 3418 delalloc = find_delalloc_subrange(inode, cur_offset, end, 3419 cached_state, &search_io_tree, 3420 &delalloc_start, 3421 &delalloc_end); 3422 if (!delalloc) 3423 break; 3424 3425 if (prev_delalloc_end == 0) { 3426 /* First subrange found. */ 3427 *delalloc_start_ret = max(delalloc_start, start); 3428 *delalloc_end_ret = delalloc_end; 3429 ret = true; 3430 } else if (delalloc_start == prev_delalloc_end + 1) { 3431 /* Subrange adjacent to the previous one, merge them. */ 3432 *delalloc_end_ret = delalloc_end; 3433 } else { 3434 /* Subrange not adjacent to the previous one, exit. */ 3435 break; 3436 } 3437 3438 prev_delalloc_end = delalloc_end; 3439 cur_offset = delalloc_end + 1; 3440 cond_resched(); 3441 } 3442 3443 return ret; 3444 } 3445 3446 /* 3447 * Check if there's a hole or delalloc range in a range representing a hole (or 3448 * prealloc extent) found in the inode's subvolume btree. 3449 * 3450 * @inode: The inode. 3451 * @whence: Seek mode (SEEK_DATA or SEEK_HOLE). 3452 * @start: Start offset of the hole region. It does not need to be sector 3453 * size aligned. 3454 * @end: End offset (inclusive value) of the hole region. It does not 3455 * need to be sector size aligned. 3456 * @start_ret: Return parameter, used to set the start of the subrange in the 3457 * hole that matches the search criteria (seek mode), if such 3458 * subrange is found (return value of the function is true). 3459 * The value returned here may not be sector size aligned. 3460 * 3461 * Returns true if a subrange matching the given seek mode is found, and if one 3462 * is found, it updates @start_ret with the start of the subrange.
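 * Returns false when seeking data and the range has no delalloc at all, or
 * when seeking a hole and delalloc covers the entire range.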
3463 */ 3464 static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, 3465 struct extent_state **cached_state, 3466 u64 start, u64 end, u64 *start_ret) 3467 { 3468 u64 delalloc_start; 3469 u64 delalloc_end; 3470 bool delalloc; 3471 3472 delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state, 3473 &delalloc_start, &delalloc_end); 3474 if (delalloc && whence == SEEK_DATA) { 3475 *start_ret = delalloc_start; 3476 return true; 3477 } 3478 3479 if (delalloc && whence == SEEK_HOLE) { 3480 /* 3481 * We found delalloc but it starts after out start offset. So we 3482 * have a hole between our start offset and the delalloc start. 3483 */ 3484 if (start < delalloc_start) { 3485 *start_ret = start; 3486 return true; 3487 } 3488 /* 3489 * Delalloc range starts at our start offset. 3490 * If the delalloc range's length is smaller than our range, 3491 * then it means we have a hole that starts where the delalloc 3492 * subrange ends. 3493 */ 3494 if (delalloc_end < end) { 3495 *start_ret = delalloc_end + 1; 3496 return true; 3497 } 3498 3499 /* There's delalloc for the whole range. */ 3500 return false; 3501 } 3502 3503 if (!delalloc && whence == SEEK_HOLE) { 3504 *start_ret = start; 3505 return true; 3506 } 3507 3508 /* 3509 * No delalloc in the range and we are seeking for data. The caller has 3510 * to iterate to the next extent item in the subvolume btree. 3511 */ 3512 return false; 3513 } 3514 3515 static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) 3516 { 3517 struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host); 3518 struct btrfs_file_private *private; 3519 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3520 struct extent_state *cached_state = NULL; 3521 struct extent_state **delalloc_cached_state; 3522 const loff_t i_size = i_size_read(&inode->vfs_inode); 3523 const u64 ino = btrfs_ino(inode); 3524 struct btrfs_root *root = inode->root; 3525 struct btrfs_path *path; 3526 struct btrfs_key key; 3527 u64 last_extent_end; 3528 u64 lockstart; 3529 u64 lockend; 3530 u64 start; 3531 int ret; 3532 bool found = false; 3533 3534 if (i_size == 0 || offset >= i_size) 3535 return -ENXIO; 3536 3537 /* 3538 * Quick path. If the inode has no prealloc extents and its number of 3539 * bytes used matches its i_size, then it can not have holes. 3540 */ 3541 if (whence == SEEK_HOLE && 3542 !(inode->flags & BTRFS_INODE_PREALLOC) && 3543 inode_get_bytes(&inode->vfs_inode) == i_size) 3544 return i_size; 3545 3546 spin_lock(&inode->lock); 3547 private = file->private_data; 3548 spin_unlock(&inode->lock); 3549 3550 if (private && private->owner_task != current) { 3551 /* 3552 * Not allocated by us, don't use it as its cached state is used 3553 * by the task that allocated it and we don't want neither to 3554 * mess with it nor get incorrect results because it reflects an 3555 * invalid state for the current task. 3556 */ 3557 private = NULL; 3558 } else if (!private) { 3559 private = kzalloc(sizeof(*private), GFP_KERNEL); 3560 /* 3561 * No worries if memory allocation failed. 3562 * The private structure is used only for speeding up multiple 3563 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc, 3564 * so everything will still be correct. 

static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
{
	struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
	struct btrfs_file_private *private;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_state *cached_state = NULL;
	struct extent_state **delalloc_cached_state;
	const loff_t i_size = i_size_read(&inode->vfs_inode);
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	u64 last_extent_end;
	u64 lockstart;
	u64 lockend;
	u64 start;
	int ret;
	bool found = false;

	if (i_size == 0 || offset >= i_size)
		return -ENXIO;

	/*
	 * Quick path. If the inode has no prealloc extents and its number of
	 * bytes used matches its i_size, then it can not have holes.
	 */
	if (whence == SEEK_HOLE &&
	    !(inode->flags & BTRFS_INODE_PREALLOC) &&
	    inode_get_bytes(&inode->vfs_inode) == i_size)
		return i_size;

	spin_lock(&inode->lock);
	private = file->private_data;
	spin_unlock(&inode->lock);

	if (private && private->owner_task != current) {
		/*
		 * Not allocated by us, don't use it as its cached state is used
		 * by the task that allocated it and we neither want to mess
		 * with it nor get incorrect results because it reflects an
		 * invalid state for the current task.
		 */
		private = NULL;
	} else if (!private) {
		private = kzalloc(sizeof(*private), GFP_KERNEL);
		/*
		 * No worries if memory allocation failed.
		 * The private structure is used only for speeding up multiple
		 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
		 * so everything will still be correct.
		 */
		if (private) {
			bool free = false;

			private->owner_task = current;

			spin_lock(&inode->lock);
			if (file->private_data)
				free = true;
			else
				file->private_data = private;
			spin_unlock(&inode->lock);

			if (free) {
				kfree(private);
				private = NULL;
			}
		}
	}

	if (private)
		delalloc_cached_state = &private->llseek_cached_state;
	else
		delalloc_cached_state = NULL;

	/*
	 * offset can be negative, in this case we start finding DATA/HOLE from
	 * the very start of the file.
	 */
	start = max_t(loff_t, 0, offset);

	lockstart = round_down(start, fs_info->sectorsize);
	lockend = round_up(i_size, fs_info->sectorsize);
	if (lockend <= lockstart)
		lockend = lockstart + fs_info->sectorsize;
	lockend--;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	last_extent_end = lockstart;

	btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret > 0 && path->slots[0] > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}

	while (start < i_size) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_file_extent_item *extent;
		u64 extent_end;
		u8 type;

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;

			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		extent_end = btrfs_file_extent_end(path);

		/*
		 * In the first iteration we may have a slot that points to an
		 * extent that ends before our start offset, so skip it.
		 */
		if (extent_end <= start) {
			path->slots[0]++;
			continue;
		}

		/* We have an implicit hole, NO_HOLES feature is likely set. */
		if (last_extent_end < key.offset) {
			u64 search_start = last_extent_end;
			u64 found_start;

			/*
			 * First iteration, @start matches @offset and it's
			 * within the hole.
			 */
			if (start == offset)
				search_start = offset;

			found = find_desired_extent_in_hole(inode, whence,
							    delalloc_cached_state,
							    search_start,
							    key.offset - 1,
							    &found_start);
			if (found) {
				start = found_start;
				break;
			}
			/*
			 * Didn't find data or a hole (due to delalloc) in the
			 * implicit hole range, so need to analyze the extent.
			 */
		}

		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(leaf, extent);

		/*
		 * Can't access the extent's disk_bytenr field if this is an
		 * inline extent, since at that offset, it's where the extent
		 * data starts.
		 */
		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
		    (type == BTRFS_FILE_EXTENT_REG &&
		     btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
			/*
			 * Explicit hole or prealloc extent, search for delalloc.
			 * A prealloc extent is treated like a hole.
			 */
			u64 search_start = key.offset;
			u64 found_start;

			/*
			 * First iteration, @start matches @offset and it's
			 * within the hole.
			 */
			if (start == offset)
				search_start = offset;

			found = find_desired_extent_in_hole(inode, whence,
							    delalloc_cached_state,
							    search_start,
							    extent_end - 1,
							    &found_start);
			if (found) {
				start = found_start;
				break;
			}
			/*
			 * Didn't find data or a hole (due to delalloc) in the
			 * whole range, so we need to analyze the next extent
			 * item.
			 */
		} else {
			/*
			 * Found a regular or inline extent.
			 * If we are seeking for data, adjust the start offset
			 * and stop, we're done.
			 */
			if (whence == SEEK_DATA) {
				start = max_t(u64, key.offset, offset);
				found = true;
				break;
			}
			/*
			 * Else, we are seeking for a hole, check the next file
			 * extent item.
			 */
		}

		start = extent_end;
		last_extent_end = extent_end;
		path->slots[0]++;
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		cond_resched();
	}

	/* We have an implicit hole from the last extent found up to i_size. */
	if (!found && start < i_size) {
		found = find_desired_extent_in_hole(inode, whence,
						    delalloc_cached_state, start,
						    i_size - 1, &start);
		if (!found)
			start = i_size;
	}

out:
	btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
	btrfs_free_path(path);

	if (ret < 0)
		return ret;

	if (whence == SEEK_DATA && start >= i_size)
		return -ENXIO;

	return min_t(loff_t, start, i_size);
}
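
/*
 * Worked example (illustrative, not normative): consider an inode with
 * i_size = 1M, a single regular extent item covering [0, 256K) and no
 * delalloc, on a filesystem with the NO_HOLES feature (so there is no
 * explicit hole item after 256K).
 *
 *   - lseek(fd, 0, SEEK_HOLE): the loop walks over the regular extent, which
 *     does not satisfy SEEK_HOLE, and advances @start to 256K.  There are no
 *     more extent items, so the trailing implicit hole [256K, 1M - 1] is
 *     checked and, with no delalloc there, 256K is returned.
 *
 *   - lseek(fd, 300K, SEEK_DATA): the only extent ends before the start
 *     offset and is skipped, the trailing implicit hole has no delalloc, so
 *     @start ends up at i_size and -ENXIO is returned.
 */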

static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;

	switch (whence) {
	default:
		return generic_file_llseek(file, offset, whence);
	case SEEK_DATA:
	case SEEK_HOLE:
		btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
		offset = find_desired_extent(file, offset, whence);
		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
		break;
	}

	if (offset < 0)
		return offset;

	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

static int btrfs_file_open(struct inode *inode, struct file *filp)
{
	int ret;

	filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;

	ret = fsverity_file_open(inode, filp);
	if (ret)
		return ret;
	return generic_file_open(inode, filp);
}

static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret = 0;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = btrfs_direct_read(iocb, to);
		if (ret < 0 || !iov_iter_count(to) ||
		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
			return ret;
	}

	return filemap_read(iocb, to, ret);
}
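
/*
 * Note on btrfs_file_read_iter() above (illustrative remark): when a direct
 * read stops early, for example because it cannot service the whole range
 * directly, the remainder is read with filemap_read().  The bytes already
 * copied by the direct path are passed as the third argument, so the returned
 * count covers both parts.
 */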

const struct file_operations btrfs_file_operations = {
	.llseek		= btrfs_file_llseek,
	.read_iter	= btrfs_file_read_iter,
	.splice_read	= filemap_splice_read,
	.write_iter	= btrfs_file_write_iter,
	.splice_write	= iter_file_splice_write,
	.mmap_prepare	= btrfs_file_mmap_prepare,
	.open		= btrfs_file_open,
	.release	= btrfs_release_file,
	.get_unmapped_area = thp_get_unmapped_area,
	.fsync		= btrfs_sync_file,
	.fallocate	= btrfs_fallocate,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.remap_file_range = btrfs_remap_file_range,
	.uring_cmd	= btrfs_uring_cmd,
	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};

int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	int ret;

	/*
	 * So with compression we will find and lock a dirty page and clear the
	 * first one as dirty, setup an async extent, and immediately return
	 * with the entire range locked but with nobody actually marked with
	 * writeback.  So we can't just filemap_write_and_wait_range() and
	 * expect it to work since it will just kick off a thread to do the
	 * actual work.  So we need to call filemap_fdatawrite_range _again_
	 * since it will wait on the page lock, which won't be unlocked until
	 * after the pages have been marked as writeback and so we're good to go
	 * from there.  We have to do this otherwise we'll miss the ordered
	 * extents and that results in badness.  Please Josef, do not think you
	 * know better and pull this out at some point in the future, it is
	 * right and you are wrong.
	 */
	ret = filemap_fdatawrite_range(mapping, start, end);
	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
		ret = filemap_fdatawrite_range(mapping, start, end);

	return ret;
}
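
/*
 * Illustrative usage (one possible pairing, not taken from this file): a
 * caller that needs the dirty pages in the range submitted and completed
 * typically pairs btrfs_fdatawrite_range() with a separate wait, roughly:
 *
 *	ret = btrfs_fdatawrite_range(inode, start, end);
 *	if (!ret)
 *		ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping,
 *					      start, end);
 *
 * The second filemap_fdatawrite_range() call above matters so that such a
 * later wait sees pages that were actually marked for writeback when async
 * (compressed) extents are involved.
 */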