// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "bio.h"
#include "compression.h"
#include "locking.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "defrag.h"
#include "dir-item.h"
#include "file-item.h"
#include "uuid-tree.h"
#include "ioctl.h"
#include "file.h"
#include "acl.h"
#include "relocation.h"
#include "verity.h"
#include "super.h"
#include "orphan.h"
#include "backref.h"
#include "raid-stripe-tree.h"
#include "fiemap.h"

struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};

struct btrfs_rename_ctx {
	/* Output field. Stores the index number of the old directory entry. */
	u64 index;
};

/*
 * Used by data_reloc_print_warning_inode() to pass needed info for filename
 * resolution and output of error message.
 */
struct data_reloc_warn {
	struct btrfs_path path;
	struct btrfs_fs_info *fs_info;
	u64 extent_item_size;
	u64 logical;
	int mirror_num;
};

/*
 * For the file_extent_tree, we want to hold the inode lock when we lookup and
 * update the disk_i_size, but lockdep will complain because in our io_tree we
 * hold the tree lock and then get the inode lock when setting delalloc. These
 * two things are unrelated, so make a class for the file_extent_tree so we
 * don't get the two locking patterns mixed up.
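 *
 * A minimal sketch of how this class is expected to be attached when the
 * inode's file extent io_tree is initialized (the init call site is assumed,
 * not shown in this excerpt; @tree is that io_tree):
 *
 *	lockdep_set_class(&tree->lock, &file_extent_tree_class);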
103 */ 104 static struct lock_class_key file_extent_tree_class; 105 106 static const struct inode_operations btrfs_dir_inode_operations; 107 static const struct inode_operations btrfs_symlink_inode_operations; 108 static const struct inode_operations btrfs_special_inode_operations; 109 static const struct inode_operations btrfs_file_inode_operations; 110 static const struct address_space_operations btrfs_aops; 111 static const struct file_operations btrfs_dir_file_operations; 112 113 static struct kmem_cache *btrfs_inode_cachep; 114 115 static int btrfs_setsize(struct inode *inode, struct iattr *attr); 116 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); 117 118 static noinline int run_delalloc_cow(struct btrfs_inode *inode, 119 struct page *locked_page, u64 start, 120 u64 end, struct writeback_control *wbc, 121 bool pages_dirty); 122 123 static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, 124 u64 root, void *warn_ctx) 125 { 126 struct data_reloc_warn *warn = warn_ctx; 127 struct btrfs_fs_info *fs_info = warn->fs_info; 128 struct extent_buffer *eb; 129 struct btrfs_inode_item *inode_item; 130 struct inode_fs_paths *ipath = NULL; 131 struct btrfs_root *local_root; 132 struct btrfs_key key; 133 unsigned int nofs_flag; 134 u32 nlink; 135 int ret; 136 137 local_root = btrfs_get_fs_root(fs_info, root, true); 138 if (IS_ERR(local_root)) { 139 ret = PTR_ERR(local_root); 140 goto err; 141 } 142 143 /* This makes the path point to (inum INODE_ITEM ioff). */ 144 key.objectid = inum; 145 key.type = BTRFS_INODE_ITEM_KEY; 146 key.offset = 0; 147 148 ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0); 149 if (ret) { 150 btrfs_put_root(local_root); 151 btrfs_release_path(&warn->path); 152 goto err; 153 } 154 155 eb = warn->path.nodes[0]; 156 inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item); 157 nlink = btrfs_inode_nlink(eb, inode_item); 158 btrfs_release_path(&warn->path); 159 160 nofs_flag = memalloc_nofs_save(); 161 ipath = init_ipath(4096, local_root, &warn->path); 162 memalloc_nofs_restore(nofs_flag); 163 if (IS_ERR(ipath)) { 164 btrfs_put_root(local_root); 165 ret = PTR_ERR(ipath); 166 ipath = NULL; 167 /* 168 * -ENOMEM, not a critical error, just output an generic error 169 * without filename. 170 */ 171 btrfs_warn(fs_info, 172 "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu", 173 warn->logical, warn->mirror_num, root, inum, offset); 174 return ret; 175 } 176 ret = paths_from_inode(inum, ipath); 177 if (ret < 0) 178 goto err; 179 180 /* 181 * We deliberately ignore the bit ipath might have been too small to 182 * hold all of the paths here 183 */ 184 for (int i = 0; i < ipath->fspath->elem_cnt; i++) { 185 btrfs_warn(fs_info, 186 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)", 187 warn->logical, warn->mirror_num, root, inum, offset, 188 fs_info->sectorsize, nlink, 189 (char *)(unsigned long)ipath->fspath->val[i]); 190 } 191 192 btrfs_put_root(local_root); 193 free_ipath(ipath); 194 return 0; 195 196 err: 197 btrfs_warn(fs_info, 198 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d", 199 warn->logical, warn->mirror_num, root, inum, offset, ret); 200 201 free_ipath(ipath); 202 return ret; 203 } 204 205 /* 206 * Do extra user-friendly error output (e.g. lookup all the affected files). 207 * 208 * Return true if we succeeded doing the backref lookup. 
209 * Return false if such lookup failed, and has to fallback to the old error message. 210 */ 211 static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off, 212 const u8 *csum, const u8 *csum_expected, 213 int mirror_num) 214 { 215 struct btrfs_fs_info *fs_info = inode->root->fs_info; 216 struct btrfs_path path = { 0 }; 217 struct btrfs_key found_key = { 0 }; 218 struct extent_buffer *eb; 219 struct btrfs_extent_item *ei; 220 const u32 csum_size = fs_info->csum_size; 221 u64 logical; 222 u64 flags; 223 u32 item_size; 224 int ret; 225 226 mutex_lock(&fs_info->reloc_mutex); 227 logical = btrfs_get_reloc_bg_bytenr(fs_info); 228 mutex_unlock(&fs_info->reloc_mutex); 229 230 if (logical == U64_MAX) { 231 btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); 232 btrfs_warn_rl(fs_info, 233 "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", 234 btrfs_root_id(inode->root), btrfs_ino(inode), file_off, 235 CSUM_FMT_VALUE(csum_size, csum), 236 CSUM_FMT_VALUE(csum_size, csum_expected), 237 mirror_num); 238 return; 239 } 240 241 logical += file_off; 242 btrfs_warn_rl(fs_info, 243 "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", 244 btrfs_root_id(inode->root), 245 btrfs_ino(inode), file_off, logical, 246 CSUM_FMT_VALUE(csum_size, csum), 247 CSUM_FMT_VALUE(csum_size, csum_expected), 248 mirror_num); 249 250 ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); 251 if (ret < 0) { 252 btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d", 253 logical, ret); 254 return; 255 } 256 eb = path.nodes[0]; 257 ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item); 258 item_size = btrfs_item_size(eb, path.slots[0]); 259 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 260 unsigned long ptr = 0; 261 u64 ref_root; 262 u8 ref_level; 263 264 while (true) { 265 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, 266 item_size, &ref_root, 267 &ref_level); 268 if (ret < 0) { 269 btrfs_warn_rl(fs_info, 270 "failed to resolve tree backref for logical %llu: %d", 271 logical, ret); 272 break; 273 } 274 if (ret > 0) 275 break; 276 277 btrfs_warn_rl(fs_info, 278 "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu", 279 logical, mirror_num, 280 (ref_level ? "node" : "leaf"), 281 ref_level, ref_root); 282 } 283 btrfs_release_path(&path); 284 } else { 285 struct btrfs_backref_walk_ctx ctx = { 0 }; 286 struct data_reloc_warn reloc_warn = { 0 }; 287 288 btrfs_release_path(&path); 289 290 ctx.bytenr = found_key.objectid; 291 ctx.extent_item_pos = logical - found_key.objectid; 292 ctx.fs_info = fs_info; 293 294 reloc_warn.logical = logical; 295 reloc_warn.extent_item_size = found_key.offset; 296 reloc_warn.mirror_num = mirror_num; 297 reloc_warn.fs_info = fs_info; 298 299 iterate_extent_inodes(&ctx, true, 300 data_reloc_print_warning_inode, &reloc_warn); 301 } 302 } 303 304 static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, 305 u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) 306 { 307 struct btrfs_root *root = inode->root; 308 const u32 csum_size = root->fs_info->csum_size; 309 310 /* For data reloc tree, it's better to do a backref lookup instead. 
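 * The objectid/offset of the data reloc tree inode means little to the user;
 * walking the backrefs (see print_data_reloc_error() and
 * data_reloc_print_warning_inode() above) lets us report the paths of the
 * original files that own the corrupted extent.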
 */
	if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
		return print_data_reloc_error(inode, logical_start, csum,
					      csum_expected, mirror_num);

	/* Output without objectid, which is more meaningful */
	if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
		btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
			btrfs_root_id(root), btrfs_ino(inode),
			logical_start,
			CSUM_FMT_VALUE(csum_size, csum),
			CSUM_FMT_VALUE(csum_size, csum_expected),
			mirror_num);
	} else {
		btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
			btrfs_root_id(root), btrfs_ino(inode),
			logical_start,
			CSUM_FMT_VALUE(csum_size, csum),
			CSUM_FMT_VALUE(csum_size, csum_expected),
			mirror_num);
	}
}

/*
 * Lock inode i_rwsem based on arguments passed.
 *
 * ilock_flags can have the following bit set:
 *
 *  BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
 *  BTRFS_ILOCK_TRY - try to acquire the lock; if it fails on the first
 *		      attempt, return -EAGAIN
 *  BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
 */
int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
{
	if (ilock_flags & BTRFS_ILOCK_SHARED) {
		if (ilock_flags & BTRFS_ILOCK_TRY) {
			if (!inode_trylock_shared(&inode->vfs_inode))
				return -EAGAIN;
			else
				return 0;
		}
		inode_lock_shared(&inode->vfs_inode);
	} else {
		if (ilock_flags & BTRFS_ILOCK_TRY) {
			if (!inode_trylock(&inode->vfs_inode))
				return -EAGAIN;
			else
				return 0;
		}
		inode_lock(&inode->vfs_inode);
	}
	if (ilock_flags & BTRFS_ILOCK_MMAP)
		down_write(&inode->i_mmap_lock);
	return 0;
}

/*
 * Unlock inode i_rwsem.
 *
 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
 * to decide whether the lock acquired is shared or exclusive.
 */
void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
{
	if (ilock_flags & BTRFS_ILOCK_MMAP)
		up_write(&inode->i_mmap_lock);
	if (ilock_flags & BTRFS_ILOCK_SHARED)
		inode_unlock_shared(&inode->vfs_inode);
	else
		inode_unlock(&inode->vfs_inode);
}

/*
 * Cleanup all submitted ordered extents in specified range to handle errors
 * from the btrfs_run_delalloc_range() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()).
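 *
 * IOW on error the metadata reservation must stay attached to the range and
 * only be dropped via the ordered extent completion path reached through the
 * btrfs_mark_ordered_io_finished() calls below.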
394 */ 395 static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, 396 struct page *locked_page, 397 u64 offset, u64 bytes) 398 { 399 unsigned long index = offset >> PAGE_SHIFT; 400 unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; 401 u64 page_start = 0, page_end = 0; 402 struct page *page; 403 404 if (locked_page) { 405 page_start = page_offset(locked_page); 406 page_end = page_start + PAGE_SIZE - 1; 407 } 408 409 while (index <= end_index) { 410 /* 411 * For locked page, we will call btrfs_mark_ordered_io_finished 412 * through btrfs_mark_ordered_io_finished() on it 413 * in run_delalloc_range() for the error handling, which will 414 * clear page Ordered and run the ordered extent accounting. 415 * 416 * Here we can't just clear the Ordered bit, or 417 * btrfs_mark_ordered_io_finished() would skip the accounting 418 * for the page range, and the ordered extent will never finish. 419 */ 420 if (locked_page && index == (page_start >> PAGE_SHIFT)) { 421 index++; 422 continue; 423 } 424 page = find_get_page(inode->vfs_inode.i_mapping, index); 425 index++; 426 if (!page) 427 continue; 428 429 /* 430 * Here we just clear all Ordered bits for every page in the 431 * range, then btrfs_mark_ordered_io_finished() will handle 432 * the ordered extent accounting for the range. 433 */ 434 btrfs_folio_clamp_clear_ordered(inode->root->fs_info, 435 page_folio(page), offset, bytes); 436 put_page(page); 437 } 438 439 if (locked_page) { 440 /* The locked page covers the full range, nothing needs to be done */ 441 if (bytes + offset <= page_start + PAGE_SIZE) 442 return; 443 /* 444 * In case this page belongs to the delalloc range being 445 * instantiated then skip it, since the first page of a range is 446 * going to be properly cleaned up by the caller of 447 * run_delalloc_range 448 */ 449 if (page_start >= offset && page_end <= (offset + bytes - 1)) { 450 bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; 451 offset = page_offset(locked_page) + PAGE_SIZE; 452 } 453 } 454 455 return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); 456 } 457 458 static int btrfs_dirty_inode(struct btrfs_inode *inode); 459 460 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 461 struct btrfs_new_inode_args *args) 462 { 463 int err; 464 465 if (args->default_acl) { 466 err = __btrfs_set_acl(trans, args->inode, args->default_acl, 467 ACL_TYPE_DEFAULT); 468 if (err) 469 return err; 470 } 471 if (args->acl) { 472 err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); 473 if (err) 474 return err; 475 } 476 if (!args->default_acl && !args->acl) 477 cache_no_acl(args->inode); 478 return btrfs_xattr_security_init(trans, args->inode, args->dir, 479 &args->dentry->d_name); 480 } 481 482 /* 483 * this does all the hard work for inserting an inline extent into 484 * the btree. 
The caller should have done a btrfs_drop_extents so that 485 * no overlapping inline items exist in the btree 486 */ 487 static int insert_inline_extent(struct btrfs_trans_handle *trans, 488 struct btrfs_path *path, 489 struct btrfs_inode *inode, bool extent_inserted, 490 size_t size, size_t compressed_size, 491 int compress_type, 492 struct folio *compressed_folio, 493 bool update_i_size) 494 { 495 struct btrfs_root *root = inode->root; 496 struct extent_buffer *leaf; 497 struct page *page = NULL; 498 const u32 sectorsize = trans->fs_info->sectorsize; 499 char *kaddr; 500 unsigned long ptr; 501 struct btrfs_file_extent_item *ei; 502 int ret; 503 size_t cur_size = size; 504 u64 i_size; 505 506 /* 507 * The decompressed size must still be no larger than a sector. Under 508 * heavy race, we can have size == 0 passed in, but that shouldn't be a 509 * big deal and we can continue the insertion. 510 */ 511 ASSERT(size <= sectorsize); 512 513 /* 514 * The compressed size also needs to be no larger than a sector. 515 * That's also why we only need one page as the parameter. 516 */ 517 if (compressed_folio) 518 ASSERT(compressed_size <= sectorsize); 519 else 520 ASSERT(compressed_size == 0); 521 522 if (compressed_size && compressed_folio) 523 cur_size = compressed_size; 524 525 if (!extent_inserted) { 526 struct btrfs_key key; 527 size_t datasize; 528 529 key.objectid = btrfs_ino(inode); 530 key.offset = 0; 531 key.type = BTRFS_EXTENT_DATA_KEY; 532 533 datasize = btrfs_file_extent_calc_inline_size(cur_size); 534 ret = btrfs_insert_empty_item(trans, root, path, &key, 535 datasize); 536 if (ret) 537 goto fail; 538 } 539 leaf = path->nodes[0]; 540 ei = btrfs_item_ptr(leaf, path->slots[0], 541 struct btrfs_file_extent_item); 542 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 543 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); 544 btrfs_set_file_extent_encryption(leaf, ei, 0); 545 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 546 btrfs_set_file_extent_ram_bytes(leaf, ei, size); 547 ptr = btrfs_file_extent_inline_start(ei); 548 549 if (compress_type != BTRFS_COMPRESS_NONE) { 550 kaddr = kmap_local_folio(compressed_folio, 0); 551 write_extent_buffer(leaf, kaddr, ptr, compressed_size); 552 kunmap_local(kaddr); 553 554 btrfs_set_file_extent_compression(leaf, ei, 555 compress_type); 556 } else { 557 page = find_get_page(inode->vfs_inode.i_mapping, 0); 558 btrfs_set_file_extent_compression(leaf, ei, 0); 559 kaddr = kmap_local_page(page); 560 write_extent_buffer(leaf, kaddr, ptr, size); 561 kunmap_local(kaddr); 562 put_page(page); 563 } 564 btrfs_mark_buffer_dirty(trans, leaf); 565 btrfs_release_path(path); 566 567 /* 568 * We align size to sectorsize for inline extents just for simplicity 569 * sake. 570 */ 571 ret = btrfs_inode_set_file_extent_range(inode, 0, 572 ALIGN(size, root->fs_info->sectorsize)); 573 if (ret) 574 goto fail; 575 576 /* 577 * We're an inline extent, so nobody can extend the file past i_size 578 * without locking a page we already have locked. 579 * 580 * We must do any i_size and inode updates before we unlock the pages. 581 * Otherwise we could end up racing with unlink. 
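	 *
	 * I.e. the order below is: read i_size, extend it (still under the
	 * page locks) if @update_i_size asks for it, then mirror the result
	 * into inode->disk_i_size.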
582 */ 583 i_size = i_size_read(&inode->vfs_inode); 584 if (update_i_size && size > i_size) { 585 i_size_write(&inode->vfs_inode, size); 586 i_size = size; 587 } 588 inode->disk_i_size = i_size; 589 590 fail: 591 return ret; 592 } 593 594 static bool can_cow_file_range_inline(struct btrfs_inode *inode, 595 u64 offset, u64 size, 596 size_t compressed_size) 597 { 598 struct btrfs_fs_info *fs_info = inode->root->fs_info; 599 u64 data_len = (compressed_size ?: size); 600 601 /* Inline extents must start at offset 0. */ 602 if (offset != 0) 603 return false; 604 605 /* 606 * Due to the page size limit, for subpage we can only trigger the 607 * writeback for the dirty sectors of page, that means data writeback 608 * is doing more writeback than what we want. 609 * 610 * This is especially unexpected for some call sites like fallocate, 611 * where we only increase i_size after everything is done. 612 * This means we can trigger inline extent even if we didn't want to. 613 * So here we skip inline extent creation completely. 614 */ 615 if (fs_info->sectorsize != PAGE_SIZE) 616 return false; 617 618 /* Inline extents are limited to sectorsize. */ 619 if (size > fs_info->sectorsize) 620 return false; 621 622 /* We cannot exceed the maximum inline data size. */ 623 if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) 624 return false; 625 626 /* We cannot exceed the user specified max_inline size. */ 627 if (data_len > fs_info->max_inline) 628 return false; 629 630 /* Inline extents must be the entirety of the file. */ 631 if (size < i_size_read(&inode->vfs_inode)) 632 return false; 633 634 return true; 635 } 636 637 /* 638 * conditionally insert an inline extent into the file. This 639 * does the checks required to make sure the data is small enough 640 * to fit as an inline extent. 641 * 642 * If being used directly, you must have already checked we're allowed to cow 643 * the range by getting true from can_cow_file_range_inline(). 
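 *
 * For example, mirroring what cow_file_range_inline() below does (the real
 * caller also locks the extent range first):
 *
 *	if (can_cow_file_range_inline(inode, offset, size, compressed_size))
 *		ret = __cow_file_range_inline(inode, offset, size,
 *					      compressed_size, compress_type,
 *					      compressed_folio, update_i_size);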
644 */ 645 static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset, 646 u64 size, size_t compressed_size, 647 int compress_type, 648 struct folio *compressed_folio, 649 bool update_i_size) 650 { 651 struct btrfs_drop_extents_args drop_args = { 0 }; 652 struct btrfs_root *root = inode->root; 653 struct btrfs_fs_info *fs_info = root->fs_info; 654 struct btrfs_trans_handle *trans; 655 u64 data_len = (compressed_size ?: size); 656 int ret; 657 struct btrfs_path *path; 658 659 path = btrfs_alloc_path(); 660 if (!path) 661 return -ENOMEM; 662 663 trans = btrfs_join_transaction(root); 664 if (IS_ERR(trans)) { 665 btrfs_free_path(path); 666 return PTR_ERR(trans); 667 } 668 trans->block_rsv = &inode->block_rsv; 669 670 drop_args.path = path; 671 drop_args.start = 0; 672 drop_args.end = fs_info->sectorsize; 673 drop_args.drop_cache = true; 674 drop_args.replace_extent = true; 675 drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); 676 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 677 if (ret) { 678 btrfs_abort_transaction(trans, ret); 679 goto out; 680 } 681 682 ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, 683 size, compressed_size, compress_type, 684 compressed_folio, update_i_size); 685 if (ret && ret != -ENOSPC) { 686 btrfs_abort_transaction(trans, ret); 687 goto out; 688 } else if (ret == -ENOSPC) { 689 ret = 1; 690 goto out; 691 } 692 693 btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); 694 ret = btrfs_update_inode(trans, inode); 695 if (ret && ret != -ENOSPC) { 696 btrfs_abort_transaction(trans, ret); 697 goto out; 698 } else if (ret == -ENOSPC) { 699 ret = 1; 700 goto out; 701 } 702 703 btrfs_set_inode_full_sync(inode); 704 out: 705 /* 706 * Don't forget to free the reserved space, as for inlined extent 707 * it won't count as data extent, free them directly here. 708 * And at reserve time, it's always aligned to page size, so 709 * just free one page here. 
710 */ 711 btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); 712 btrfs_free_path(path); 713 btrfs_end_transaction(trans); 714 return ret; 715 } 716 717 static noinline int cow_file_range_inline(struct btrfs_inode *inode, 718 struct page *locked_page, 719 u64 offset, u64 end, 720 size_t compressed_size, 721 int compress_type, 722 struct folio *compressed_folio, 723 bool update_i_size) 724 { 725 struct extent_state *cached = NULL; 726 unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | 727 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED; 728 u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); 729 int ret; 730 731 if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) 732 return 1; 733 734 lock_extent(&inode->io_tree, offset, end, &cached); 735 ret = __cow_file_range_inline(inode, offset, size, compressed_size, 736 compress_type, compressed_folio, 737 update_i_size); 738 if (ret > 0) { 739 unlock_extent(&inode->io_tree, offset, end, &cached); 740 return ret; 741 } 742 743 if (ret == 0) 744 locked_page = NULL; 745 746 extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached, 747 clear_flags, 748 PAGE_UNLOCK | PAGE_START_WRITEBACK | 749 PAGE_END_WRITEBACK); 750 return ret; 751 } 752 753 struct async_extent { 754 u64 start; 755 u64 ram_size; 756 u64 compressed_size; 757 struct folio **folios; 758 unsigned long nr_folios; 759 int compress_type; 760 struct list_head list; 761 }; 762 763 struct async_chunk { 764 struct btrfs_inode *inode; 765 struct page *locked_page; 766 u64 start; 767 u64 end; 768 blk_opf_t write_flags; 769 struct list_head extents; 770 struct cgroup_subsys_state *blkcg_css; 771 struct btrfs_work work; 772 struct async_cow *async_cow; 773 }; 774 775 struct async_cow { 776 atomic_t num_chunks; 777 struct async_chunk chunks[]; 778 }; 779 780 static noinline int add_async_extent(struct async_chunk *cow, 781 u64 start, u64 ram_size, 782 u64 compressed_size, 783 struct folio **folios, 784 unsigned long nr_folios, 785 int compress_type) 786 { 787 struct async_extent *async_extent; 788 789 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); 790 if (!async_extent) 791 return -ENOMEM; 792 async_extent->start = start; 793 async_extent->ram_size = ram_size; 794 async_extent->compressed_size = compressed_size; 795 async_extent->folios = folios; 796 async_extent->nr_folios = nr_folios; 797 async_extent->compress_type = compress_type; 798 list_add_tail(&async_extent->list, &cow->extents); 799 return 0; 800 } 801 802 /* 803 * Check if the inode needs to be submitted to compression, based on mount 804 * options, defragmentation, properties or heuristics. 805 */ 806 static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, 807 u64 end) 808 { 809 struct btrfs_fs_info *fs_info = inode->root->fs_info; 810 811 if (!btrfs_inode_can_compress(inode)) { 812 WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), 813 KERN_ERR "BTRFS: unexpected compression for ino %llu\n", 814 btrfs_ino(inode)); 815 return 0; 816 } 817 /* 818 * Special check for subpage. 819 * 820 * We lock the full page then run each delalloc range in the page, thus 821 * for the following case, we will hit some subpage specific corner case: 822 * 823 * 0 32K 64K 824 * | |///////| |///////| 825 * \- A \- B 826 * 827 * In above case, both range A and range B will try to unlock the full 828 * page [0, 64K), causing the one finished later will have page 829 * unlocked already, triggering various page lock requirement BUG_ON()s. 
	 *
	 * So here we add an artificial limit that subpage compression can only
	 * be used if the range is fully page aligned.
	 *
	 * In theory we only need to ensure the first page is fully covered, but
	 * the trailing partial page will be locked until the full compression
	 * finishes, delaying the writeback of the other ranges.
	 *
	 * TODO: Make btrfs_run_delalloc_range() lock the whole delalloc range
	 * first, to prevent any submitted async extent from unlocking the full
	 * page. By this, we can ensure that for the subpage case only the last
	 * async_cow will unlock the full page.
	 */
	if (fs_info->sectorsize < PAGE_SIZE) {
		if (!PAGE_ALIGNED(start) ||
		    !PAGE_ALIGNED(end + 1))
			return 0;
	}

	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	/* defrag ioctl */
	if (inode->defrag_compress)
		return 1;
	/* bad compression ratios */
	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    inode->flags & BTRFS_INODE_COMPRESS ||
	    inode->prop_compress)
		return btrfs_compress_heuristic(inode, start, end);
	return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u32 small_write)
{
	/* If this is a small write inside eof, kick off a defrag */
	if (num_bytes < small_write &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode, small_write);
}

static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;
	int ret = 0;

	for (unsigned long index = start >> PAGE_SHIFT;
	     index <= end_index; index++) {
		page = find_get_page(inode->i_mapping, index);
		if (unlikely(!page)) {
			if (!ret)
				ret = -ENOENT;
			continue;
		}
		clear_page_dirty_for_io(page);
		put_page(page);
	}
	return ret;
}

/*
 * Work queue callback to start compression on a file and pages.
 *
 * This is done inside an ordered work queue, and the compression is spread
 * across many cpus. The actual IO submission is step two, and the ordered work
 * queue takes care of making sure that happens in the same order things were
 * put onto the queue by writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an entry onto the
 * work queue to write the uncompressed bytes. This makes sure that both
 * compressed inodes and uncompressed inodes are written in the same order that
 * the flusher thread sent them down.
 */
static void compress_file_range(struct btrfs_work *work)
{
	struct async_chunk *async_chunk =
		container_of(work, struct async_chunk, work);
	struct btrfs_inode *inode = async_chunk->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	u64 blocksize = fs_info->sectorsize;
	u64 start = async_chunk->start;
	u64 end = async_chunk->end;
	u64 actual_end;
	u64 i_size;
	int ret = 0;
	struct folio **folios;
	unsigned long nr_folios;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned int poff;
	int i;
	int compress_type = fs_info->compress_type;

	inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);

	/*
	 * We need to call clear_page_dirty_for_io on each page in the range.
932 * Otherwise applications with the file mmap'd can wander in and change 933 * the page contents while we are compressing them. 934 */ 935 ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); 936 937 /* 938 * All the folios should have been locked thus no failure. 939 * 940 * And even if some folios are missing, btrfs_compress_folios() 941 * would handle them correctly, so here just do an ASSERT() check for 942 * early logic errors. 943 */ 944 ASSERT(ret == 0); 945 946 /* 947 * We need to save i_size before now because it could change in between 948 * us evaluating the size and assigning it. This is because we lock and 949 * unlock the page in truncate and fallocate, and then modify the i_size 950 * later on. 951 * 952 * The barriers are to emulate READ_ONCE, remove that once i_size_read 953 * does that for us. 954 */ 955 barrier(); 956 i_size = i_size_read(&inode->vfs_inode); 957 barrier(); 958 actual_end = min_t(u64, i_size, end + 1); 959 again: 960 folios = NULL; 961 nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; 962 nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES); 963 964 /* 965 * we don't want to send crud past the end of i_size through 966 * compression, that's just a waste of CPU time. So, if the 967 * end of the file is before the start of our current 968 * requested range of bytes, we bail out to the uncompressed 969 * cleanup code that can deal with all of this. 970 * 971 * It isn't really the fastest way to fix things, but this is a 972 * very uncommon corner. 973 */ 974 if (actual_end <= start) 975 goto cleanup_and_bail_uncompressed; 976 977 total_compressed = actual_end - start; 978 979 /* 980 * Skip compression for a small file range(<=blocksize) that 981 * isn't an inline extent, since it doesn't save disk space at all. 982 */ 983 if (total_compressed <= blocksize && 984 (start > 0 || end + 1 < inode->disk_i_size)) 985 goto cleanup_and_bail_uncompressed; 986 987 /* 988 * For subpage case, we require full page alignment for the sector 989 * aligned range. 990 * Thus we must also check against @actual_end, not just @end. 991 */ 992 if (blocksize < PAGE_SIZE) { 993 if (!PAGE_ALIGNED(start) || 994 !PAGE_ALIGNED(round_up(actual_end, blocksize))) 995 goto cleanup_and_bail_uncompressed; 996 } 997 998 total_compressed = min_t(unsigned long, total_compressed, 999 BTRFS_MAX_UNCOMPRESSED); 1000 total_in = 0; 1001 ret = 0; 1002 1003 /* 1004 * We do compression for mount -o compress and when the inode has not 1005 * been flagged as NOCOMPRESS. This flag can change at any time if we 1006 * discover bad compression ratios. 1007 */ 1008 if (!inode_need_compress(inode, start, end)) 1009 goto cleanup_and_bail_uncompressed; 1010 1011 folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); 1012 if (!folios) { 1013 /* 1014 * Memory allocation failure is not a fatal error, we can fall 1015 * back to uncompressed code. 1016 */ 1017 goto cleanup_and_bail_uncompressed; 1018 } 1019 1020 if (inode->defrag_compress) 1021 compress_type = inode->defrag_compress; 1022 else if (inode->prop_compress) 1023 compress_type = inode->prop_compress; 1024 1025 /* Compression level is applied here. */ 1026 ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), 1027 mapping, start, folios, &nr_folios, &total_in, 1028 &total_compressed); 1029 if (ret) 1030 goto mark_incompressible; 1031 1032 /* 1033 * Zero the tail end of the last page, as we might be sending it down 1034 * to disk. 
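	 *
	 * For example, assuming 4K pages: total_compressed = 5000 gives
	 * poff = 904, so bytes [904, 4096) of the last folio are zeroed below.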
1035 */ 1036 poff = offset_in_page(total_compressed); 1037 if (poff) 1038 folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff); 1039 1040 /* 1041 * Try to create an inline extent. 1042 * 1043 * If we didn't compress the entire range, try to create an uncompressed 1044 * inline extent, else a compressed one. 1045 * 1046 * Check cow_file_range() for why we don't even try to create inline 1047 * extent for the subpage case. 1048 */ 1049 if (total_in < actual_end) 1050 ret = cow_file_range_inline(inode, NULL, start, end, 0, 1051 BTRFS_COMPRESS_NONE, NULL, false); 1052 else 1053 ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, 1054 compress_type, folios[0], false); 1055 if (ret <= 0) { 1056 if (ret < 0) 1057 mapping_set_error(mapping, -EIO); 1058 goto free_pages; 1059 } 1060 1061 /* 1062 * We aren't doing an inline extent. Round the compressed size up to a 1063 * block size boundary so the allocator does sane things. 1064 */ 1065 total_compressed = ALIGN(total_compressed, blocksize); 1066 1067 /* 1068 * One last check to make sure the compression is really a win, compare 1069 * the page count read with the blocks on disk, compression must free at 1070 * least one sector. 1071 */ 1072 total_in = round_up(total_in, fs_info->sectorsize); 1073 if (total_compressed + blocksize > total_in) 1074 goto mark_incompressible; 1075 1076 /* 1077 * The async work queues will take care of doing actual allocation on 1078 * disk for these compressed pages, and will submit the bios. 1079 */ 1080 ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios, 1081 nr_folios, compress_type); 1082 BUG_ON(ret); 1083 if (start + total_in < end) { 1084 start += total_in; 1085 cond_resched(); 1086 goto again; 1087 } 1088 return; 1089 1090 mark_incompressible: 1091 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) 1092 inode->flags |= BTRFS_INODE_NOCOMPRESS; 1093 cleanup_and_bail_uncompressed: 1094 ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, 1095 BTRFS_COMPRESS_NONE); 1096 BUG_ON(ret); 1097 free_pages: 1098 if (folios) { 1099 for (i = 0; i < nr_folios; i++) { 1100 WARN_ON(folios[i]->mapping); 1101 btrfs_free_compr_folio(folios[i]); 1102 } 1103 kfree(folios); 1104 } 1105 } 1106 1107 static void free_async_extent_pages(struct async_extent *async_extent) 1108 { 1109 int i; 1110 1111 if (!async_extent->folios) 1112 return; 1113 1114 for (i = 0; i < async_extent->nr_folios; i++) { 1115 WARN_ON(async_extent->folios[i]->mapping); 1116 btrfs_free_compr_folio(async_extent->folios[i]); 1117 } 1118 kfree(async_extent->folios); 1119 async_extent->nr_folios = 0; 1120 async_extent->folios = NULL; 1121 } 1122 1123 static void submit_uncompressed_range(struct btrfs_inode *inode, 1124 struct async_extent *async_extent, 1125 struct page *locked_page) 1126 { 1127 u64 start = async_extent->start; 1128 u64 end = async_extent->start + async_extent->ram_size - 1; 1129 int ret; 1130 struct writeback_control wbc = { 1131 .sync_mode = WB_SYNC_ALL, 1132 .range_start = start, 1133 .range_end = end, 1134 .no_cgroup_owner = 1, 1135 }; 1136 1137 wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); 1138 ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); 1139 wbc_detach_inode(&wbc); 1140 if (ret < 0) { 1141 btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); 1142 if (locked_page) { 1143 const u64 page_start = page_offset(locked_page); 1144 1145 set_page_writeback(locked_page); 1146 end_page_writeback(locked_page); 
1147 btrfs_mark_ordered_io_finished(inode, locked_page, 1148 page_start, PAGE_SIZE, 1149 !ret); 1150 mapping_set_error(locked_page->mapping, ret); 1151 unlock_page(locked_page); 1152 } 1153 } 1154 } 1155 1156 static void submit_one_async_extent(struct async_chunk *async_chunk, 1157 struct async_extent *async_extent, 1158 u64 *alloc_hint) 1159 { 1160 struct btrfs_inode *inode = async_chunk->inode; 1161 struct extent_io_tree *io_tree = &inode->io_tree; 1162 struct btrfs_root *root = inode->root; 1163 struct btrfs_fs_info *fs_info = root->fs_info; 1164 struct btrfs_ordered_extent *ordered; 1165 struct btrfs_file_extent file_extent; 1166 struct btrfs_key ins; 1167 struct page *locked_page = NULL; 1168 struct extent_state *cached = NULL; 1169 struct extent_map *em; 1170 int ret = 0; 1171 u64 start = async_extent->start; 1172 u64 end = async_extent->start + async_extent->ram_size - 1; 1173 1174 if (async_chunk->blkcg_css) 1175 kthread_associate_blkcg(async_chunk->blkcg_css); 1176 1177 /* 1178 * If async_chunk->locked_page is in the async_extent range, we need to 1179 * handle it. 1180 */ 1181 if (async_chunk->locked_page) { 1182 u64 locked_page_start = page_offset(async_chunk->locked_page); 1183 u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; 1184 1185 if (!(start >= locked_page_end || end <= locked_page_start)) 1186 locked_page = async_chunk->locked_page; 1187 } 1188 1189 if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { 1190 submit_uncompressed_range(inode, async_extent, locked_page); 1191 goto done; 1192 } 1193 1194 ret = btrfs_reserve_extent(root, async_extent->ram_size, 1195 async_extent->compressed_size, 1196 async_extent->compressed_size, 1197 0, *alloc_hint, &ins, 1, 1); 1198 if (ret) { 1199 /* 1200 * We can't reserve contiguous space for the compressed size. 1201 * Unlikely, but it's possible that we could have enough 1202 * non-contiguous space for the uncompressed size instead. So 1203 * fall back to uncompressed. 1204 */ 1205 submit_uncompressed_range(inode, async_extent, locked_page); 1206 goto done; 1207 } 1208 1209 lock_extent(io_tree, start, end, &cached); 1210 1211 /* Here we're doing allocation and writeback of the compressed pages */ 1212 file_extent.disk_bytenr = ins.objectid; 1213 file_extent.disk_num_bytes = ins.offset; 1214 file_extent.ram_bytes = async_extent->ram_size; 1215 file_extent.num_bytes = async_extent->ram_size; 1216 file_extent.offset = 0; 1217 file_extent.compression = async_extent->compress_type; 1218 1219 em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); 1220 if (IS_ERR(em)) { 1221 ret = PTR_ERR(em); 1222 goto out_free_reserve; 1223 } 1224 free_extent_map(em); 1225 1226 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, 1227 1 << BTRFS_ORDERED_COMPRESSED); 1228 if (IS_ERR(ordered)) { 1229 btrfs_drop_extent_map_range(inode, start, end, false); 1230 ret = PTR_ERR(ordered); 1231 goto out_free_reserve; 1232 } 1233 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1234 1235 /* Clear dirty, set writeback and unlock the pages. 
*/ 1236 extent_clear_unlock_delalloc(inode, start, end, 1237 NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, 1238 PAGE_UNLOCK | PAGE_START_WRITEBACK); 1239 btrfs_submit_compressed_write(ordered, 1240 async_extent->folios, /* compressed_folios */ 1241 async_extent->nr_folios, 1242 async_chunk->write_flags, true); 1243 *alloc_hint = ins.objectid + ins.offset; 1244 done: 1245 if (async_chunk->blkcg_css) 1246 kthread_associate_blkcg(NULL); 1247 kfree(async_extent); 1248 return; 1249 1250 out_free_reserve: 1251 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1252 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); 1253 mapping_set_error(inode->vfs_inode.i_mapping, -EIO); 1254 extent_clear_unlock_delalloc(inode, start, end, 1255 NULL, &cached, 1256 EXTENT_LOCKED | EXTENT_DELALLOC | 1257 EXTENT_DELALLOC_NEW | 1258 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, 1259 PAGE_UNLOCK | PAGE_START_WRITEBACK | 1260 PAGE_END_WRITEBACK); 1261 free_async_extent_pages(async_extent); 1262 if (async_chunk->blkcg_css) 1263 kthread_associate_blkcg(NULL); 1264 btrfs_debug(fs_info, 1265 "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", 1266 btrfs_root_id(root), btrfs_ino(inode), start, 1267 async_extent->ram_size, ret); 1268 kfree(async_extent); 1269 } 1270 1271 u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, 1272 u64 num_bytes) 1273 { 1274 struct extent_map_tree *em_tree = &inode->extent_tree; 1275 struct extent_map *em; 1276 u64 alloc_hint = 0; 1277 1278 read_lock(&em_tree->lock); 1279 em = search_extent_mapping(em_tree, start, num_bytes); 1280 if (em) { 1281 /* 1282 * if block start isn't an actual block number then find the 1283 * first block in this inode and use that as a hint. If that 1284 * block is also bogus then just don't worry about it. 1285 */ 1286 if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { 1287 free_extent_map(em); 1288 em = search_extent_mapping(em_tree, 0, 0); 1289 if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) 1290 alloc_hint = extent_map_block_start(em); 1291 if (em) 1292 free_extent_map(em); 1293 } else { 1294 alloc_hint = extent_map_block_start(em); 1295 free_extent_map(em); 1296 } 1297 } 1298 read_unlock(&em_tree->lock); 1299 1300 return alloc_hint; 1301 } 1302 1303 /* 1304 * when extent_io.c finds a delayed allocation range in the file, 1305 * the call backs end up in this code. The basic idea is to 1306 * allocate extents on disk for the range, and create ordered data structs 1307 * in ram to track those extents. 1308 * 1309 * locked_page is the page that writepage had locked already. We use 1310 * it to make sure we don't do extra locks or unlocks. 1311 * 1312 * When this function fails, it unlocks all pages except @locked_page. 1313 * 1314 * When this function successfully creates an inline extent, it returns 1 and 1315 * unlocks all pages including locked_page and starts I/O on them. 1316 * (In reality inline extents are limited to a single page, so locked_page is 1317 * the only page handled anyway). 1318 * 1319 * When this function succeed and creates a normal extent, the page locking 1320 * status depends on the passed in flags: 1321 * 1322 * - If @keep_locked is set, all pages are kept locked. 1323 * - Else all pages except for @locked_page are unlocked. 1324 * 1325 * When a failure happens in the second or later iteration of the 1326 * while-loop, the ordered extents created in previous iterations are kept 1327 * intact. 
So, the caller must clean them up by calling 1328 * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for 1329 * example. 1330 */ 1331 static noinline int cow_file_range(struct btrfs_inode *inode, 1332 struct page *locked_page, u64 start, u64 end, 1333 u64 *done_offset, 1334 bool keep_locked, bool no_inline) 1335 { 1336 struct btrfs_root *root = inode->root; 1337 struct btrfs_fs_info *fs_info = root->fs_info; 1338 struct extent_state *cached = NULL; 1339 u64 alloc_hint = 0; 1340 u64 orig_start = start; 1341 u64 num_bytes; 1342 unsigned long ram_size; 1343 u64 cur_alloc_size = 0; 1344 u64 min_alloc_size; 1345 u64 blocksize = fs_info->sectorsize; 1346 struct btrfs_key ins; 1347 struct extent_map *em; 1348 unsigned clear_bits; 1349 unsigned long page_ops; 1350 bool extent_reserved = false; 1351 int ret = 0; 1352 1353 if (btrfs_is_free_space_inode(inode)) { 1354 ret = -EINVAL; 1355 goto out_unlock; 1356 } 1357 1358 num_bytes = ALIGN(end - start + 1, blocksize); 1359 num_bytes = max(blocksize, num_bytes); 1360 ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); 1361 1362 inode_should_defrag(inode, start, end, num_bytes, SZ_64K); 1363 1364 if (!no_inline) { 1365 /* lets try to make an inline extent */ 1366 ret = cow_file_range_inline(inode, locked_page, start, end, 0, 1367 BTRFS_COMPRESS_NONE, NULL, false); 1368 if (ret <= 0) { 1369 /* 1370 * We succeeded, return 1 so the caller knows we're done 1371 * with this page and already handled the IO. 1372 * 1373 * If there was an error then cow_file_range_inline() has 1374 * already done the cleanup. 1375 */ 1376 if (ret == 0) 1377 ret = 1; 1378 goto done; 1379 } 1380 } 1381 1382 alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); 1383 1384 /* 1385 * Relocation relies on the relocated extents to have exactly the same 1386 * size as the original extents. Normally writeback for relocation data 1387 * extents follows a NOCOW path because relocation preallocates the 1388 * extents. However, due to an operation such as scrub turning a block 1389 * group to RO mode, it may fallback to COW mode, so we must make sure 1390 * an extent allocated during COW has exactly the requested size and can 1391 * not be split into smaller extents, otherwise relocation breaks and 1392 * fails during the stage where it updates the bytenr of file extent 1393 * items. 1394 */ 1395 if (btrfs_is_data_reloc_root(root)) 1396 min_alloc_size = num_bytes; 1397 else 1398 min_alloc_size = fs_info->sectorsize; 1399 1400 while (num_bytes > 0) { 1401 struct btrfs_ordered_extent *ordered; 1402 struct btrfs_file_extent file_extent; 1403 1404 cur_alloc_size = num_bytes; 1405 ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, 1406 min_alloc_size, 0, alloc_hint, 1407 &ins, 1, 1); 1408 if (ret == -EAGAIN) { 1409 /* 1410 * btrfs_reserve_extent only returns -EAGAIN for zoned 1411 * file systems, which is an indication that there are 1412 * no active zones to allocate from at the moment. 1413 * 1414 * If this is the first loop iteration, wait for at 1415 * least one zone to finish before retrying the 1416 * allocation. Otherwise ask the caller to write out 1417 * the already allocated blocks before coming back to 1418 * us, or return -ENOSPC if it can't handle retries. 
1419 */ 1420 ASSERT(btrfs_is_zoned(fs_info)); 1421 if (start == orig_start) { 1422 wait_on_bit_io(&inode->root->fs_info->flags, 1423 BTRFS_FS_NEED_ZONE_FINISH, 1424 TASK_UNINTERRUPTIBLE); 1425 continue; 1426 } 1427 if (done_offset) { 1428 *done_offset = start - 1; 1429 return 0; 1430 } 1431 ret = -ENOSPC; 1432 } 1433 if (ret < 0) 1434 goto out_unlock; 1435 cur_alloc_size = ins.offset; 1436 extent_reserved = true; 1437 1438 ram_size = ins.offset; 1439 file_extent.disk_bytenr = ins.objectid; 1440 file_extent.disk_num_bytes = ins.offset; 1441 file_extent.num_bytes = ins.offset; 1442 file_extent.ram_bytes = ins.offset; 1443 file_extent.offset = 0; 1444 file_extent.compression = BTRFS_COMPRESS_NONE; 1445 1446 lock_extent(&inode->io_tree, start, start + ram_size - 1, 1447 &cached); 1448 1449 em = btrfs_create_io_em(inode, start, &file_extent, 1450 BTRFS_ORDERED_REGULAR); 1451 if (IS_ERR(em)) { 1452 unlock_extent(&inode->io_tree, start, 1453 start + ram_size - 1, &cached); 1454 ret = PTR_ERR(em); 1455 goto out_reserve; 1456 } 1457 free_extent_map(em); 1458 1459 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, 1460 1 << BTRFS_ORDERED_REGULAR); 1461 if (IS_ERR(ordered)) { 1462 unlock_extent(&inode->io_tree, start, 1463 start + ram_size - 1, &cached); 1464 ret = PTR_ERR(ordered); 1465 goto out_drop_extent_cache; 1466 } 1467 1468 if (btrfs_is_data_reloc_root(root)) { 1469 ret = btrfs_reloc_clone_csums(ordered); 1470 1471 /* 1472 * Only drop cache here, and process as normal. 1473 * 1474 * We must not allow extent_clear_unlock_delalloc() 1475 * at out_unlock label to free meta of this ordered 1476 * extent, as its meta should be freed by 1477 * btrfs_finish_ordered_io(). 1478 * 1479 * So we must continue until @start is increased to 1480 * skip current ordered extent. 1481 */ 1482 if (ret) 1483 btrfs_drop_extent_map_range(inode, start, 1484 start + ram_size - 1, 1485 false); 1486 } 1487 btrfs_put_ordered_extent(ordered); 1488 1489 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1490 1491 /* 1492 * We're not doing compressed IO, don't unlock the first page 1493 * (which the caller expects to stay locked), don't clear any 1494 * dirty bits and don't set any writeback bits 1495 * 1496 * Do set the Ordered (Private2) bit so we know this page was 1497 * properly setup for writepage. 1498 */ 1499 page_ops = (keep_locked ? 0 : PAGE_UNLOCK); 1500 page_ops |= PAGE_SET_ORDERED; 1501 1502 extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, 1503 locked_page, &cached, 1504 EXTENT_LOCKED | EXTENT_DELALLOC, 1505 page_ops); 1506 if (num_bytes < cur_alloc_size) 1507 num_bytes = 0; 1508 else 1509 num_bytes -= cur_alloc_size; 1510 alloc_hint = ins.objectid + ins.offset; 1511 start += cur_alloc_size; 1512 extent_reserved = false; 1513 1514 /* 1515 * btrfs_reloc_clone_csums() error, since start is increased 1516 * extent_clear_unlock_delalloc() at out_unlock label won't 1517 * free metadata of current ordered extent, we're OK to exit. 
1518 */ 1519 if (ret) 1520 goto out_unlock; 1521 } 1522 done: 1523 if (done_offset) 1524 *done_offset = end; 1525 return ret; 1526 1527 out_drop_extent_cache: 1528 btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); 1529 out_reserve: 1530 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1531 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); 1532 out_unlock: 1533 /* 1534 * Now, we have three regions to clean up: 1535 * 1536 * |-------(1)----|---(2)---|-------------(3)----------| 1537 * `- orig_start `- start `- start + cur_alloc_size `- end 1538 * 1539 * We process each region below. 1540 */ 1541 1542 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | 1543 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; 1544 page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; 1545 1546 /* 1547 * For the range (1). We have already instantiated the ordered extents 1548 * for this region. They are cleaned up by 1549 * btrfs_cleanup_ordered_extents() in e.g, 1550 * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are 1551 * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | 1552 * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup 1553 * function. 1554 * 1555 * However, in case of @keep_locked, we still need to unlock the pages 1556 * (except @locked_page) to ensure all the pages are unlocked. 1557 */ 1558 if (keep_locked && orig_start < start) { 1559 if (!locked_page) 1560 mapping_set_error(inode->vfs_inode.i_mapping, ret); 1561 extent_clear_unlock_delalloc(inode, orig_start, start - 1, 1562 locked_page, NULL, 0, page_ops); 1563 } 1564 1565 /* 1566 * At this point we're unlocked, we want to make sure we're only 1567 * clearing these flags under the extent lock, so lock the rest of the 1568 * range and clear everything up. 1569 */ 1570 lock_extent(&inode->io_tree, start, end, NULL); 1571 1572 /* 1573 * For the range (2). If we reserved an extent for our delalloc range 1574 * (or a subrange) and failed to create the respective ordered extent, 1575 * then it means that when we reserved the extent we decremented the 1576 * extent's size from the data space_info's bytes_may_use counter and 1577 * incremented the space_info's bytes_reserved counter by the same 1578 * amount. We must make sure extent_clear_unlock_delalloc() does not try 1579 * to decrement again the data space_info's bytes_may_use counter, 1580 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. 1581 */ 1582 if (extent_reserved) { 1583 extent_clear_unlock_delalloc(inode, start, 1584 start + cur_alloc_size - 1, 1585 locked_page, &cached, 1586 clear_bits, 1587 page_ops); 1588 btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); 1589 start += cur_alloc_size; 1590 } 1591 1592 /* 1593 * For the range (3). We never touched the region. In addition to the 1594 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data 1595 * space_info's bytes_may_use counter, reserved in 1596 * btrfs_check_data_free_space(). 1597 */ 1598 if (start < end) { 1599 clear_bits |= EXTENT_CLEAR_DATA_RESV; 1600 extent_clear_unlock_delalloc(inode, start, end, locked_page, 1601 &cached, clear_bits, page_ops); 1602 btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); 1603 } 1604 return ret; 1605 } 1606 1607 /* 1608 * Phase two of compressed writeback. This is the ordered portion of the code, 1609 * which only gets called in the order the work was queued. 
We walk all the 1610 * async extents created by compress_file_range and send them down to the disk. 1611 * 1612 * If called with @do_free == true then it'll try to finish the work and free 1613 * the work struct eventually. 1614 */ 1615 static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free) 1616 { 1617 struct async_chunk *async_chunk = container_of(work, struct async_chunk, 1618 work); 1619 struct btrfs_fs_info *fs_info = btrfs_work_owner(work); 1620 struct async_extent *async_extent; 1621 unsigned long nr_pages; 1622 u64 alloc_hint = 0; 1623 1624 if (do_free) { 1625 struct async_cow *async_cow; 1626 1627 btrfs_add_delayed_iput(async_chunk->inode); 1628 if (async_chunk->blkcg_css) 1629 css_put(async_chunk->blkcg_css); 1630 1631 async_cow = async_chunk->async_cow; 1632 if (atomic_dec_and_test(&async_cow->num_chunks)) 1633 kvfree(async_cow); 1634 return; 1635 } 1636 1637 nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> 1638 PAGE_SHIFT; 1639 1640 while (!list_empty(&async_chunk->extents)) { 1641 async_extent = list_entry(async_chunk->extents.next, 1642 struct async_extent, list); 1643 list_del(&async_extent->list); 1644 submit_one_async_extent(async_chunk, async_extent, &alloc_hint); 1645 } 1646 1647 /* atomic_sub_return implies a barrier */ 1648 if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < 1649 5 * SZ_1M) 1650 cond_wake_up_nomb(&fs_info->async_submit_wait); 1651 } 1652 1653 static bool run_delalloc_compressed(struct btrfs_inode *inode, 1654 struct page *locked_page, u64 start, 1655 u64 end, struct writeback_control *wbc) 1656 { 1657 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1658 struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); 1659 struct async_cow *ctx; 1660 struct async_chunk *async_chunk; 1661 unsigned long nr_pages; 1662 u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); 1663 int i; 1664 unsigned nofs_flag; 1665 const blk_opf_t write_flags = wbc_to_write_flags(wbc); 1666 1667 nofs_flag = memalloc_nofs_save(); 1668 ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL); 1669 memalloc_nofs_restore(nofs_flag); 1670 if (!ctx) 1671 return false; 1672 1673 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); 1674 1675 async_chunk = ctx->chunks; 1676 atomic_set(&ctx->num_chunks, num_chunks); 1677 1678 for (i = 0; i < num_chunks; i++) { 1679 u64 cur_end = min(end, start + SZ_512K - 1); 1680 1681 /* 1682 * igrab is called higher up in the call chain, take only the 1683 * lightweight reference for the callback lifetime 1684 */ 1685 ihold(&inode->vfs_inode); 1686 async_chunk[i].async_cow = ctx; 1687 async_chunk[i].inode = inode; 1688 async_chunk[i].start = start; 1689 async_chunk[i].end = cur_end; 1690 async_chunk[i].write_flags = write_flags; 1691 INIT_LIST_HEAD(&async_chunk[i].extents); 1692 1693 /* 1694 * The locked_page comes all the way from writepage and its 1695 * the original page we were actually given. As we spread 1696 * this large delalloc region across multiple async_chunk 1697 * structs, only the first struct needs a pointer to locked_page 1698 * 1699 * This way we don't need racey decisions about who is supposed 1700 * to unlock it. 1701 */ 1702 if (locked_page) { 1703 /* 1704 * Depending on the compressibility, the pages might or 1705 * might not go through async. We want all of them to 1706 * be accounted against wbc once. Let's do it here 1707 * before the paths diverge. 
wbc accounting is used 1708 * only for foreign writeback detection and doesn't 1709 * need full accuracy. Just account the whole thing 1710 * against the first page. 1711 */ 1712 wbc_account_cgroup_owner(wbc, locked_page, 1713 cur_end - start); 1714 async_chunk[i].locked_page = locked_page; 1715 locked_page = NULL; 1716 } else { 1717 async_chunk[i].locked_page = NULL; 1718 } 1719 1720 if (blkcg_css != blkcg_root_css) { 1721 css_get(blkcg_css); 1722 async_chunk[i].blkcg_css = blkcg_css; 1723 async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT; 1724 } else { 1725 async_chunk[i].blkcg_css = NULL; 1726 } 1727 1728 btrfs_init_work(&async_chunk[i].work, compress_file_range, 1729 submit_compressed_extents); 1730 1731 nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); 1732 atomic_add(nr_pages, &fs_info->async_delalloc_pages); 1733 1734 btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); 1735 1736 start = cur_end + 1; 1737 } 1738 return true; 1739 } 1740 1741 /* 1742 * Run the delalloc range from start to end, and write back any dirty pages 1743 * covered by the range. 1744 */ 1745 static noinline int run_delalloc_cow(struct btrfs_inode *inode, 1746 struct page *locked_page, u64 start, 1747 u64 end, struct writeback_control *wbc, 1748 bool pages_dirty) 1749 { 1750 u64 done_offset = end; 1751 int ret; 1752 1753 while (start <= end) { 1754 ret = cow_file_range(inode, locked_page, start, end, &done_offset, 1755 true, false); 1756 if (ret) 1757 return ret; 1758 extent_write_locked_range(&inode->vfs_inode, locked_page, start, 1759 done_offset, wbc, pages_dirty); 1760 start = done_offset + 1; 1761 } 1762 1763 return 1; 1764 } 1765 1766 static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, 1767 const u64 start, const u64 end) 1768 { 1769 const bool is_space_ino = btrfs_is_free_space_inode(inode); 1770 const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); 1771 const u64 range_bytes = end + 1 - start; 1772 struct extent_io_tree *io_tree = &inode->io_tree; 1773 struct extent_state *cached_state = NULL; 1774 u64 range_start = start; 1775 u64 count; 1776 int ret; 1777 1778 /* 1779 * If EXTENT_NORESERVE is set it means that when the buffered write was 1780 * made we had not enough available data space and therefore we did not 1781 * reserve data space for it, since we though we could do NOCOW for the 1782 * respective file range (either there is prealloc extent or the inode 1783 * has the NOCOW bit set). 1784 * 1785 * However when we need to fallback to COW mode (because for example the 1786 * block group for the corresponding extent was turned to RO mode by a 1787 * scrub or relocation) we need to do the following: 1788 * 1789 * 1) We increment the bytes_may_use counter of the data space info. 1790 * If COW succeeds, it allocates a new data extent and after doing 1791 * that it decrements the space info's bytes_may_use counter and 1792 * increments its bytes_reserved counter by the same amount (we do 1793 * this at btrfs_add_reserved_bytes()). So we need to increment the 1794 * bytes_may_use counter to compensate (when space is reserved at 1795 * buffered write time, the bytes_may_use counter is incremented); 1796 * 1797 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so 1798 * that if the COW path fails for any reason, it decrements (through 1799 * extent_clear_unlock_delalloc()) the bytes_may_use counter of the 1800 * data space info, which we incremented in the step above. 
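	 *
	 * In code terms the compensation is roughly (mirroring the body
	 * below, with sinfo->lock held around the first call):
	 *
	 *	btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
	 *	clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, NULL);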
1801 * 1802 * If we need to fallback to cow and the inode corresponds to a free 1803 * space cache inode or an inode of the data relocation tree, we must 1804 * also increment bytes_may_use of the data space_info for the same 1805 * reason. Space caches and relocated data extents always get a prealloc 1806 * extent for them, however scrub or balance may have set the block 1807 * group that contains that extent to RO mode and therefore force COW 1808 * when starting writeback. 1809 */ 1810 lock_extent(io_tree, start, end, &cached_state); 1811 count = count_range_bits(io_tree, &range_start, end, range_bytes, 1812 EXTENT_NORESERVE, 0, NULL); 1813 if (count > 0 || is_space_ino || is_reloc_ino) { 1814 u64 bytes = count; 1815 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1816 struct btrfs_space_info *sinfo = fs_info->data_sinfo; 1817 1818 if (is_space_ino || is_reloc_ino) 1819 bytes = range_bytes; 1820 1821 spin_lock(&sinfo->lock); 1822 btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); 1823 spin_unlock(&sinfo->lock); 1824 1825 if (count > 0) 1826 clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, 1827 NULL); 1828 } 1829 unlock_extent(io_tree, start, end, &cached_state); 1830 1831 /* 1832 * Don't try to create inline extents, as a mix of inline extent that 1833 * is written out and unlocked directly and a normal NOCOW extent 1834 * doesn't work. 1835 */ 1836 ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); 1837 ASSERT(ret != 1); 1838 return ret; 1839 } 1840 1841 struct can_nocow_file_extent_args { 1842 /* Input fields. */ 1843 1844 /* Start file offset of the range we want to NOCOW. */ 1845 u64 start; 1846 /* End file offset (inclusive) of the range we want to NOCOW. */ 1847 u64 end; 1848 bool writeback_path; 1849 bool strict; 1850 /* 1851 * Free the path passed to can_nocow_file_extent() once it's not needed 1852 * anymore. 1853 */ 1854 bool free_path; 1855 1856 /* 1857 * Output fields. Only set when can_nocow_file_extent() returns 1. 1858 * The expected file extent for the NOCOW write. 1859 */ 1860 struct btrfs_file_extent file_extent; 1861 }; 1862 1863 /* 1864 * Check if we can NOCOW the file extent that the path points to. 1865 * This function may return with the path released, so the caller should check 1866 * if path->nodes[0] is NULL or not if it needs to use the path afterwards. 1867 * 1868 * Returns: < 0 on error 1869 * 0 if we can not NOCOW 1870 * 1 if we can NOCOW 1871 */ 1872 static int can_nocow_file_extent(struct btrfs_path *path, 1873 struct btrfs_key *key, 1874 struct btrfs_inode *inode, 1875 struct can_nocow_file_extent_args *args) 1876 { 1877 const bool is_freespace_inode = btrfs_is_free_space_inode(inode); 1878 struct extent_buffer *leaf = path->nodes[0]; 1879 struct btrfs_root *root = inode->root; 1880 struct btrfs_file_extent_item *fi; 1881 struct btrfs_root *csum_root; 1882 u64 io_start; 1883 u64 extent_end; 1884 u8 extent_type; 1885 int can_nocow = 0; 1886 int ret = 0; 1887 bool nowait = path->nowait; 1888 1889 fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 1890 extent_type = btrfs_file_extent_type(leaf, fi); 1891 1892 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 1893 goto out; 1894 1895 if (!(inode->flags & BTRFS_INODE_NODATACOW) && 1896 extent_type == BTRFS_FILE_EXTENT_REG) 1897 goto out; 1898 1899 /* 1900 * If the extent was created before the generation where the last snapshot 1901 * for its subvolume was created, then this implies the extent is shared, 1902 * hence we must COW. 
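 *
 * Note: when args->strict is set we skip this last-snapshot generation
 * shortcut and instead rely on the exhaustive backref check done by
 * btrfs_cross_ref_exist() further below.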
1903 */ 1904 if (!args->strict && 1905 btrfs_file_extent_generation(leaf, fi) <= 1906 btrfs_root_last_snapshot(&root->root_item)) 1907 goto out; 1908 1909 /* An explicit hole, must COW. */ 1910 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) 1911 goto out; 1912 1913 /* Compressed/encrypted/encoded extents must be COWed. */ 1914 if (btrfs_file_extent_compression(leaf, fi) || 1915 btrfs_file_extent_encryption(leaf, fi) || 1916 btrfs_file_extent_other_encoding(leaf, fi)) 1917 goto out; 1918 1919 extent_end = btrfs_file_extent_end(path); 1920 1921 args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1922 args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 1923 args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 1924 args->file_extent.offset = btrfs_file_extent_offset(leaf, fi); 1925 args->file_extent.compression = btrfs_file_extent_compression(leaf, fi); 1926 1927 /* 1928 * The following checks can be expensive, as they need to take other 1929 * locks and do btree or rbtree searches, so release the path to avoid 1930 * blocking other tasks for too long. 1931 */ 1932 btrfs_release_path(path); 1933 1934 ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), 1935 key->offset - args->file_extent.offset, 1936 args->file_extent.disk_bytenr, args->strict, path); 1937 WARN_ON_ONCE(ret > 0 && is_freespace_inode); 1938 if (ret != 0) 1939 goto out; 1940 1941 if (args->free_path) { 1942 /* 1943 * We don't need the path anymore, plus through the 1944 * btrfs_lookup_csums_list() call below we will end up allocating 1945 * another path. So free the path to avoid unnecessary extra 1946 * memory usage. 1947 */ 1948 btrfs_free_path(path); 1949 path = NULL; 1950 } 1951 1952 /* If there are pending snapshots for this root, we must COW. */ 1953 if (args->writeback_path && !is_freespace_inode && 1954 atomic_read(&root->snapshot_force_cow)) 1955 goto out; 1956 1957 args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start; 1958 args->file_extent.offset += args->start - key->offset; 1959 io_start = args->file_extent.disk_bytenr + args->file_extent.offset; 1960 1961 /* 1962 * Force COW if csums exist in the range. This ensures that csums for a 1963 * given extent are either valid or do not exist. 1964 */ 1965 1966 csum_root = btrfs_csum_root(root->fs_info, io_start); 1967 ret = btrfs_lookup_csums_list(csum_root, io_start, 1968 io_start + args->file_extent.num_bytes - 1, 1969 NULL, nowait); 1970 WARN_ON_ONCE(ret > 0 && is_freespace_inode); 1971 if (ret != 0) 1972 goto out; 1973 1974 can_nocow = 1; 1975 out: 1976 if (args->free_path && path) 1977 btrfs_free_path(path); 1978 1979 return ret < 0 ? ret : can_nocow; 1980 } 1981 1982 /* 1983 * Run NOCOW writeback for the range. This checks for snapshots or COW copies 1984 * of the extents that exist in the file, and COWs the file as required.
1985 * 1986 * If no cow copies or snapshots exist, we write directly to the existing 1987 * blocks on disk 1988 */ 1989 static noinline int run_delalloc_nocow(struct btrfs_inode *inode, 1990 struct page *locked_page, 1991 const u64 start, const u64 end) 1992 { 1993 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1994 struct btrfs_root *root = inode->root; 1995 struct btrfs_path *path; 1996 u64 cow_start = (u64)-1; 1997 u64 cur_offset = start; 1998 int ret; 1999 bool check_prev = true; 2000 u64 ino = btrfs_ino(inode); 2001 struct can_nocow_file_extent_args nocow_args = { 0 }; 2002 2003 /* 2004 * Normally on a zoned device we're only doing COW writes, but in case 2005 * of relocation on a zoned filesystem serializes I/O so that we're only 2006 * writing sequentially and can end up here as well. 2007 */ 2008 ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); 2009 2010 path = btrfs_alloc_path(); 2011 if (!path) { 2012 ret = -ENOMEM; 2013 goto error; 2014 } 2015 2016 nocow_args.end = end; 2017 nocow_args.writeback_path = true; 2018 2019 while (cur_offset <= end) { 2020 struct btrfs_block_group *nocow_bg = NULL; 2021 struct btrfs_ordered_extent *ordered; 2022 struct btrfs_key found_key; 2023 struct btrfs_file_extent_item *fi; 2024 struct extent_buffer *leaf; 2025 struct extent_state *cached_state = NULL; 2026 u64 extent_end; 2027 u64 nocow_end; 2028 int extent_type; 2029 bool is_prealloc; 2030 2031 ret = btrfs_lookup_file_extent(NULL, root, path, ino, 2032 cur_offset, 0); 2033 if (ret < 0) 2034 goto error; 2035 2036 /* 2037 * If there is no extent for our range when doing the initial 2038 * search, then go back to the previous slot as it will be the 2039 * one containing the search offset 2040 */ 2041 if (ret > 0 && path->slots[0] > 0 && check_prev) { 2042 leaf = path->nodes[0]; 2043 btrfs_item_key_to_cpu(leaf, &found_key, 2044 path->slots[0] - 1); 2045 if (found_key.objectid == ino && 2046 found_key.type == BTRFS_EXTENT_DATA_KEY) 2047 path->slots[0]--; 2048 } 2049 check_prev = false; 2050 next_slot: 2051 /* Go to next leaf if we have exhausted the current one */ 2052 leaf = path->nodes[0]; 2053 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2054 ret = btrfs_next_leaf(root, path); 2055 if (ret < 0) 2056 goto error; 2057 if (ret > 0) 2058 break; 2059 leaf = path->nodes[0]; 2060 } 2061 2062 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2063 2064 /* Didn't find anything for our INO */ 2065 if (found_key.objectid > ino) 2066 break; 2067 /* 2068 * Keep searching until we find an EXTENT_ITEM or there are no 2069 * more extents for this inode 2070 */ 2071 if (WARN_ON_ONCE(found_key.objectid < ino) || 2072 found_key.type < BTRFS_EXTENT_DATA_KEY) { 2073 path->slots[0]++; 2074 goto next_slot; 2075 } 2076 2077 /* Found key is not EXTENT_DATA_KEY or starts after req range */ 2078 if (found_key.type > BTRFS_EXTENT_DATA_KEY || 2079 found_key.offset > end) 2080 break; 2081 2082 /* 2083 * If the found extent starts after requested offset, then 2084 * adjust extent_end to be right before this extent begins 2085 */ 2086 if (found_key.offset > cur_offset) { 2087 extent_end = found_key.offset; 2088 extent_type = 0; 2089 goto must_cow; 2090 } 2091 2092 /* 2093 * Found extent which begins before our range and potentially 2094 * intersect it 2095 */ 2096 fi = btrfs_item_ptr(leaf, path->slots[0], 2097 struct btrfs_file_extent_item); 2098 extent_type = btrfs_file_extent_type(leaf, fi); 2099 /* If this is triggered then we have a memory corruption. 
*/ 2100 ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES); 2101 if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) { 2102 ret = -EUCLEAN; 2103 goto error; 2104 } 2105 extent_end = btrfs_file_extent_end(path); 2106 2107 /* 2108 * If the extent we got ends before our current offset, skip to 2109 * the next extent. 2110 */ 2111 if (extent_end <= cur_offset) { 2112 path->slots[0]++; 2113 goto next_slot; 2114 } 2115 2116 nocow_args.start = cur_offset; 2117 ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); 2118 if (ret < 0) 2119 goto error; 2120 if (ret == 0) 2121 goto must_cow; 2122 2123 ret = 0; 2124 nocow_bg = btrfs_inc_nocow_writers(fs_info, 2125 nocow_args.file_extent.disk_bytenr + 2126 nocow_args.file_extent.offset); 2127 if (!nocow_bg) { 2128 must_cow: 2129 /* 2130 * If we can't perform NOCOW writeback for the range, 2131 * then record the beginning of the range that needs to 2132 * be COWed. It will be written out before the next 2133 * NOCOW range if we find one, or when exiting this 2134 * loop. 2135 */ 2136 if (cow_start == (u64)-1) 2137 cow_start = cur_offset; 2138 cur_offset = extent_end; 2139 if (cur_offset > end) 2140 break; 2141 if (!path->nodes[0]) 2142 continue; 2143 path->slots[0]++; 2144 goto next_slot; 2145 } 2146 2147 /* 2148 * COW range from cow_start to found_key.offset - 1. As the key 2149 * will contain the beginning of the first extent that can be 2150 * NOCOW, following one which needs to be COW'ed 2151 */ 2152 if (cow_start != (u64)-1) { 2153 ret = fallback_to_cow(inode, locked_page, 2154 cow_start, found_key.offset - 1); 2155 cow_start = (u64)-1; 2156 if (ret) { 2157 btrfs_dec_nocow_writers(nocow_bg); 2158 goto error; 2159 } 2160 } 2161 2162 nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; 2163 lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); 2164 2165 is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; 2166 if (is_prealloc) { 2167 struct extent_map *em; 2168 2169 em = btrfs_create_io_em(inode, cur_offset, 2170 &nocow_args.file_extent, 2171 BTRFS_ORDERED_PREALLOC); 2172 if (IS_ERR(em)) { 2173 unlock_extent(&inode->io_tree, cur_offset, 2174 nocow_end, &cached_state); 2175 btrfs_dec_nocow_writers(nocow_bg); 2176 ret = PTR_ERR(em); 2177 goto error; 2178 } 2179 free_extent_map(em); 2180 } 2181 2182 ordered = btrfs_alloc_ordered_extent(inode, cur_offset, 2183 &nocow_args.file_extent, 2184 is_prealloc 2185 ? (1 << BTRFS_ORDERED_PREALLOC) 2186 : (1 << BTRFS_ORDERED_NOCOW)); 2187 btrfs_dec_nocow_writers(nocow_bg); 2188 if (IS_ERR(ordered)) { 2189 if (is_prealloc) { 2190 btrfs_drop_extent_map_range(inode, cur_offset, 2191 nocow_end, false); 2192 } 2193 unlock_extent(&inode->io_tree, cur_offset, 2194 nocow_end, &cached_state); 2195 ret = PTR_ERR(ordered); 2196 goto error; 2197 } 2198 2199 if (btrfs_is_data_reloc_root(root)) 2200 /* 2201 * Error handled later, as we must prevent 2202 * extent_clear_unlock_delalloc() in error handler 2203 * from freeing metadata of created ordered extent. 2204 */ 2205 ret = btrfs_reloc_clone_csums(ordered); 2206 btrfs_put_ordered_extent(ordered); 2207 2208 extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, 2209 locked_page, &cached_state, 2210 EXTENT_LOCKED | EXTENT_DELALLOC | 2211 EXTENT_CLEAR_DATA_RESV, 2212 PAGE_UNLOCK | PAGE_SET_ORDERED); 2213 2214 cur_offset = extent_end; 2215 2216 /* 2217 * btrfs_reloc_clone_csums() error, now we're OK to call error 2218 * handler, as metadata for created ordered extent will only 2219 * be freed by btrfs_finish_ordered_io(). 
2220 */ 2221 if (ret) 2222 goto error; 2223 } 2224 btrfs_release_path(path); 2225 2226 if (cur_offset <= end && cow_start == (u64)-1) 2227 cow_start = cur_offset; 2228 2229 if (cow_start != (u64)-1) { 2230 cur_offset = end; 2231 ret = fallback_to_cow(inode, locked_page, cow_start, end); 2232 cow_start = (u64)-1; 2233 if (ret) 2234 goto error; 2235 } 2236 2237 btrfs_free_path(path); 2238 return 0; 2239 2240 error: 2241 /* 2242 * If an error happened while a COW region is outstanding, cur_offset 2243 * needs to be reset to cow_start to ensure the COW region is unlocked 2244 * as well. 2245 */ 2246 if (cow_start != (u64)-1) 2247 cur_offset = cow_start; 2248 2249 /* 2250 * We need to lock the extent here because we're clearing DELALLOC and 2251 * we're not locked at this point. 2252 */ 2253 if (cur_offset < end) { 2254 struct extent_state *cached = NULL; 2255 2256 lock_extent(&inode->io_tree, cur_offset, end, &cached); 2257 extent_clear_unlock_delalloc(inode, cur_offset, end, 2258 locked_page, &cached, 2259 EXTENT_LOCKED | EXTENT_DELALLOC | 2260 EXTENT_DEFRAG | 2261 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | 2262 PAGE_START_WRITEBACK | 2263 PAGE_END_WRITEBACK); 2264 btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); 2265 } 2266 btrfs_free_path(path); 2267 return ret; 2268 } 2269 2270 static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) 2271 { 2272 if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { 2273 if (inode->defrag_bytes && 2274 test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) 2275 return false; 2276 return true; 2277 } 2278 return false; 2279 } 2280 2281 /* 2282 * Function to process delayed allocation (create CoW) for ranges which are 2283 * being touched for the first time. 2284 */ 2285 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, 2286 u64 start, u64 end, struct writeback_control *wbc) 2287 { 2288 const bool zoned = btrfs_is_zoned(inode->root->fs_info); 2289 int ret; 2290 2291 /* 2292 * The range must cover part of the @locked_page, or a return of 1 2293 * can confuse the caller. 2294 */ 2295 ASSERT(!(end <= page_offset(locked_page) || 2296 start >= page_offset(locked_page) + PAGE_SIZE)); 2297 2298 if (should_nocow(inode, start, end)) { 2299 ret = run_delalloc_nocow(inode, locked_page, start, end); 2300 goto out; 2301 } 2302 2303 if (btrfs_inode_can_compress(inode) && 2304 inode_need_compress(inode, start, end) && 2305 run_delalloc_compressed(inode, locked_page, start, end, wbc)) 2306 return 1; 2307 2308 if (zoned) 2309 ret = run_delalloc_cow(inode, locked_page, start, end, wbc, 2310 true); 2311 else 2312 ret = cow_file_range(inode, locked_page, start, end, NULL, 2313 false, false); 2314 2315 out: 2316 if (ret < 0) 2317 btrfs_cleanup_ordered_extents(inode, locked_page, start, 2318 end - start + 1); 2319 return ret; 2320 } 2321 2322 void btrfs_split_delalloc_extent(struct btrfs_inode *inode, 2323 struct extent_state *orig, u64 split) 2324 { 2325 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2326 u64 size; 2327 2328 lockdep_assert_held(&inode->io_tree.lock); 2329 2330 /* not delalloc, ignore it */ 2331 if (!(orig->state & EXTENT_DELALLOC)) 2332 return; 2333 2334 size = orig->end - orig->start + 1; 2335 if (size > fs_info->max_extent_size) { 2336 u32 num_extents; 2337 u64 new_size; 2338 2339 /* 2340 * See the explanation in btrfs_merge_delalloc_extent, the same 2341 * applies here, just in reverse. 
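 *
 * For example, with max_extent_size M: an extent of size M + 4K already
 * accounts for 2 outstanding extents, and splitting it into M and 4K still
 * needs only 2, so nothing changes. Splitting an extent that is not larger
 * than M always produces two pieces that each need their own outstanding
 * extent, which is why we add one below.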
2342 */ 2343 new_size = orig->end - split + 1; 2344 num_extents = count_max_extents(fs_info, new_size); 2345 new_size = split - orig->start; 2346 num_extents += count_max_extents(fs_info, new_size); 2347 if (count_max_extents(fs_info, size) >= num_extents) 2348 return; 2349 } 2350 2351 spin_lock(&inode->lock); 2352 btrfs_mod_outstanding_extents(inode, 1); 2353 spin_unlock(&inode->lock); 2354 } 2355 2356 /* 2357 * Handle merged delayed allocation extents so we can keep track of new extents 2358 * that are just merged onto old extents, such as when we are doing sequential 2359 * writes, so we can properly account for the metadata space we'll need. 2360 */ 2361 void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, 2362 struct extent_state *other) 2363 { 2364 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2365 u64 new_size, old_size; 2366 u32 num_extents; 2367 2368 lockdep_assert_held(&inode->io_tree.lock); 2369 2370 /* not delalloc, ignore it */ 2371 if (!(other->state & EXTENT_DELALLOC)) 2372 return; 2373 2374 if (new->start > other->start) 2375 new_size = new->end - other->start + 1; 2376 else 2377 new_size = other->end - new->start + 1; 2378 2379 /* we're not bigger than the max, unreserve the space and go */ 2380 if (new_size <= fs_info->max_extent_size) { 2381 spin_lock(&inode->lock); 2382 btrfs_mod_outstanding_extents(inode, -1); 2383 spin_unlock(&inode->lock); 2384 return; 2385 } 2386 2387 /* 2388 * We have to add up either side to figure out how many extents were 2389 * accounted for before we merged into one big extent. If the number of 2390 * extents we accounted for is <= the amount we need for the new range 2391 * then we can return, otherwise drop. Think of it like this 2392 * 2393 * [ 4k][MAX_SIZE] 2394 * 2395 * So we've grown the extent by a MAX_SIZE extent, this would mean we 2396 * need 2 outstanding extents, on one side we have 1 and the other side 2397 * we have 1 so they are == and we can return. But in this case 2398 * 2399 * [MAX_SIZE+4k][MAX_SIZE+4k] 2400 * 2401 * Each range on their own accounts for 2 extents, but merged together 2402 * they are only 3 extents worth of accounting, so we need to drop in 2403 * this case. 
2404 */ 2405 old_size = other->end - other->start + 1; 2406 num_extents = count_max_extents(fs_info, old_size); 2407 old_size = new->end - new->start + 1; 2408 num_extents += count_max_extents(fs_info, old_size); 2409 if (count_max_extents(fs_info, new_size) >= num_extents) 2410 return; 2411 2412 spin_lock(&inode->lock); 2413 btrfs_mod_outstanding_extents(inode, -1); 2414 spin_unlock(&inode->lock); 2415 } 2416 2417 static void btrfs_add_delalloc_inode(struct btrfs_inode *inode) 2418 { 2419 struct btrfs_root *root = inode->root; 2420 struct btrfs_fs_info *fs_info = root->fs_info; 2421 2422 spin_lock(&root->delalloc_lock); 2423 ASSERT(list_empty(&inode->delalloc_inodes)); 2424 list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); 2425 root->nr_delalloc_inodes++; 2426 if (root->nr_delalloc_inodes == 1) { 2427 spin_lock(&fs_info->delalloc_root_lock); 2428 ASSERT(list_empty(&root->delalloc_root)); 2429 list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots); 2430 spin_unlock(&fs_info->delalloc_root_lock); 2431 } 2432 spin_unlock(&root->delalloc_lock); 2433 } 2434 2435 void btrfs_del_delalloc_inode(struct btrfs_inode *inode) 2436 { 2437 struct btrfs_root *root = inode->root; 2438 struct btrfs_fs_info *fs_info = root->fs_info; 2439 2440 lockdep_assert_held(&root->delalloc_lock); 2441 2442 /* 2443 * We may be called after the inode was already deleted from the list, 2444 * namely in the transaction abort path btrfs_destroy_delalloc_inodes(), 2445 * and then later through btrfs_clear_delalloc_extent() while the inode 2446 * still has ->delalloc_bytes > 0. 2447 */ 2448 if (!list_empty(&inode->delalloc_inodes)) { 2449 list_del_init(&inode->delalloc_inodes); 2450 root->nr_delalloc_inodes--; 2451 if (!root->nr_delalloc_inodes) { 2452 ASSERT(list_empty(&root->delalloc_inodes)); 2453 spin_lock(&fs_info->delalloc_root_lock); 2454 ASSERT(!list_empty(&root->delalloc_root)); 2455 list_del_init(&root->delalloc_root); 2456 spin_unlock(&fs_info->delalloc_root_lock); 2457 } 2458 } 2459 } 2460 2461 /* 2462 * Properly track delayed allocation bytes in the inode and to maintain the 2463 * list of inodes that have pending delalloc work to be done. 
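 *
 * This is the set-bit hook of the inode's io_tree and runs with the io_tree
 * lock held. The delalloc accounting below only happens when EXTENT_DELALLOC
 * is set on a range that was not already marked as delalloc.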
2464 */ 2465 void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, 2466 u32 bits) 2467 { 2468 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2469 2470 lockdep_assert_held(&inode->io_tree.lock); 2471 2472 if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) 2473 WARN_ON(1); 2474 /* 2475 * set_bit and clear bit hooks normally require _irqsave/restore 2476 * but in this case, we are only testing for the DELALLOC 2477 * bit, which is only set or cleared with irqs on 2478 */ 2479 if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 2480 u64 len = state->end + 1 - state->start; 2481 u64 prev_delalloc_bytes; 2482 u32 num_extents = count_max_extents(fs_info, len); 2483 2484 spin_lock(&inode->lock); 2485 btrfs_mod_outstanding_extents(inode, num_extents); 2486 spin_unlock(&inode->lock); 2487 2488 /* For sanity tests */ 2489 if (btrfs_is_testing(fs_info)) 2490 return; 2491 2492 percpu_counter_add_batch(&fs_info->delalloc_bytes, len, 2493 fs_info->delalloc_batch); 2494 spin_lock(&inode->lock); 2495 prev_delalloc_bytes = inode->delalloc_bytes; 2496 inode->delalloc_bytes += len; 2497 if (bits & EXTENT_DEFRAG) 2498 inode->defrag_bytes += len; 2499 spin_unlock(&inode->lock); 2500 2501 /* 2502 * We don't need to be under the protection of the inode's lock, 2503 * because we are called while holding the inode's io_tree lock 2504 * and are therefore protected against concurrent calls of this 2505 * function and btrfs_clear_delalloc_extent(). 2506 */ 2507 if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0) 2508 btrfs_add_delalloc_inode(inode); 2509 } 2510 2511 if (!(state->state & EXTENT_DELALLOC_NEW) && 2512 (bits & EXTENT_DELALLOC_NEW)) { 2513 spin_lock(&inode->lock); 2514 inode->new_delalloc_bytes += state->end + 1 - state->start; 2515 spin_unlock(&inode->lock); 2516 } 2517 } 2518 2519 /* 2520 * Once a range is no longer delalloc this function ensures that proper 2521 * accounting happens. 2522 */ 2523 void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, 2524 struct extent_state *state, u32 bits) 2525 { 2526 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2527 u64 len = state->end + 1 - state->start; 2528 u32 num_extents = count_max_extents(fs_info, len); 2529 2530 lockdep_assert_held(&inode->io_tree.lock); 2531 2532 if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { 2533 spin_lock(&inode->lock); 2534 inode->defrag_bytes -= len; 2535 spin_unlock(&inode->lock); 2536 } 2537 2538 /* 2539 * set_bit and clear bit hooks normally require _irqsave/restore 2540 * but in this case, we are only testing for the DELALLOC 2541 * bit, which is only set or cleared with irqs on 2542 */ 2543 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 2544 struct btrfs_root *root = inode->root; 2545 u64 new_delalloc_bytes; 2546 2547 spin_lock(&inode->lock); 2548 btrfs_mod_outstanding_extents(inode, -num_extents); 2549 spin_unlock(&inode->lock); 2550 2551 /* 2552 * We don't reserve metadata space for space cache inodes so we 2553 * don't need to call delalloc_release_metadata if there is an 2554 * error. 2555 */ 2556 if (bits & EXTENT_CLEAR_META_RESV && 2557 root != fs_info->tree_root) 2558 btrfs_delalloc_release_metadata(inode, len, true); 2559 2560 /* For sanity tests. 
*/ 2561 if (btrfs_is_testing(fs_info)) 2562 return; 2563 2564 if (!btrfs_is_data_reloc_root(root) && 2565 !btrfs_is_free_space_inode(inode) && 2566 !(state->state & EXTENT_NORESERVE) && 2567 (bits & EXTENT_CLEAR_DATA_RESV)) 2568 btrfs_free_reserved_data_space_noquota(fs_info, len); 2569 2570 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, 2571 fs_info->delalloc_batch); 2572 spin_lock(&inode->lock); 2573 inode->delalloc_bytes -= len; 2574 new_delalloc_bytes = inode->delalloc_bytes; 2575 spin_unlock(&inode->lock); 2576 2577 /* 2578 * We don't need to be under the protection of the inode's lock, 2579 * because we are called while holding the inode's io_tree lock 2580 * and are therefore protected against concurrent calls of this 2581 * function and btrfs_set_delalloc_extent(). 2582 */ 2583 if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) { 2584 spin_lock(&root->delalloc_lock); 2585 btrfs_del_delalloc_inode(inode); 2586 spin_unlock(&root->delalloc_lock); 2587 } 2588 } 2589 2590 if ((state->state & EXTENT_DELALLOC_NEW) && 2591 (bits & EXTENT_DELALLOC_NEW)) { 2592 spin_lock(&inode->lock); 2593 ASSERT(inode->new_delalloc_bytes >= len); 2594 inode->new_delalloc_bytes -= len; 2595 if (bits & EXTENT_ADD_INODE_BYTES) 2596 inode_add_bytes(&inode->vfs_inode, len); 2597 spin_unlock(&inode->lock); 2598 } 2599 } 2600 2601 /* 2602 * given a list of ordered sums record them in the inode. This happens 2603 * at IO completion time based on sums calculated at bio submission time. 2604 */ 2605 static int add_pending_csums(struct btrfs_trans_handle *trans, 2606 struct list_head *list) 2607 { 2608 struct btrfs_ordered_sum *sum; 2609 struct btrfs_root *csum_root = NULL; 2610 int ret; 2611 2612 list_for_each_entry(sum, list, list) { 2613 trans->adding_csums = true; 2614 if (!csum_root) 2615 csum_root = btrfs_csum_root(trans->fs_info, 2616 sum->logical); 2617 ret = btrfs_csum_file_blocks(trans, csum_root, sum); 2618 trans->adding_csums = false; 2619 if (ret) 2620 return ret; 2621 } 2622 return 0; 2623 } 2624 2625 static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, 2626 const u64 start, 2627 const u64 len, 2628 struct extent_state **cached_state) 2629 { 2630 u64 search_start = start; 2631 const u64 end = start + len - 1; 2632 2633 while (search_start < end) { 2634 const u64 search_len = end - search_start + 1; 2635 struct extent_map *em; 2636 u64 em_len; 2637 int ret = 0; 2638 2639 em = btrfs_get_extent(inode, NULL, search_start, search_len); 2640 if (IS_ERR(em)) 2641 return PTR_ERR(em); 2642 2643 if (em->disk_bytenr != EXTENT_MAP_HOLE) 2644 goto next; 2645 2646 em_len = em->len; 2647 if (em->start < search_start) 2648 em_len -= search_start - em->start; 2649 if (em_len > search_len) 2650 em_len = search_len; 2651 2652 ret = set_extent_bit(&inode->io_tree, search_start, 2653 search_start + em_len - 1, 2654 EXTENT_DELALLOC_NEW, cached_state); 2655 next: 2656 search_start = extent_map_end(em); 2657 free_extent_map(em); 2658 if (ret) 2659 return ret; 2660 } 2661 return 0; 2662 } 2663 2664 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, 2665 unsigned int extra_bits, 2666 struct extent_state **cached_state) 2667 { 2668 WARN_ON(PAGE_ALIGNED(end)); 2669 2670 if (start >= i_size_read(&inode->vfs_inode) && 2671 !(inode->flags & BTRFS_INODE_PREALLOC)) { 2672 /* 2673 * There can't be any extents following eof in this case so just 2674 * set the delalloc new bit for the range directly. 
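 * Otherwise btrfs_find_new_delalloc_bytes() below walks the range and sets
 * EXTENT_DELALLOC_NEW only on the parts that map to holes.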
2675 */ 2676 extra_bits |= EXTENT_DELALLOC_NEW; 2677 } else { 2678 int ret; 2679 2680 ret = btrfs_find_new_delalloc_bytes(inode, start, 2681 end + 1 - start, 2682 cached_state); 2683 if (ret) 2684 return ret; 2685 } 2686 2687 return set_extent_bit(&inode->io_tree, start, end, 2688 EXTENT_DELALLOC | extra_bits, cached_state); 2689 } 2690 2691 /* see btrfs_writepage_start_hook for details on why this is required */ 2692 struct btrfs_writepage_fixup { 2693 struct page *page; 2694 struct btrfs_inode *inode; 2695 struct btrfs_work work; 2696 }; 2697 2698 static void btrfs_writepage_fixup_worker(struct btrfs_work *work) 2699 { 2700 struct btrfs_writepage_fixup *fixup = 2701 container_of(work, struct btrfs_writepage_fixup, work); 2702 struct btrfs_ordered_extent *ordered; 2703 struct extent_state *cached_state = NULL; 2704 struct extent_changeset *data_reserved = NULL; 2705 struct page *page = fixup->page; 2706 struct btrfs_inode *inode = fixup->inode; 2707 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2708 u64 page_start = page_offset(page); 2709 u64 page_end = page_offset(page) + PAGE_SIZE - 1; 2710 int ret = 0; 2711 bool free_delalloc_space = true; 2712 2713 /* 2714 * This is similar to page_mkwrite, we need to reserve the space before 2715 * we take the page lock. 2716 */ 2717 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, 2718 PAGE_SIZE); 2719 again: 2720 lock_page(page); 2721 2722 /* 2723 * Before we queued this fixup, we took a reference on the page. 2724 * page->mapping may go NULL, but it shouldn't be moved to a different 2725 * address space. 2726 */ 2727 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { 2728 /* 2729 * Unfortunately this is a little tricky, either 2730 * 2731 * 1) We got here and our page had already been dealt with and 2732 * we reserved our space, thus ret == 0, so we need to just 2733 * drop our space reservation and bail. This can happen the 2734 * first time we come into the fixup worker, or could happen 2735 * while waiting for the ordered extent. 2736 * 2) Our page was already dealt with, but we happened to get an 2737 * ENOSPC above from the btrfs_delalloc_reserve_space. In 2738 * this case we obviously don't have anything to release, but 2739 * because the page was already dealt with we don't want to 2740 * mark the page with an error, so make sure we're resetting 2741 * ret to 0. This is why we have this check _before_ the ret 2742 * check, because we do not want to have a surprise ENOSPC 2743 * when the page was already properly dealt with. 2744 */ 2745 if (!ret) { 2746 btrfs_delalloc_release_extents(inode, PAGE_SIZE); 2747 btrfs_delalloc_release_space(inode, data_reserved, 2748 page_start, PAGE_SIZE, 2749 true); 2750 } 2751 ret = 0; 2752 goto out_page; 2753 } 2754 2755 /* 2756 * We can't mess with the page state unless it is locked, so now that 2757 * it is locked bail if we failed to make our space reservation. 2758 */ 2759 if (ret) 2760 goto out_page; 2761 2762 lock_extent(&inode->io_tree, page_start, page_end, &cached_state); 2763 2764 /* already ordered? 
We're done */ 2765 if (PageOrdered(page)) 2766 goto out_reserved; 2767 2768 ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); 2769 if (ordered) { 2770 unlock_extent(&inode->io_tree, page_start, page_end, 2771 &cached_state); 2772 unlock_page(page); 2773 btrfs_start_ordered_extent(ordered); 2774 btrfs_put_ordered_extent(ordered); 2775 goto again; 2776 } 2777 2778 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, 2779 &cached_state); 2780 if (ret) 2781 goto out_reserved; 2782 2783 /* 2784 * Everything went as planned, we're now the owner of a dirty page with 2785 * delayed allocation bits set and space reserved for our COW 2786 * destination. 2787 * 2788 * The page was dirty when we started, nothing should have cleaned it. 2789 */ 2790 BUG_ON(!PageDirty(page)); 2791 free_delalloc_space = false; 2792 out_reserved: 2793 btrfs_delalloc_release_extents(inode, PAGE_SIZE); 2794 if (free_delalloc_space) 2795 btrfs_delalloc_release_space(inode, data_reserved, page_start, 2796 PAGE_SIZE, true); 2797 unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); 2798 out_page: 2799 if (ret) { 2800 /* 2801 * We hit ENOSPC or other errors. Update the mapping and page 2802 * to reflect the errors and clean the page. 2803 */ 2804 mapping_set_error(page->mapping, ret); 2805 btrfs_mark_ordered_io_finished(inode, page, page_start, 2806 PAGE_SIZE, !ret); 2807 clear_page_dirty_for_io(page); 2808 } 2809 btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE); 2810 unlock_page(page); 2811 put_page(page); 2812 kfree(fixup); 2813 extent_changeset_free(data_reserved); 2814 /* 2815 * As a precaution, do a delayed iput in case it would be the last iput 2816 * that could need flushing space. Recursing back to fixup worker would 2817 * deadlock. 2818 */ 2819 btrfs_add_delayed_iput(inode); 2820 } 2821 2822 /* 2823 * There are a few paths in the higher layers of the kernel that directly 2824 * set the page dirty bit without asking the filesystem if it is a 2825 * good idea. This causes problems because we want to make sure COW 2826 * properly happens and the data=ordered rules are followed. 2827 * 2828 * In our case any range that doesn't have the ORDERED bit set 2829 * hasn't been properly setup for IO. We kick off an async process 2830 * to fix it up. The async helper will wait for ordered extents, set 2831 * the delalloc bit and make it safe to write the page. 2832 */ 2833 int btrfs_writepage_cow_fixup(struct page *page) 2834 { 2835 struct inode *inode = page->mapping->host; 2836 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 2837 struct btrfs_writepage_fixup *fixup; 2838 2839 /* This page has ordered extent covering it already */ 2840 if (PageOrdered(page)) 2841 return 0; 2842 2843 /* 2844 * PageChecked is set below when we create a fixup worker for this page, 2845 * don't try to create another one if we're already PageChecked() 2846 * 2847 * The extent_io writepage code will redirty the page if we send back 2848 * EAGAIN. 2849 */ 2850 if (PageChecked(page)) 2851 return -EAGAIN; 2852 2853 fixup = kzalloc(sizeof(*fixup), GFP_NOFS); 2854 if (!fixup) 2855 return -EAGAIN; 2856 2857 /* 2858 * We are already holding a reference to this inode from 2859 * write_cache_pages. We need to hold it because the space reservation 2860 * takes place outside of the page lock, and we can't trust 2861 * page->mapping outside of the page lock. 
2862 */ 2863 ihold(inode); 2864 btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE); 2865 get_page(page); 2866 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); 2867 fixup->page = page; 2868 fixup->inode = BTRFS_I(inode); 2869 btrfs_queue_work(fs_info->fixup_workers, &fixup->work); 2870 2871 return -EAGAIN; 2872 } 2873 2874 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, 2875 struct btrfs_inode *inode, u64 file_pos, 2876 struct btrfs_file_extent_item *stack_fi, 2877 const bool update_inode_bytes, 2878 u64 qgroup_reserved) 2879 { 2880 struct btrfs_root *root = inode->root; 2881 const u64 sectorsize = root->fs_info->sectorsize; 2882 struct btrfs_path *path; 2883 struct extent_buffer *leaf; 2884 struct btrfs_key ins; 2885 u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); 2886 u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); 2887 u64 offset = btrfs_stack_file_extent_offset(stack_fi); 2888 u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); 2889 u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); 2890 struct btrfs_drop_extents_args drop_args = { 0 }; 2891 int ret; 2892 2893 path = btrfs_alloc_path(); 2894 if (!path) 2895 return -ENOMEM; 2896 2897 /* 2898 * We may be replacing one extent in the tree with another. 2899 * The new extent is pinned in the extent map, and we don't want 2900 * to drop it from the cache until it is completely in the btree. 2901 * 2902 * So, tell btrfs_drop_extents to leave this extent in the cache. 2903 * The caller is expected to unpin it and allow it to be merged 2904 * with the others. 2905 */ 2906 drop_args.path = path; 2907 drop_args.start = file_pos; 2908 drop_args.end = file_pos + num_bytes; 2909 drop_args.replace_extent = true; 2910 drop_args.extent_item_size = sizeof(*stack_fi); 2911 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 2912 if (ret) 2913 goto out; 2914 2915 if (!drop_args.extent_inserted) { 2916 ins.objectid = btrfs_ino(inode); 2917 ins.offset = file_pos; 2918 ins.type = BTRFS_EXTENT_DATA_KEY; 2919 2920 ret = btrfs_insert_empty_item(trans, root, path, &ins, 2921 sizeof(*stack_fi)); 2922 if (ret) 2923 goto out; 2924 } 2925 leaf = path->nodes[0]; 2926 btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); 2927 write_extent_buffer(leaf, stack_fi, 2928 btrfs_item_ptr_offset(leaf, path->slots[0]), 2929 sizeof(struct btrfs_file_extent_item)); 2930 2931 btrfs_mark_buffer_dirty(trans, leaf); 2932 btrfs_release_path(path); 2933 2934 /* 2935 * If we dropped an inline extent here, we know the range it covered 2936 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the 2937 * number of bytes only for that range containing the inline extent. 2938 * The remainder of the range will be processed when clearing the 2939 * EXTENT_DELALLOC_NEW bit through the ordered extent completion.
2940 */ 2941 if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { 2942 u64 inline_size = round_down(drop_args.bytes_found, sectorsize); 2943 2944 inline_size = drop_args.bytes_found - inline_size; 2945 btrfs_update_inode_bytes(inode, sectorsize, inline_size); 2946 drop_args.bytes_found -= inline_size; 2947 num_bytes -= sectorsize; 2948 } 2949 2950 if (update_inode_bytes) 2951 btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); 2952 2953 ins.objectid = disk_bytenr; 2954 ins.offset = disk_num_bytes; 2955 ins.type = BTRFS_EXTENT_ITEM_KEY; 2956 2957 ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); 2958 if (ret) 2959 goto out; 2960 2961 ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), 2962 file_pos - offset, 2963 qgroup_reserved, &ins); 2964 out: 2965 btrfs_free_path(path); 2966 2967 return ret; 2968 } 2969 2970 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, 2971 u64 start, u64 len) 2972 { 2973 struct btrfs_block_group *cache; 2974 2975 cache = btrfs_lookup_block_group(fs_info, start); 2976 ASSERT(cache); 2977 2978 spin_lock(&cache->lock); 2979 cache->delalloc_bytes -= len; 2980 spin_unlock(&cache->lock); 2981 2982 btrfs_put_block_group(cache); 2983 } 2984 2985 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, 2986 struct btrfs_ordered_extent *oe) 2987 { 2988 struct btrfs_file_extent_item stack_fi; 2989 bool update_inode_bytes; 2990 u64 num_bytes = oe->num_bytes; 2991 u64 ram_bytes = oe->ram_bytes; 2992 2993 memset(&stack_fi, 0, sizeof(stack_fi)); 2994 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); 2995 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); 2996 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, 2997 oe->disk_num_bytes); 2998 btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); 2999 if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) 3000 num_bytes = oe->truncated_len; 3001 btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); 3002 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); 3003 btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); 3004 /* Encryption and other encoding is reserved and all 0 */ 3005 3006 /* 3007 * For delalloc, when completing an ordered extent we update the inode's 3008 * bytes when clearing the range in the inode's io tree, so pass false 3009 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(), 3010 * except if the ordered extent was truncated. 3011 */ 3012 update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || 3013 test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || 3014 test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); 3015 3016 return insert_reserved_file_extent(trans, oe->inode, 3017 oe->file_offset, &stack_fi, 3018 update_inode_bytes, oe->qgroup_rsv); 3019 } 3020 3021 /* 3022 * As ordered data IO finishes, this gets called so we can finish 3023 * an ordered extent if the range of bytes in the file it covers are 3024 * fully written. 
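 *
 * This joins a transaction, inserts the file extent item (or marks a
 * preallocated extent as written), records the pending checksums, updates
 * the inode item and, on failure, returns the reserved data extent and the
 * qgroup reservation when needed.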
3025 */ 3026 int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) 3027 { 3028 struct btrfs_inode *inode = ordered_extent->inode; 3029 struct btrfs_root *root = inode->root; 3030 struct btrfs_fs_info *fs_info = root->fs_info; 3031 struct btrfs_trans_handle *trans = NULL; 3032 struct extent_io_tree *io_tree = &inode->io_tree; 3033 struct extent_state *cached_state = NULL; 3034 u64 start, end; 3035 int compress_type = 0; 3036 int ret = 0; 3037 u64 logical_len = ordered_extent->num_bytes; 3038 bool freespace_inode; 3039 bool truncated = false; 3040 bool clear_reserved_extent = true; 3041 unsigned int clear_bits = EXTENT_DEFRAG; 3042 3043 start = ordered_extent->file_offset; 3044 end = start + ordered_extent->num_bytes - 1; 3045 3046 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 3047 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && 3048 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) && 3049 !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) 3050 clear_bits |= EXTENT_DELALLOC_NEW; 3051 3052 freespace_inode = btrfs_is_free_space_inode(inode); 3053 if (!freespace_inode) 3054 btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent); 3055 3056 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 3057 ret = -EIO; 3058 goto out; 3059 } 3060 3061 if (btrfs_is_zoned(fs_info)) 3062 btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, 3063 ordered_extent->disk_num_bytes); 3064 3065 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { 3066 truncated = true; 3067 logical_len = ordered_extent->truncated_len; 3068 /* Truncated the entire extent, don't bother adding */ 3069 if (!logical_len) 3070 goto out; 3071 } 3072 3073 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 3074 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 3075 3076 btrfs_inode_safe_disk_i_size_write(inode, 0); 3077 if (freespace_inode) 3078 trans = btrfs_join_transaction_spacecache(root); 3079 else 3080 trans = btrfs_join_transaction(root); 3081 if (IS_ERR(trans)) { 3082 ret = PTR_ERR(trans); 3083 trans = NULL; 3084 goto out; 3085 } 3086 trans->block_rsv = &inode->block_rsv; 3087 ret = btrfs_update_inode_fallback(trans, inode); 3088 if (ret) /* -ENOMEM or corruption */ 3089 btrfs_abort_transaction(trans, ret); 3090 goto out; 3091 } 3092 3093 clear_bits |= EXTENT_LOCKED; 3094 lock_extent(io_tree, start, end, &cached_state); 3095 3096 if (freespace_inode) 3097 trans = btrfs_join_transaction_spacecache(root); 3098 else 3099 trans = btrfs_join_transaction(root); 3100 if (IS_ERR(trans)) { 3101 ret = PTR_ERR(trans); 3102 trans = NULL; 3103 goto out; 3104 } 3105 3106 trans->block_rsv = &inode->block_rsv; 3107 3108 ret = btrfs_insert_raid_extent(trans, ordered_extent); 3109 if (ret) 3110 goto out; 3111 3112 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 3113 compress_type = ordered_extent->compress_type; 3114 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 3115 BUG_ON(compress_type); 3116 ret = btrfs_mark_extent_written(trans, inode, 3117 ordered_extent->file_offset, 3118 ordered_extent->file_offset + 3119 logical_len); 3120 btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, 3121 ordered_extent->disk_num_bytes); 3122 } else { 3123 BUG_ON(root == fs_info->tree_root); 3124 ret = insert_ordered_extent_file_extent(trans, ordered_extent); 3125 if (!ret) { 3126 clear_reserved_extent = false; 3127 btrfs_release_delalloc_bytes(fs_info, 3128 ordered_extent->disk_bytenr, 3129 
ordered_extent->disk_num_bytes); 3130 } 3131 } 3132 if (ret < 0) { 3133 btrfs_abort_transaction(trans, ret); 3134 goto out; 3135 } 3136 3137 ret = unpin_extent_cache(inode, ordered_extent->file_offset, 3138 ordered_extent->num_bytes, trans->transid); 3139 if (ret < 0) { 3140 btrfs_abort_transaction(trans, ret); 3141 goto out; 3142 } 3143 3144 ret = add_pending_csums(trans, &ordered_extent->list); 3145 if (ret) { 3146 btrfs_abort_transaction(trans, ret); 3147 goto out; 3148 } 3149 3150 /* 3151 * If this is a new delalloc range, clear its new delalloc flag to 3152 * update the inode's number of bytes. This needs to be done first 3153 * before updating the inode item. 3154 */ 3155 if ((clear_bits & EXTENT_DELALLOC_NEW) && 3156 !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) 3157 clear_extent_bit(&inode->io_tree, start, end, 3158 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, 3159 &cached_state); 3160 3161 btrfs_inode_safe_disk_i_size_write(inode, 0); 3162 ret = btrfs_update_inode_fallback(trans, inode); 3163 if (ret) { /* -ENOMEM or corruption */ 3164 btrfs_abort_transaction(trans, ret); 3165 goto out; 3166 } 3167 out: 3168 clear_extent_bit(&inode->io_tree, start, end, clear_bits, 3169 &cached_state); 3170 3171 if (trans) 3172 btrfs_end_transaction(trans); 3173 3174 if (ret || truncated) { 3175 u64 unwritten_start = start; 3176 3177 /* 3178 * If we failed to finish this ordered extent for any reason we 3179 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered 3180 * extent, and mark the inode with the error if it wasn't 3181 * already set. Any error during writeback would have already 3182 * set the mapping error, so we need to set it if we're the ones 3183 * marking this ordered extent as failed. 3184 */ 3185 if (ret) 3186 btrfs_mark_ordered_extent_error(ordered_extent); 3187 3188 if (truncated) 3189 unwritten_start += logical_len; 3190 clear_extent_uptodate(io_tree, unwritten_start, end, NULL); 3191 3192 /* 3193 * Drop extent maps for the part of the extent we didn't write. 3194 * 3195 * We have an exception here for the free_space_inode, this is 3196 * because when we do btrfs_get_extent() on the free space inode 3197 * we will search the commit root. If this is a new block group 3198 * we won't find anything, and we will trip over the assert in 3199 * writepage where we do ASSERT(em->block_start != 3200 * EXTENT_MAP_HOLE). 3201 * 3202 * Theoretically we could also skip this for any NOCOW extent as 3203 * we don't mess with the extent map tree in the NOCOW case, but 3204 * for now simply skip this if we are the free space inode. 3205 */ 3206 if (!btrfs_is_free_space_inode(inode)) 3207 btrfs_drop_extent_map_range(inode, unwritten_start, 3208 end, false); 3209 3210 /* 3211 * If the ordered extent had an IOERR or something else went 3212 * wrong we need to return the space for this ordered extent 3213 * back to the allocator. We only free the extent in the 3214 * truncated case if we didn't write out the extent at all. 3215 * 3216 * If we made it past insert_reserved_file_extent before we 3217 * errored out then we don't need to do this as the accounting 3218 * has already been done. 
3219 */ 3220 if ((ret || !logical_len) && 3221 clear_reserved_extent && 3222 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 3223 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 3224 /* 3225 * Discard the range before returning it back to the 3226 * free space pool 3227 */ 3228 if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC)) 3229 btrfs_discard_extent(fs_info, 3230 ordered_extent->disk_bytenr, 3231 ordered_extent->disk_num_bytes, 3232 NULL); 3233 btrfs_free_reserved_extent(fs_info, 3234 ordered_extent->disk_bytenr, 3235 ordered_extent->disk_num_bytes, 1); 3236 /* 3237 * Actually free the qgroup rsv which was released when 3238 * the ordered extent was created. 3239 */ 3240 btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root), 3241 ordered_extent->qgroup_rsv, 3242 BTRFS_QGROUP_RSV_DATA); 3243 } 3244 } 3245 3246 /* 3247 * This needs to be done to make sure anybody waiting knows we are done 3248 * updating everything for this ordered extent. 3249 */ 3250 btrfs_remove_ordered_extent(inode, ordered_extent); 3251 3252 /* once for us */ 3253 btrfs_put_ordered_extent(ordered_extent); 3254 /* once for the tree */ 3255 btrfs_put_ordered_extent(ordered_extent); 3256 3257 return ret; 3258 } 3259 3260 int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) 3261 { 3262 if (btrfs_is_zoned(ordered->inode->root->fs_info) && 3263 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 3264 list_empty(&ordered->bioc_list)) 3265 btrfs_finish_ordered_zoned(ordered); 3266 return btrfs_finish_one_ordered(ordered); 3267 } 3268 3269 /* 3270 * Verify the checksum for a single sector without any extra action that depend 3271 * on the type of I/O. 3272 */ 3273 int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, 3274 u32 pgoff, u8 *csum, const u8 * const csum_expected) 3275 { 3276 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 3277 char *kaddr; 3278 3279 ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); 3280 3281 shash->tfm = fs_info->csum_shash; 3282 3283 kaddr = kmap_local_page(page) + pgoff; 3284 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); 3285 kunmap_local(kaddr); 3286 3287 if (memcmp(csum, csum_expected, fs_info->csum_size)) 3288 return -EIO; 3289 return 0; 3290 } 3291 3292 /* 3293 * Verify the checksum of a single data sector. 3294 * 3295 * @bbio: btrfs_io_bio which contains the csum 3296 * @dev: device the sector is on 3297 * @bio_offset: offset to the beginning of the bio (in bytes) 3298 * @bv: bio_vec to check 3299 * 3300 * Check if the checksum on a data block is valid. When a checksum mismatch is 3301 * detected, report the error and fill the corrupted range with zero. 3302 * 3303 * Return %true if the sector is ok or had no checksum to start with, else %false. 
3304 */ 3305 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, 3306 u32 bio_offset, struct bio_vec *bv) 3307 { 3308 struct btrfs_inode *inode = bbio->inode; 3309 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3310 u64 file_offset = bbio->file_offset + bio_offset; 3311 u64 end = file_offset + bv->bv_len - 1; 3312 u8 *csum_expected; 3313 u8 csum[BTRFS_CSUM_SIZE]; 3314 3315 ASSERT(bv->bv_len == fs_info->sectorsize); 3316 3317 if (!bbio->csum) 3318 return true; 3319 3320 if (btrfs_is_data_reloc_root(inode->root) && 3321 test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, 3322 NULL)) { 3323 /* Skip the range without csum for data reloc inode */ 3324 clear_extent_bits(&inode->io_tree, file_offset, end, 3325 EXTENT_NODATASUM); 3326 return true; 3327 } 3328 3329 csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * 3330 fs_info->csum_size; 3331 if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, 3332 csum_expected)) 3333 goto zeroit; 3334 return true; 3335 3336 zeroit: 3337 btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, 3338 bbio->mirror_num); 3339 if (dev) 3340 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); 3341 memzero_bvec(bv); 3342 return false; 3343 } 3344 3345 /* 3346 * Perform a delayed iput on @inode. 3347 * 3348 * @inode: The inode we want to perform iput on 3349 * 3350 * This function uses the generic vfs_inode::i_count to track whether we should 3351 * just decrement it (in case it's > 1) or if this is the last iput then link 3352 * the inode to the delayed iput machinery. Delayed iputs are processed at 3353 * transaction commit time/superblock commit/cleaner kthread. 3354 */ 3355 void btrfs_add_delayed_iput(struct btrfs_inode *inode) 3356 { 3357 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3358 unsigned long flags; 3359 3360 if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) 3361 return; 3362 3363 atomic_inc(&fs_info->nr_delayed_iputs); 3364 /* 3365 * Need to be irq safe here because we can be called from either an irq 3366 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq 3367 * context. 
3368 */ 3369 spin_lock_irqsave(&fs_info->delayed_iput_lock, flags); 3370 ASSERT(list_empty(&inode->delayed_iput)); 3371 list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); 3372 spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags); 3373 if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) 3374 wake_up_process(fs_info->cleaner_kthread); 3375 } 3376 3377 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, 3378 struct btrfs_inode *inode) 3379 { 3380 list_del_init(&inode->delayed_iput); 3381 spin_unlock_irq(&fs_info->delayed_iput_lock); 3382 iput(&inode->vfs_inode); 3383 if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) 3384 wake_up(&fs_info->delayed_iputs_wait); 3385 spin_lock_irq(&fs_info->delayed_iput_lock); 3386 } 3387 3388 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, 3389 struct btrfs_inode *inode) 3390 { 3391 if (!list_empty(&inode->delayed_iput)) { 3392 spin_lock_irq(&fs_info->delayed_iput_lock); 3393 if (!list_empty(&inode->delayed_iput)) 3394 run_delayed_iput_locked(fs_info, inode); 3395 spin_unlock_irq(&fs_info->delayed_iput_lock); 3396 } 3397 } 3398 3399 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) 3400 { 3401 /* 3402 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which 3403 * calls btrfs_add_delayed_iput() and that needs to lock 3404 * fs_info->delayed_iput_lock. So we need to disable irqs here to 3405 * prevent a deadlock. 3406 */ 3407 spin_lock_irq(&fs_info->delayed_iput_lock); 3408 while (!list_empty(&fs_info->delayed_iputs)) { 3409 struct btrfs_inode *inode; 3410 3411 inode = list_first_entry(&fs_info->delayed_iputs, 3412 struct btrfs_inode, delayed_iput); 3413 run_delayed_iput_locked(fs_info, inode); 3414 if (need_resched()) { 3415 spin_unlock_irq(&fs_info->delayed_iput_lock); 3416 cond_resched(); 3417 spin_lock_irq(&fs_info->delayed_iput_lock); 3418 } 3419 } 3420 spin_unlock_irq(&fs_info->delayed_iput_lock); 3421 } 3422 3423 /* 3424 * Wait for flushing all delayed iputs 3425 * 3426 * @fs_info: the filesystem 3427 * 3428 * This will wait on any delayed iputs that are currently running with KILLABLE 3429 * set. Once they are all done running we will return, unless we are killed in 3430 * which case we return EINTR. This helps in user operations like fallocate etc 3431 * that might get blocked on the iputs. 3432 * 3433 * Return EINTR if we were killed, 0 if nothing's pending 3434 */ 3435 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) 3436 { 3437 int ret = wait_event_killable(fs_info->delayed_iputs_wait, 3438 atomic_read(&fs_info->nr_delayed_iputs) == 0); 3439 if (ret) 3440 return -EINTR; 3441 return 0; 3442 } 3443 3444 /* 3445 * This creates an orphan entry for the given inode in case something goes wrong 3446 * in the middle of an unlink. 3447 */ 3448 int btrfs_orphan_add(struct btrfs_trans_handle *trans, 3449 struct btrfs_inode *inode) 3450 { 3451 int ret; 3452 3453 ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); 3454 if (ret && ret != -EEXIST) { 3455 btrfs_abort_transaction(trans, ret); 3456 return ret; 3457 } 3458 3459 return 0; 3460 } 3461 3462 /* 3463 * We have done the delete so we can go ahead and remove the orphan item for 3464 * this particular inode. 
3465 */ 3466 static int btrfs_orphan_del(struct btrfs_trans_handle *trans, 3467 struct btrfs_inode *inode) 3468 { 3469 return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode)); 3470 } 3471 3472 /* 3473 * this cleans up any orphans that may be left on the list from the last use 3474 * of this root. 3475 */ 3476 int btrfs_orphan_cleanup(struct btrfs_root *root) 3477 { 3478 struct btrfs_fs_info *fs_info = root->fs_info; 3479 struct btrfs_path *path; 3480 struct extent_buffer *leaf; 3481 struct btrfs_key key, found_key; 3482 struct btrfs_trans_handle *trans; 3483 struct inode *inode; 3484 u64 last_objectid = 0; 3485 int ret = 0, nr_unlink = 0; 3486 3487 if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) 3488 return 0; 3489 3490 path = btrfs_alloc_path(); 3491 if (!path) { 3492 ret = -ENOMEM; 3493 goto out; 3494 } 3495 path->reada = READA_BACK; 3496 3497 key.objectid = BTRFS_ORPHAN_OBJECTID; 3498 key.type = BTRFS_ORPHAN_ITEM_KEY; 3499 key.offset = (u64)-1; 3500 3501 while (1) { 3502 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3503 if (ret < 0) 3504 goto out; 3505 3506 /* 3507 * if ret == 0 means we found what we were searching for, which 3508 * is weird, but possible, so only screw with path if we didn't 3509 * find the key and see if we have stuff that matches 3510 */ 3511 if (ret > 0) { 3512 ret = 0; 3513 if (path->slots[0] == 0) 3514 break; 3515 path->slots[0]--; 3516 } 3517 3518 /* pull out the item */ 3519 leaf = path->nodes[0]; 3520 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3521 3522 /* make sure the item matches what we want */ 3523 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 3524 break; 3525 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) 3526 break; 3527 3528 /* release the path since we're done with it */ 3529 btrfs_release_path(path); 3530 3531 /* 3532 * this is where we are basically btrfs_lookup, without the 3533 * crossing root thing. we store the inode number in the 3534 * offset of the orphan item. 3535 */ 3536 3537 if (found_key.offset == last_objectid) { 3538 /* 3539 * We found the same inode as before. This means we were 3540 * not able to remove its items via eviction triggered 3541 * by an iput(). A transaction abort may have happened, 3542 * due to -ENOSPC for example, so try to grab the error 3543 * that lead to a transaction abort, if any. 3544 */ 3545 btrfs_err(fs_info, 3546 "Error removing orphan entry, stopping orphan cleanup"); 3547 ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL; 3548 goto out; 3549 } 3550 3551 last_objectid = found_key.offset; 3552 3553 found_key.objectid = found_key.offset; 3554 found_key.type = BTRFS_INODE_ITEM_KEY; 3555 found_key.offset = 0; 3556 inode = btrfs_iget(last_objectid, root); 3557 if (IS_ERR(inode)) { 3558 ret = PTR_ERR(inode); 3559 inode = NULL; 3560 if (ret != -ENOENT) 3561 goto out; 3562 } 3563 3564 if (!inode && root == fs_info->tree_root) { 3565 struct btrfs_root *dead_root; 3566 int is_dead_root = 0; 3567 3568 /* 3569 * This is an orphan in the tree root. Currently these 3570 * could come from 2 sources: 3571 * a) a root (snapshot/subvolume) deletion in progress 3572 * b) a free space cache inode 3573 * We need to distinguish those two, as the orphan item 3574 * for a root must not get deleted before the deletion 3575 * of the snapshot/subvolume's tree completes. 3576 * 3577 * btrfs_find_orphan_roots() ran before us, which has 3578 * found all deleted roots and loaded them into 3579 * fs_info->fs_roots_radix. 
So here we can find if an 3580 * orphan item corresponds to a deleted root by looking 3581 * up the root from that radix tree. 3582 */ 3583 3584 spin_lock(&fs_info->fs_roots_radix_lock); 3585 dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, 3586 (unsigned long)found_key.objectid); 3587 if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) 3588 is_dead_root = 1; 3589 spin_unlock(&fs_info->fs_roots_radix_lock); 3590 3591 if (is_dead_root) { 3592 /* prevent this orphan from being found again */ 3593 key.offset = found_key.objectid - 1; 3594 continue; 3595 } 3596 3597 } 3598 3599 /* 3600 * If we have an inode with links, there are a couple of 3601 * possibilities: 3602 * 3603 * 1. We were halfway through creating fsverity metadata for the 3604 * file. In that case, the orphan item represents incomplete 3605 * fsverity metadata which must be cleaned up with 3606 * btrfs_drop_verity_items and deleting the orphan item. 3607 3608 * 2. Old kernels (before v3.12) used to create an 3609 * orphan item for truncate indicating that there were possibly 3610 * extent items past i_size that needed to be deleted. In v3.12, 3611 * truncate was changed to update i_size in sync with the extent 3612 * items, but the (useless) orphan item was still created. Since 3613 * v4.18, we don't create the orphan item for truncate at all. 3614 * 3615 * So, this item could mean that we need to do a truncate, but 3616 * only if this filesystem was last used on a pre-v3.12 kernel 3617 * and was not cleanly unmounted. The odds of that are quite 3618 * slim, and it's a pain to do the truncate now, so just delete 3619 * the orphan item. 3620 * 3621 * It's also possible that this orphan item was supposed to be 3622 * deleted but wasn't. The inode number may have been reused, 3623 * but either way, we can delete the orphan item. 3624 */ 3625 if (!inode || inode->i_nlink) { 3626 if (inode) { 3627 ret = btrfs_drop_verity_items(BTRFS_I(inode)); 3628 iput(inode); 3629 inode = NULL; 3630 if (ret) 3631 goto out; 3632 } 3633 trans = btrfs_start_transaction(root, 1); 3634 if (IS_ERR(trans)) { 3635 ret = PTR_ERR(trans); 3636 goto out; 3637 } 3638 btrfs_debug(fs_info, "auto deleting %Lu", 3639 found_key.objectid); 3640 ret = btrfs_del_orphan_item(trans, root, 3641 found_key.objectid); 3642 btrfs_end_transaction(trans); 3643 if (ret) 3644 goto out; 3645 continue; 3646 } 3647 3648 nr_unlink++; 3649 3650 /* this will do delete_inode and everything for us */ 3651 iput(inode); 3652 } 3653 /* release the path since we're done with it */ 3654 btrfs_release_path(path); 3655 3656 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { 3657 trans = btrfs_join_transaction(root); 3658 if (!IS_ERR(trans)) 3659 btrfs_end_transaction(trans); 3660 } 3661 3662 if (nr_unlink) 3663 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink); 3664 3665 out: 3666 if (ret) 3667 btrfs_err(fs_info, "could not do orphan cleanup %d", ret); 3668 btrfs_free_path(path); 3669 return ret; 3670 } 3671 3672 /* 3673 * very simple check to peek ahead in the leaf looking for xattrs. If we 3674 * don't find any xattrs, we know there can't be any acls. 
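 *
 * Returns 1 if the inode may have acls (the caller still has to look them
 * up) and 0 if it definitely has none.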
3675 * 3676 * slot is the slot the inode is in, objectid is the objectid of the inode 3677 */ 3678 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 3679 int slot, u64 objectid, 3680 int *first_xattr_slot) 3681 { 3682 u32 nritems = btrfs_header_nritems(leaf); 3683 struct btrfs_key found_key; 3684 static u64 xattr_access = 0; 3685 static u64 xattr_default = 0; 3686 int scanned = 0; 3687 3688 if (!xattr_access) { 3689 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, 3690 strlen(XATTR_NAME_POSIX_ACL_ACCESS)); 3691 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, 3692 strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); 3693 } 3694 3695 slot++; 3696 *first_xattr_slot = -1; 3697 while (slot < nritems) { 3698 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3699 3700 /* we found a different objectid, there must not be acls */ 3701 if (found_key.objectid != objectid) 3702 return 0; 3703 3704 /* we found an xattr, assume we've got an acl */ 3705 if (found_key.type == BTRFS_XATTR_ITEM_KEY) { 3706 if (*first_xattr_slot == -1) 3707 *first_xattr_slot = slot; 3708 if (found_key.offset == xattr_access || 3709 found_key.offset == xattr_default) 3710 return 1; 3711 } 3712 3713 /* 3714 * we found a key greater than an xattr key, there can't 3715 * be any acls later on 3716 */ 3717 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 3718 return 0; 3719 3720 slot++; 3721 scanned++; 3722 3723 /* 3724 * it goes inode, inode backrefs, xattrs, extents, 3725 * so if there are a ton of hard links to an inode there can 3726 * be a lot of backrefs. Don't waste time searching too hard, 3727 * this is just an optimization 3728 */ 3729 if (scanned >= 8) 3730 break; 3731 } 3732 /* we hit the end of the leaf before we found an xattr or 3733 * something larger than an xattr. We have to assume the inode 3734 * has acls 3735 */ 3736 if (*first_xattr_slot == -1) 3737 *first_xattr_slot = slot; 3738 return 1; 3739 } 3740 3741 static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) 3742 { 3743 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3744 3745 if (WARN_ON_ONCE(inode->file_extent_tree)) 3746 return 0; 3747 if (btrfs_fs_incompat(fs_info, NO_HOLES)) 3748 return 0; 3749 if (!S_ISREG(inode->vfs_inode.i_mode)) 3750 return 0; 3751 if (btrfs_is_free_space_inode(inode)) 3752 return 0; 3753 3754 inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); 3755 if (!inode->file_extent_tree) 3756 return -ENOMEM; 3757 3758 extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); 3759 /* Lockdep class is set only for the file extent tree. 
*/ 3760 lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class); 3761 3762 return 0; 3763 } 3764 3765 /* 3766 * read an inode from the btree into the in-memory inode 3767 */ 3768 static int btrfs_read_locked_inode(struct inode *inode, 3769 struct btrfs_path *in_path) 3770 { 3771 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 3772 struct btrfs_path *path = in_path; 3773 struct extent_buffer *leaf; 3774 struct btrfs_inode_item *inode_item; 3775 struct btrfs_root *root = BTRFS_I(inode)->root; 3776 struct btrfs_key location; 3777 unsigned long ptr; 3778 int maybe_acls; 3779 u32 rdev; 3780 int ret; 3781 bool filled = false; 3782 int first_xattr_slot; 3783 3784 ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); 3785 if (ret) 3786 return ret; 3787 3788 ret = btrfs_fill_inode(inode, &rdev); 3789 if (!ret) 3790 filled = true; 3791 3792 if (!path) { 3793 path = btrfs_alloc_path(); 3794 if (!path) 3795 return -ENOMEM; 3796 } 3797 3798 btrfs_get_inode_key(BTRFS_I(inode), &location); 3799 3800 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 3801 if (ret) { 3802 if (path != in_path) 3803 btrfs_free_path(path); 3804 return ret; 3805 } 3806 3807 leaf = path->nodes[0]; 3808 3809 if (filled) 3810 goto cache_index; 3811 3812 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3813 struct btrfs_inode_item); 3814 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 3815 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 3816 i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); 3817 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); 3818 btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); 3819 btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, 3820 round_up(i_size_read(inode), fs_info->sectorsize)); 3821 3822 inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), 3823 btrfs_timespec_nsec(leaf, &inode_item->atime)); 3824 3825 inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), 3826 btrfs_timespec_nsec(leaf, &inode_item->mtime)); 3827 3828 inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), 3829 btrfs_timespec_nsec(leaf, &inode_item->ctime)); 3830 3831 BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); 3832 BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); 3833 3834 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 3835 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 3836 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); 3837 3838 inode_set_iversion_queried(inode, 3839 btrfs_inode_sequence(leaf, inode_item)); 3840 inode->i_generation = BTRFS_I(inode)->generation; 3841 inode->i_rdev = 0; 3842 rdev = btrfs_inode_rdev(leaf, inode_item); 3843 3844 if (S_ISDIR(inode->i_mode)) 3845 BTRFS_I(inode)->index_cnt = (u64)-1; 3846 3847 btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), 3848 &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); 3849 3850 cache_index: 3851 /* 3852 * If we were modified in the current generation and evicted from memory 3853 * and then re-read we need to do a full sync since we don't have any 3854 * idea about which extents were modified before we were evicted from 3855 * cache. 3856 * 3857 * This is required for both inode re-read from disk and delayed inode 3858 * in the delayed_nodes xarray. 
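	 *
	 * For example (an illustrative sequence): write to a file, have its
	 * inode evicted due to memory pressure and read back in while the same
	 * transaction is still open, then fsync it. Without this flag a fast
	 * fsync would miss the extents modified before the eviction.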
3859 */ 3860 if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) 3861 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3862 &BTRFS_I(inode)->runtime_flags); 3863 3864 /* 3865 * We don't persist the id of the transaction where an unlink operation 3866 * against the inode was last made. So here we assume the inode might 3867 * have been evicted, and therefore the exact value of last_unlink_trans 3868 * lost, and set it to last_trans to avoid metadata inconsistencies 3869 * between the inode and its parent if the inode is fsync'ed and the log 3870 * replayed. For example, in the scenario: 3871 * 3872 * touch mydir/foo 3873 * ln mydir/foo mydir/bar 3874 * sync 3875 * unlink mydir/bar 3876 * echo 2 > /proc/sys/vm/drop_caches # evicts inode 3877 * xfs_io -c fsync mydir/foo 3878 * <power failure> 3879 * mount fs, triggers fsync log replay 3880 * 3881 * We must make sure that when we fsync our inode foo we also log its 3882 * parent inode, otherwise after log replay the parent still has the 3883 * dentry with the "bar" name but our inode foo has a link count of 1 3884 * and doesn't have an inode ref with the name "bar" anymore. 3885 * 3886 * Setting last_unlink_trans to last_trans is a pessimistic approach, 3887 * but it guarantees correctness at the expense of occasional full 3888 * transaction commits on fsync if our inode is a directory, or if our 3889 * inode is not a directory, logging its parent unnecessarily. 3890 */ 3891 BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; 3892 3893 /* 3894 * Same logic as for last_unlink_trans. We don't persist the generation 3895 * of the last transaction where this inode was used for a reflink 3896 * operation, so after eviction and reloading the inode we must be 3897 * pessimistic and assume the last transaction that modified the inode. 
3898 */ 3899 BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; 3900 3901 path->slots[0]++; 3902 if (inode->i_nlink != 1 || 3903 path->slots[0] >= btrfs_header_nritems(leaf)) 3904 goto cache_acl; 3905 3906 btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); 3907 if (location.objectid != btrfs_ino(BTRFS_I(inode))) 3908 goto cache_acl; 3909 3910 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 3911 if (location.type == BTRFS_INODE_REF_KEY) { 3912 struct btrfs_inode_ref *ref; 3913 3914 ref = (struct btrfs_inode_ref *)ptr; 3915 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); 3916 } else if (location.type == BTRFS_INODE_EXTREF_KEY) { 3917 struct btrfs_inode_extref *extref; 3918 3919 extref = (struct btrfs_inode_extref *)ptr; 3920 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, 3921 extref); 3922 } 3923 cache_acl: 3924 /* 3925 * try to precache a NULL acl entry for files that don't have 3926 * any xattrs or acls 3927 */ 3928 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 3929 btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); 3930 if (first_xattr_slot != -1) { 3931 path->slots[0] = first_xattr_slot; 3932 ret = btrfs_load_inode_props(inode, path); 3933 if (ret) 3934 btrfs_err(fs_info, 3935 "error loading props for ino %llu (root %llu): %d", 3936 btrfs_ino(BTRFS_I(inode)), 3937 btrfs_root_id(root), ret); 3938 } 3939 if (path != in_path) 3940 btrfs_free_path(path); 3941 3942 if (!maybe_acls) 3943 cache_no_acl(inode); 3944 3945 switch (inode->i_mode & S_IFMT) { 3946 case S_IFREG: 3947 inode->i_mapping->a_ops = &btrfs_aops; 3948 inode->i_fop = &btrfs_file_operations; 3949 inode->i_op = &btrfs_file_inode_operations; 3950 break; 3951 case S_IFDIR: 3952 inode->i_fop = &btrfs_dir_file_operations; 3953 inode->i_op = &btrfs_dir_inode_operations; 3954 break; 3955 case S_IFLNK: 3956 inode->i_op = &btrfs_symlink_inode_operations; 3957 inode_nohighmem(inode); 3958 inode->i_mapping->a_ops = &btrfs_aops; 3959 break; 3960 default: 3961 inode->i_op = &btrfs_special_inode_operations; 3962 init_special_inode(inode, inode->i_mode, rdev); 3963 break; 3964 } 3965 3966 btrfs_sync_inode_flags_to_i_flags(inode); 3967 return 0; 3968 } 3969 3970 /* 3971 * given a leaf and an inode, copy the inode fields into the leaf 3972 */ 3973 static void fill_inode_item(struct btrfs_trans_handle *trans, 3974 struct extent_buffer *leaf, 3975 struct btrfs_inode_item *item, 3976 struct inode *inode) 3977 { 3978 struct btrfs_map_token token; 3979 u64 flags; 3980 3981 btrfs_init_map_token(&token, leaf); 3982 3983 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); 3984 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); 3985 btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size); 3986 btrfs_set_token_inode_mode(&token, item, inode->i_mode); 3987 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); 3988 3989 btrfs_set_token_timespec_sec(&token, &item->atime, 3990 inode_get_atime_sec(inode)); 3991 btrfs_set_token_timespec_nsec(&token, &item->atime, 3992 inode_get_atime_nsec(inode)); 3993 3994 btrfs_set_token_timespec_sec(&token, &item->mtime, 3995 inode_get_mtime_sec(inode)); 3996 btrfs_set_token_timespec_nsec(&token, &item->mtime, 3997 inode_get_mtime_nsec(inode)); 3998 3999 btrfs_set_token_timespec_sec(&token, &item->ctime, 4000 inode_get_ctime_sec(inode)); 4001 btrfs_set_token_timespec_nsec(&token, &item->ctime, 4002 inode_get_ctime_nsec(inode)); 4003 4004 btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec); 4005 
btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec); 4006 4007 btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); 4008 btrfs_set_token_inode_generation(&token, item, 4009 BTRFS_I(inode)->generation); 4010 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 4011 btrfs_set_token_inode_transid(&token, item, trans->transid); 4012 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 4013 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 4014 BTRFS_I(inode)->ro_flags); 4015 btrfs_set_token_inode_flags(&token, item, flags); 4016 btrfs_set_token_inode_block_group(&token, item, 0); 4017 } 4018 4019 /* 4020 * copy everything in the in-memory inode into the btree. 4021 */ 4022 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, 4023 struct btrfs_inode *inode) 4024 { 4025 struct btrfs_inode_item *inode_item; 4026 struct btrfs_path *path; 4027 struct extent_buffer *leaf; 4028 struct btrfs_key key; 4029 int ret; 4030 4031 path = btrfs_alloc_path(); 4032 if (!path) 4033 return -ENOMEM; 4034 4035 btrfs_get_inode_key(inode, &key); 4036 ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1); 4037 if (ret) { 4038 if (ret > 0) 4039 ret = -ENOENT; 4040 goto failed; 4041 } 4042 4043 leaf = path->nodes[0]; 4044 inode_item = btrfs_item_ptr(leaf, path->slots[0], 4045 struct btrfs_inode_item); 4046 4047 fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); 4048 btrfs_mark_buffer_dirty(trans, leaf); 4049 btrfs_set_inode_last_trans(trans, inode); 4050 ret = 0; 4051 failed: 4052 btrfs_free_path(path); 4053 return ret; 4054 } 4055 4056 /* 4057 * copy everything in the in-memory inode into the btree. 4058 */ 4059 int btrfs_update_inode(struct btrfs_trans_handle *trans, 4060 struct btrfs_inode *inode) 4061 { 4062 struct btrfs_root *root = inode->root; 4063 struct btrfs_fs_info *fs_info = root->fs_info; 4064 int ret; 4065 4066 /* 4067 * If the inode is a free space inode, we can deadlock during commit 4068 * if we put it into the delayed code. 4069 * 4070 * The data relocation inode should also be directly updated 4071 * without delay 4072 */ 4073 if (!btrfs_is_free_space_inode(inode) 4074 && !btrfs_is_data_reloc_root(root) 4075 && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { 4076 btrfs_update_root_times(trans, root); 4077 4078 ret = btrfs_delayed_update_inode(trans, inode); 4079 if (!ret) 4080 btrfs_set_inode_last_trans(trans, inode); 4081 return ret; 4082 } 4083 4084 return btrfs_update_inode_item(trans, inode); 4085 } 4086 4087 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 4088 struct btrfs_inode *inode) 4089 { 4090 int ret; 4091 4092 ret = btrfs_update_inode(trans, inode); 4093 if (ret == -ENOSPC) 4094 return btrfs_update_inode_item(trans, inode); 4095 return ret; 4096 } 4097 4098 /* 4099 * unlink helper that gets used here in inode.c and in the tree logging 4100 * recovery code. 
It remove a link in a directory with a given name, and 4101 * also drops the back refs in the inode to the directory 4102 */ 4103 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 4104 struct btrfs_inode *dir, 4105 struct btrfs_inode *inode, 4106 const struct fscrypt_str *name, 4107 struct btrfs_rename_ctx *rename_ctx) 4108 { 4109 struct btrfs_root *root = dir->root; 4110 struct btrfs_fs_info *fs_info = root->fs_info; 4111 struct btrfs_path *path; 4112 int ret = 0; 4113 struct btrfs_dir_item *di; 4114 u64 index; 4115 u64 ino = btrfs_ino(inode); 4116 u64 dir_ino = btrfs_ino(dir); 4117 4118 path = btrfs_alloc_path(); 4119 if (!path) { 4120 ret = -ENOMEM; 4121 goto out; 4122 } 4123 4124 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); 4125 if (IS_ERR_OR_NULL(di)) { 4126 ret = di ? PTR_ERR(di) : -ENOENT; 4127 goto err; 4128 } 4129 ret = btrfs_delete_one_dir_name(trans, root, path, di); 4130 if (ret) 4131 goto err; 4132 btrfs_release_path(path); 4133 4134 /* 4135 * If we don't have dir index, we have to get it by looking up 4136 * the inode ref, since we get the inode ref, remove it directly, 4137 * it is unnecessary to do delayed deletion. 4138 * 4139 * But if we have dir index, needn't search inode ref to get it. 4140 * Since the inode ref is close to the inode item, it is better 4141 * that we delay to delete it, and just do this deletion when 4142 * we update the inode item. 4143 */ 4144 if (inode->dir_index) { 4145 ret = btrfs_delayed_delete_inode_ref(inode); 4146 if (!ret) { 4147 index = inode->dir_index; 4148 goto skip_backref; 4149 } 4150 } 4151 4152 ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); 4153 if (ret) { 4154 btrfs_info(fs_info, 4155 "failed to delete reference to %.*s, inode %llu parent %llu", 4156 name->len, name->name, ino, dir_ino); 4157 btrfs_abort_transaction(trans, ret); 4158 goto err; 4159 } 4160 skip_backref: 4161 if (rename_ctx) 4162 rename_ctx->index = index; 4163 4164 ret = btrfs_delete_delayed_dir_index(trans, dir, index); 4165 if (ret) { 4166 btrfs_abort_transaction(trans, ret); 4167 goto err; 4168 } 4169 4170 /* 4171 * If we are in a rename context, we don't need to update anything in the 4172 * log. That will be done later during the rename by btrfs_log_new_name(). 4173 * Besides that, doing it here would only cause extra unnecessary btree 4174 * operations on the log tree, increasing latency for applications. 4175 */ 4176 if (!rename_ctx) { 4177 btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino); 4178 btrfs_del_dir_entries_in_log(trans, root, name, dir, index); 4179 } 4180 4181 /* 4182 * If we have a pending delayed iput we could end up with the final iput 4183 * being run in btrfs-cleaner context. If we have enough of these built 4184 * up we can end up burning a lot of time in btrfs-cleaner without any 4185 * way to throttle the unlinks. Since we're currently holding a ref on 4186 * the inode we can run the delayed iput here without any issues as the 4187 * final iput won't be done until after we drop the ref we're currently 4188 * holding. 
4189 */ 4190 btrfs_run_delayed_iput(fs_info, inode); 4191 err: 4192 btrfs_free_path(path); 4193 if (ret) 4194 goto out; 4195 4196 btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); 4197 inode_inc_iversion(&inode->vfs_inode); 4198 inode_inc_iversion(&dir->vfs_inode); 4199 inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); 4200 ret = btrfs_update_inode(trans, dir); 4201 out: 4202 return ret; 4203 } 4204 4205 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 4206 struct btrfs_inode *dir, struct btrfs_inode *inode, 4207 const struct fscrypt_str *name) 4208 { 4209 int ret; 4210 4211 ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); 4212 if (!ret) { 4213 drop_nlink(&inode->vfs_inode); 4214 ret = btrfs_update_inode(trans, inode); 4215 } 4216 return ret; 4217 } 4218 4219 /* 4220 * helper to start transaction for unlink and rmdir. 4221 * 4222 * unlink and rmdir are special in btrfs, they do not always free space, so 4223 * if we cannot make our reservations the normal way try and see if there is 4224 * plenty of slack room in the global reserve to migrate, otherwise we cannot 4225 * allow the unlink to occur. 4226 */ 4227 static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir) 4228 { 4229 struct btrfs_root *root = dir->root; 4230 4231 return btrfs_start_transaction_fallback_global_rsv(root, 4232 BTRFS_UNLINK_METADATA_UNITS); 4233 } 4234 4235 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 4236 { 4237 struct btrfs_trans_handle *trans; 4238 struct inode *inode = d_inode(dentry); 4239 int ret; 4240 struct fscrypt_name fname; 4241 4242 ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); 4243 if (ret) 4244 return ret; 4245 4246 /* This needs to handle no-key deletions later on */ 4247 4248 trans = __unlink_start_trans(BTRFS_I(dir)); 4249 if (IS_ERR(trans)) { 4250 ret = PTR_ERR(trans); 4251 goto fscrypt_free; 4252 } 4253 4254 btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 4255 false); 4256 4257 ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 4258 &fname.disk_name); 4259 if (ret) 4260 goto end_trans; 4261 4262 if (inode->i_nlink == 0) { 4263 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 4264 if (ret) 4265 goto end_trans; 4266 } 4267 4268 end_trans: 4269 btrfs_end_transaction(trans); 4270 btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); 4271 fscrypt_free: 4272 fscrypt_free_filename(&fname); 4273 return ret; 4274 } 4275 4276 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 4277 struct btrfs_inode *dir, struct dentry *dentry) 4278 { 4279 struct btrfs_root *root = dir->root; 4280 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); 4281 struct btrfs_path *path; 4282 struct extent_buffer *leaf; 4283 struct btrfs_dir_item *di; 4284 struct btrfs_key key; 4285 u64 index; 4286 int ret; 4287 u64 objectid; 4288 u64 dir_ino = btrfs_ino(dir); 4289 struct fscrypt_name fname; 4290 4291 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); 4292 if (ret) 4293 return ret; 4294 4295 /* This needs to handle no-key deletions later on */ 4296 4297 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { 4298 objectid = btrfs_root_id(inode->root); 4299 } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { 4300 objectid = inode->ref_root_id; 4301 } else { 4302 WARN_ON(1); 4303 fscrypt_free_filename(&fname); 4304 return -EINVAL; 4305 } 4306 4307 path = btrfs_alloc_path(); 4308 if (!path) { 4309 ret = -ENOMEM; 4310 goto 
out; 4311 } 4312 4313 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 4314 &fname.disk_name, -1); 4315 if (IS_ERR_OR_NULL(di)) { 4316 ret = di ? PTR_ERR(di) : -ENOENT; 4317 goto out; 4318 } 4319 4320 leaf = path->nodes[0]; 4321 btrfs_dir_item_key_to_cpu(leaf, di, &key); 4322 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 4323 ret = btrfs_delete_one_dir_name(trans, root, path, di); 4324 if (ret) { 4325 btrfs_abort_transaction(trans, ret); 4326 goto out; 4327 } 4328 btrfs_release_path(path); 4329 4330 /* 4331 * This is a placeholder inode for a subvolume we didn't have a 4332 * reference to at the time of the snapshot creation. In the meantime 4333 * we could have renamed the real subvol link into our snapshot, so 4334 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. 4335 * Instead simply lookup the dir_index_item for this entry so we can 4336 * remove it. Otherwise we know we have a ref to the root and we can 4337 * call btrfs_del_root_ref, and it _shouldn't_ fail. 4338 */ 4339 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { 4340 di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); 4341 if (IS_ERR_OR_NULL(di)) { 4342 if (!di) 4343 ret = -ENOENT; 4344 else 4345 ret = PTR_ERR(di); 4346 btrfs_abort_transaction(trans, ret); 4347 goto out; 4348 } 4349 4350 leaf = path->nodes[0]; 4351 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4352 index = key.offset; 4353 btrfs_release_path(path); 4354 } else { 4355 ret = btrfs_del_root_ref(trans, objectid, 4356 btrfs_root_id(root), dir_ino, 4357 &index, &fname.disk_name); 4358 if (ret) { 4359 btrfs_abort_transaction(trans, ret); 4360 goto out; 4361 } 4362 } 4363 4364 ret = btrfs_delete_delayed_dir_index(trans, dir, index); 4365 if (ret) { 4366 btrfs_abort_transaction(trans, ret); 4367 goto out; 4368 } 4369 4370 btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); 4371 inode_inc_iversion(&dir->vfs_inode); 4372 inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); 4373 ret = btrfs_update_inode_fallback(trans, dir); 4374 if (ret) 4375 btrfs_abort_transaction(trans, ret); 4376 out: 4377 btrfs_free_path(path); 4378 fscrypt_free_filename(&fname); 4379 return ret; 4380 } 4381 4382 /* 4383 * Helper to check if the subvolume references other subvolumes or if it's 4384 * default. 
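 *
 * For example, if a snapshot or subvolume was created inside this subvolume,
 * the root tree still holds a ROOT_REF item keyed by this root's id and the
 * check below fails with -ENOTEMPTY until that reference goes away.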
4385 */ 4386 static noinline int may_destroy_subvol(struct btrfs_root *root) 4387 { 4388 struct btrfs_fs_info *fs_info = root->fs_info; 4389 struct btrfs_path *path; 4390 struct btrfs_dir_item *di; 4391 struct btrfs_key key; 4392 struct fscrypt_str name = FSTR_INIT("default", 7); 4393 u64 dir_id; 4394 int ret; 4395 4396 path = btrfs_alloc_path(); 4397 if (!path) 4398 return -ENOMEM; 4399 4400 /* Make sure this root isn't set as the default subvol */ 4401 dir_id = btrfs_super_root_dir(fs_info->super_copy); 4402 di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, 4403 dir_id, &name, 0); 4404 if (di && !IS_ERR(di)) { 4405 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 4406 if (key.objectid == btrfs_root_id(root)) { 4407 ret = -EPERM; 4408 btrfs_err(fs_info, 4409 "deleting default subvolume %llu is not allowed", 4410 key.objectid); 4411 goto out; 4412 } 4413 btrfs_release_path(path); 4414 } 4415 4416 key.objectid = btrfs_root_id(root); 4417 key.type = BTRFS_ROOT_REF_KEY; 4418 key.offset = (u64)-1; 4419 4420 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4421 if (ret < 0) 4422 goto out; 4423 if (ret == 0) { 4424 /* 4425 * Key with offset -1 found, there would have to exist a root 4426 * with such id, but this is out of valid range. 4427 */ 4428 ret = -EUCLEAN; 4429 goto out; 4430 } 4431 4432 ret = 0; 4433 if (path->slots[0] > 0) { 4434 path->slots[0]--; 4435 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 4436 if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY) 4437 ret = -ENOTEMPTY; 4438 } 4439 out: 4440 btrfs_free_path(path); 4441 return ret; 4442 } 4443 4444 /* Delete all dentries for inodes belonging to the root */ 4445 static void btrfs_prune_dentries(struct btrfs_root *root) 4446 { 4447 struct btrfs_fs_info *fs_info = root->fs_info; 4448 struct btrfs_inode *inode; 4449 u64 min_ino = 0; 4450 4451 if (!BTRFS_FS_ERROR(fs_info)) 4452 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4453 4454 inode = btrfs_find_first_inode(root, min_ino); 4455 while (inode) { 4456 if (atomic_read(&inode->vfs_inode.i_count) > 1) 4457 d_prune_aliases(&inode->vfs_inode); 4458 4459 min_ino = btrfs_ino(inode) + 1; 4460 /* 4461 * btrfs_drop_inode() will have it removed from the inode 4462 * cache when its usage count hits zero. 4463 */ 4464 iput(&inode->vfs_inode); 4465 cond_resched(); 4466 inode = btrfs_find_first_inode(root, min_ino); 4467 } 4468 } 4469 4470 int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) 4471 { 4472 struct btrfs_root *root = dir->root; 4473 struct btrfs_fs_info *fs_info = root->fs_info; 4474 struct inode *inode = d_inode(dentry); 4475 struct btrfs_root *dest = BTRFS_I(inode)->root; 4476 struct btrfs_trans_handle *trans; 4477 struct btrfs_block_rsv block_rsv; 4478 u64 root_flags; 4479 u64 qgroup_reserved = 0; 4480 int ret; 4481 4482 down_write(&fs_info->subvol_sem); 4483 4484 /* 4485 * Don't allow to delete a subvolume with send in progress. This is 4486 * inside the inode lock so the error handling that has to drop the bit 4487 * again is not run concurrently. 
4488 */ 4489 spin_lock(&dest->root_item_lock); 4490 if (dest->send_in_progress) { 4491 spin_unlock(&dest->root_item_lock); 4492 btrfs_warn(fs_info, 4493 "attempt to delete subvolume %llu during send", 4494 btrfs_root_id(dest)); 4495 ret = -EPERM; 4496 goto out_up_write; 4497 } 4498 if (atomic_read(&dest->nr_swapfiles)) { 4499 spin_unlock(&dest->root_item_lock); 4500 btrfs_warn(fs_info, 4501 "attempt to delete subvolume %llu with active swapfile", 4502 btrfs_root_id(root)); 4503 ret = -EPERM; 4504 goto out_up_write; 4505 } 4506 root_flags = btrfs_root_flags(&dest->root_item); 4507 btrfs_set_root_flags(&dest->root_item, 4508 root_flags | BTRFS_ROOT_SUBVOL_DEAD); 4509 spin_unlock(&dest->root_item_lock); 4510 4511 ret = may_destroy_subvol(dest); 4512 if (ret) 4513 goto out_undead; 4514 4515 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 4516 /* 4517 * One for dir inode, 4518 * two for dir entries, 4519 * two for root ref/backref. 4520 */ 4521 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); 4522 if (ret) 4523 goto out_undead; 4524 qgroup_reserved = block_rsv.qgroup_rsv_reserved; 4525 4526 trans = btrfs_start_transaction(root, 0); 4527 if (IS_ERR(trans)) { 4528 ret = PTR_ERR(trans); 4529 goto out_release; 4530 } 4531 btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); 4532 qgroup_reserved = 0; 4533 trans->block_rsv = &block_rsv; 4534 trans->bytes_reserved = block_rsv.size; 4535 4536 btrfs_record_snapshot_destroy(trans, dir); 4537 4538 ret = btrfs_unlink_subvol(trans, dir, dentry); 4539 if (ret) { 4540 btrfs_abort_transaction(trans, ret); 4541 goto out_end_trans; 4542 } 4543 4544 ret = btrfs_record_root_in_trans(trans, dest); 4545 if (ret) { 4546 btrfs_abort_transaction(trans, ret); 4547 goto out_end_trans; 4548 } 4549 4550 memset(&dest->root_item.drop_progress, 0, 4551 sizeof(dest->root_item.drop_progress)); 4552 btrfs_set_root_drop_level(&dest->root_item, 0); 4553 btrfs_set_root_refs(&dest->root_item, 0); 4554 4555 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { 4556 ret = btrfs_insert_orphan_item(trans, 4557 fs_info->tree_root, 4558 btrfs_root_id(dest)); 4559 if (ret) { 4560 btrfs_abort_transaction(trans, ret); 4561 goto out_end_trans; 4562 } 4563 } 4564 4565 ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, 4566 BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest)); 4567 if (ret && ret != -ENOENT) { 4568 btrfs_abort_transaction(trans, ret); 4569 goto out_end_trans; 4570 } 4571 if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { 4572 ret = btrfs_uuid_tree_remove(trans, 4573 dest->root_item.received_uuid, 4574 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4575 btrfs_root_id(dest)); 4576 if (ret && ret != -ENOENT) { 4577 btrfs_abort_transaction(trans, ret); 4578 goto out_end_trans; 4579 } 4580 } 4581 4582 free_anon_bdev(dest->anon_dev); 4583 dest->anon_dev = 0; 4584 out_end_trans: 4585 trans->block_rsv = NULL; 4586 trans->bytes_reserved = 0; 4587 ret = btrfs_end_transaction(trans); 4588 inode->i_flags |= S_DEAD; 4589 out_release: 4590 btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); 4591 if (qgroup_reserved) 4592 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); 4593 out_undead: 4594 if (ret) { 4595 spin_lock(&dest->root_item_lock); 4596 root_flags = btrfs_root_flags(&dest->root_item); 4597 btrfs_set_root_flags(&dest->root_item, 4598 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); 4599 spin_unlock(&dest->root_item_lock); 4600 } 4601 out_up_write: 4602 up_write(&fs_info->subvol_sem); 4603 if (!ret) { 4604 d_invalidate(dentry); 4605 
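		/*
		 * Get rid of any other cached dentries that still point into
		 * the deleted subvolume so their inodes can be released.
		 */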
btrfs_prune_dentries(dest); 4606 ASSERT(dest->send_in_progress == 0); 4607 } 4608 4609 return ret; 4610 } 4611 4612 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 4613 { 4614 struct inode *inode = d_inode(dentry); 4615 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 4616 int ret = 0; 4617 struct btrfs_trans_handle *trans; 4618 u64 last_unlink_trans; 4619 struct fscrypt_name fname; 4620 4621 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 4622 return -ENOTEMPTY; 4623 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) { 4624 if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { 4625 btrfs_err(fs_info, 4626 "extent tree v2 doesn't support snapshot deletion yet"); 4627 return -EOPNOTSUPP; 4628 } 4629 return btrfs_delete_subvolume(BTRFS_I(dir), dentry); 4630 } 4631 4632 ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); 4633 if (ret) 4634 return ret; 4635 4636 /* This needs to handle no-key deletions later on */ 4637 4638 trans = __unlink_start_trans(BTRFS_I(dir)); 4639 if (IS_ERR(trans)) { 4640 ret = PTR_ERR(trans); 4641 goto out_notrans; 4642 } 4643 4644 if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 4645 ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); 4646 goto out; 4647 } 4648 4649 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 4650 if (ret) 4651 goto out; 4652 4653 last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; 4654 4655 /* now the directory is empty */ 4656 ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 4657 &fname.disk_name); 4658 if (!ret) { 4659 btrfs_i_size_write(BTRFS_I(inode), 0); 4660 /* 4661 * Propagate the last_unlink_trans value of the deleted dir to 4662 * its parent directory. This is to prevent an unrecoverable 4663 * log tree in the case we do something like this: 4664 * 1) create dir foo 4665 * 2) create snapshot under dir foo 4666 * 3) delete the snapshot 4667 * 4) rmdir foo 4668 * 5) mkdir foo 4669 * 6) fsync foo or some file inside foo 4670 */ 4671 if (last_unlink_trans >= trans->transid) 4672 BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; 4673 } 4674 out: 4675 btrfs_end_transaction(trans); 4676 out_notrans: 4677 btrfs_btree_balance_dirty(fs_info); 4678 fscrypt_free_filename(&fname); 4679 4680 return ret; 4681 } 4682 4683 /* 4684 * Read, zero a chunk and write a block. 4685 * 4686 * @inode - inode that we're zeroing 4687 * @from - the offset to start zeroing 4688 * @len - the length to zero, 0 to zero the entire range respective to the 4689 * offset 4690 * @front - zero up to the offset instead of from the offset on 4691 * 4692 * This will find the block for the "from" offset and cow the block and zero the 4693 * part we want to zero. This is used with truncate and hole punching. 
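 *
 * A worked example (illustrative numbers, assuming a 4K block size): a call
 * with from=5000, len=0 and front=0 computes block_start=4096 and offset=904,
 * reads the block in if it is not uptodate and zeroes bytes 5000..8191, i.e.
 * from the offset to the end of the block. With front=1 it would instead
 * zero bytes 4096..4999, the part of the block in front of the offset.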
4694 */ 4695 int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, 4696 int front) 4697 { 4698 struct btrfs_fs_info *fs_info = inode->root->fs_info; 4699 struct address_space *mapping = inode->vfs_inode.i_mapping; 4700 struct extent_io_tree *io_tree = &inode->io_tree; 4701 struct btrfs_ordered_extent *ordered; 4702 struct extent_state *cached_state = NULL; 4703 struct extent_changeset *data_reserved = NULL; 4704 bool only_release_metadata = false; 4705 u32 blocksize = fs_info->sectorsize; 4706 pgoff_t index = from >> PAGE_SHIFT; 4707 unsigned offset = from & (blocksize - 1); 4708 struct folio *folio; 4709 gfp_t mask = btrfs_alloc_write_mask(mapping); 4710 size_t write_bytes = blocksize; 4711 int ret = 0; 4712 u64 block_start; 4713 u64 block_end; 4714 4715 if (IS_ALIGNED(offset, blocksize) && 4716 (!len || IS_ALIGNED(len, blocksize))) 4717 goto out; 4718 4719 block_start = round_down(from, blocksize); 4720 block_end = block_start + blocksize - 1; 4721 4722 ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, 4723 blocksize, false); 4724 if (ret < 0) { 4725 if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) { 4726 /* For nocow case, no need to reserve data space */ 4727 only_release_metadata = true; 4728 } else { 4729 goto out; 4730 } 4731 } 4732 ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false); 4733 if (ret < 0) { 4734 if (!only_release_metadata) 4735 btrfs_free_reserved_data_space(inode, data_reserved, 4736 block_start, blocksize); 4737 goto out; 4738 } 4739 again: 4740 folio = __filemap_get_folio(mapping, index, 4741 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); 4742 if (IS_ERR(folio)) { 4743 btrfs_delalloc_release_space(inode, data_reserved, block_start, 4744 blocksize, true); 4745 btrfs_delalloc_release_extents(inode, blocksize); 4746 ret = -ENOMEM; 4747 goto out; 4748 } 4749 4750 if (!folio_test_uptodate(folio)) { 4751 ret = btrfs_read_folio(NULL, folio); 4752 folio_lock(folio); 4753 if (folio->mapping != mapping) { 4754 folio_unlock(folio); 4755 folio_put(folio); 4756 goto again; 4757 } 4758 if (!folio_test_uptodate(folio)) { 4759 ret = -EIO; 4760 goto out_unlock; 4761 } 4762 } 4763 4764 /* 4765 * We unlock the page after the io is completed and then re-lock it 4766 * above. release_folio() could have come in between that and cleared 4767 * folio private, but left the page in the mapping. Set the page mapped 4768 * here to make sure it's properly set for the subpage stuff. 
4769 */ 4770 ret = set_folio_extent_mapped(folio); 4771 if (ret < 0) 4772 goto out_unlock; 4773 4774 folio_wait_writeback(folio); 4775 4776 lock_extent(io_tree, block_start, block_end, &cached_state); 4777 4778 ordered = btrfs_lookup_ordered_extent(inode, block_start); 4779 if (ordered) { 4780 unlock_extent(io_tree, block_start, block_end, &cached_state); 4781 folio_unlock(folio); 4782 folio_put(folio); 4783 btrfs_start_ordered_extent(ordered); 4784 btrfs_put_ordered_extent(ordered); 4785 goto again; 4786 } 4787 4788 clear_extent_bit(&inode->io_tree, block_start, block_end, 4789 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 4790 &cached_state); 4791 4792 ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, 4793 &cached_state); 4794 if (ret) { 4795 unlock_extent(io_tree, block_start, block_end, &cached_state); 4796 goto out_unlock; 4797 } 4798 4799 if (offset != blocksize) { 4800 if (!len) 4801 len = blocksize - offset; 4802 if (front) 4803 folio_zero_range(folio, block_start - folio_pos(folio), 4804 offset); 4805 else 4806 folio_zero_range(folio, 4807 (block_start - folio_pos(folio)) + offset, 4808 len); 4809 } 4810 btrfs_folio_clear_checked(fs_info, folio, block_start, 4811 block_end + 1 - block_start); 4812 btrfs_folio_set_dirty(fs_info, folio, block_start, 4813 block_end + 1 - block_start); 4814 unlock_extent(io_tree, block_start, block_end, &cached_state); 4815 4816 if (only_release_metadata) 4817 set_extent_bit(&inode->io_tree, block_start, block_end, 4818 EXTENT_NORESERVE, NULL); 4819 4820 out_unlock: 4821 if (ret) { 4822 if (only_release_metadata) 4823 btrfs_delalloc_release_metadata(inode, blocksize, true); 4824 else 4825 btrfs_delalloc_release_space(inode, data_reserved, 4826 block_start, blocksize, true); 4827 } 4828 btrfs_delalloc_release_extents(inode, blocksize); 4829 folio_unlock(folio); 4830 folio_put(folio); 4831 out: 4832 if (only_release_metadata) 4833 btrfs_check_nocow_unlock(inode); 4834 extent_changeset_free(data_reserved); 4835 return ret; 4836 } 4837 4838 static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len) 4839 { 4840 struct btrfs_root *root = inode->root; 4841 struct btrfs_fs_info *fs_info = root->fs_info; 4842 struct btrfs_trans_handle *trans; 4843 struct btrfs_drop_extents_args drop_args = { 0 }; 4844 int ret; 4845 4846 /* 4847 * If NO_HOLES is enabled, we don't need to do anything. 4848 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans() 4849 * or btrfs_update_inode() will be called, which guarantee that the next 4850 * fsync will know this inode was changed and needs to be logged. 4851 */ 4852 if (btrfs_fs_incompat(fs_info, NO_HOLES)) 4853 return 0; 4854 4855 /* 4856 * 1 - for the one we're dropping 4857 * 1 - for the one we're adding 4858 * 1 - for updating the inode. 
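	 *
	 * Hence the 3 metadata units passed to btrfs_start_transaction() below.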
4859 */ 4860 trans = btrfs_start_transaction(root, 3); 4861 if (IS_ERR(trans)) 4862 return PTR_ERR(trans); 4863 4864 drop_args.start = offset; 4865 drop_args.end = offset + len; 4866 drop_args.drop_cache = true; 4867 4868 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 4869 if (ret) { 4870 btrfs_abort_transaction(trans, ret); 4871 btrfs_end_transaction(trans); 4872 return ret; 4873 } 4874 4875 ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len); 4876 if (ret) { 4877 btrfs_abort_transaction(trans, ret); 4878 } else { 4879 btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); 4880 btrfs_update_inode(trans, inode); 4881 } 4882 btrfs_end_transaction(trans); 4883 return ret; 4884 } 4885 4886 /* 4887 * This function puts in dummy file extents for the area we're creating a hole 4888 * for. So if we are truncating this file to a larger size we need to insert 4889 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for 4890 * the range between oldsize and size 4891 */ 4892 int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) 4893 { 4894 struct btrfs_root *root = inode->root; 4895 struct btrfs_fs_info *fs_info = root->fs_info; 4896 struct extent_io_tree *io_tree = &inode->io_tree; 4897 struct extent_map *em = NULL; 4898 struct extent_state *cached_state = NULL; 4899 u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); 4900 u64 block_end = ALIGN(size, fs_info->sectorsize); 4901 u64 last_byte; 4902 u64 cur_offset; 4903 u64 hole_size; 4904 int ret = 0; 4905 4906 /* 4907 * If our size started in the middle of a block we need to zero out the 4908 * rest of the block before we expand the i_size, otherwise we could 4909 * expose stale data. 4910 */ 4911 ret = btrfs_truncate_block(inode, oldsize, 0, 0); 4912 if (ret) 4913 return ret; 4914 4915 if (size <= hole_start) 4916 return 0; 4917 4918 btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1, 4919 &cached_state); 4920 cur_offset = hole_start; 4921 while (1) { 4922 em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset); 4923 if (IS_ERR(em)) { 4924 ret = PTR_ERR(em); 4925 em = NULL; 4926 break; 4927 } 4928 last_byte = min(extent_map_end(em), block_end); 4929 last_byte = ALIGN(last_byte, fs_info->sectorsize); 4930 hole_size = last_byte - cur_offset; 4931 4932 if (!(em->flags & EXTENT_FLAG_PREALLOC)) { 4933 struct extent_map *hole_em; 4934 4935 ret = maybe_insert_hole(inode, cur_offset, hole_size); 4936 if (ret) 4937 break; 4938 4939 ret = btrfs_inode_set_file_extent_range(inode, 4940 cur_offset, hole_size); 4941 if (ret) 4942 break; 4943 4944 hole_em = alloc_extent_map(); 4945 if (!hole_em) { 4946 btrfs_drop_extent_map_range(inode, cur_offset, 4947 cur_offset + hole_size - 1, 4948 false); 4949 btrfs_set_inode_full_sync(inode); 4950 goto next; 4951 } 4952 hole_em->start = cur_offset; 4953 hole_em->len = hole_size; 4954 4955 hole_em->disk_bytenr = EXTENT_MAP_HOLE; 4956 hole_em->disk_num_bytes = 0; 4957 hole_em->ram_bytes = hole_size; 4958 hole_em->generation = btrfs_get_fs_generation(fs_info); 4959 4960 ret = btrfs_replace_extent_map_range(inode, hole_em, true); 4961 free_extent_map(hole_em); 4962 } else { 4963 ret = btrfs_inode_set_file_extent_range(inode, 4964 cur_offset, hole_size); 4965 if (ret) 4966 break; 4967 } 4968 next: 4969 free_extent_map(em); 4970 em = NULL; 4971 cur_offset = last_byte; 4972 if (cur_offset >= block_end) 4973 break; 4974 } 4975 free_extent_map(em); 4976 unlock_extent(io_tree, hole_start, block_end - 1, 
&cached_state); 4977 return ret; 4978 } 4979 4980 static int btrfs_setsize(struct inode *inode, struct iattr *attr) 4981 { 4982 struct btrfs_root *root = BTRFS_I(inode)->root; 4983 struct btrfs_trans_handle *trans; 4984 loff_t oldsize = i_size_read(inode); 4985 loff_t newsize = attr->ia_size; 4986 int mask = attr->ia_valid; 4987 int ret; 4988 4989 /* 4990 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a 4991 * special case where we need to update the times despite not having 4992 * these flags set. For all other operations the VFS set these flags 4993 * explicitly if it wants a timestamp update. 4994 */ 4995 if (newsize != oldsize) { 4996 inode_inc_iversion(inode); 4997 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { 4998 inode_set_mtime_to_ts(inode, 4999 inode_set_ctime_current(inode)); 5000 } 5001 } 5002 5003 if (newsize > oldsize) { 5004 /* 5005 * Don't do an expanding truncate while snapshotting is ongoing. 5006 * This is to ensure the snapshot captures a fully consistent 5007 * state of this file - if the snapshot captures this expanding 5008 * truncation, it must capture all writes that happened before 5009 * this truncation. 5010 */ 5011 btrfs_drew_write_lock(&root->snapshot_lock); 5012 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize); 5013 if (ret) { 5014 btrfs_drew_write_unlock(&root->snapshot_lock); 5015 return ret; 5016 } 5017 5018 trans = btrfs_start_transaction(root, 1); 5019 if (IS_ERR(trans)) { 5020 btrfs_drew_write_unlock(&root->snapshot_lock); 5021 return PTR_ERR(trans); 5022 } 5023 5024 i_size_write(inode, newsize); 5025 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 5026 pagecache_isize_extended(inode, oldsize, newsize); 5027 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 5028 btrfs_drew_write_unlock(&root->snapshot_lock); 5029 btrfs_end_transaction(trans); 5030 } else { 5031 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 5032 5033 if (btrfs_is_zoned(fs_info)) { 5034 ret = btrfs_wait_ordered_range(BTRFS_I(inode), 5035 ALIGN(newsize, fs_info->sectorsize), 5036 (u64)-1); 5037 if (ret) 5038 return ret; 5039 } 5040 5041 /* 5042 * We're truncating a file that used to have good data down to 5043 * zero. Make sure any new writes to the file get on disk 5044 * on close. 5045 */ 5046 if (newsize == 0) 5047 set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, 5048 &BTRFS_I(inode)->runtime_flags); 5049 5050 truncate_setsize(inode, newsize); 5051 5052 inode_dio_wait(inode); 5053 5054 ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); 5055 if (ret && inode->i_nlink) { 5056 int err; 5057 5058 /* 5059 * Truncate failed, so fix up the in-memory size. We 5060 * adjusted disk_i_size down as we removed extents, so 5061 * wait for disk_i_size to be stable and then update the 5062 * in-memory size to match. 
5063 */ 5064 err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); 5065 if (err) 5066 return err; 5067 i_size_write(inode, BTRFS_I(inode)->disk_i_size); 5068 } 5069 } 5070 5071 return ret; 5072 } 5073 5074 static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 5075 struct iattr *attr) 5076 { 5077 struct inode *inode = d_inode(dentry); 5078 struct btrfs_root *root = BTRFS_I(inode)->root; 5079 int err; 5080 5081 if (btrfs_root_readonly(root)) 5082 return -EROFS; 5083 5084 err = setattr_prepare(idmap, dentry, attr); 5085 if (err) 5086 return err; 5087 5088 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 5089 err = btrfs_setsize(inode, attr); 5090 if (err) 5091 return err; 5092 } 5093 5094 if (attr->ia_valid) { 5095 setattr_copy(idmap, inode, attr); 5096 inode_inc_iversion(inode); 5097 err = btrfs_dirty_inode(BTRFS_I(inode)); 5098 5099 if (!err && attr->ia_valid & ATTR_MODE) 5100 err = posix_acl_chmod(idmap, dentry, inode->i_mode); 5101 } 5102 5103 return err; 5104 } 5105 5106 /* 5107 * While truncating the inode pages during eviction, we get the VFS 5108 * calling btrfs_invalidate_folio() against each folio of the inode. This 5109 * is slow because the calls to btrfs_invalidate_folio() result in a 5110 * huge amount of calls to lock_extent() and clear_extent_bit(), 5111 * which keep merging and splitting extent_state structures over and over, 5112 * wasting lots of time. 5113 * 5114 * Therefore if the inode is being evicted, let btrfs_invalidate_folio() 5115 * skip all those expensive operations on a per folio basis and do only 5116 * the ordered io finishing, while we release here the extent_map and 5117 * extent_state structures, without the excessive merging and splitting. 5118 */ 5119 static void evict_inode_truncate_pages(struct inode *inode) 5120 { 5121 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5122 struct rb_node *node; 5123 5124 ASSERT(inode->i_state & I_FREEING); 5125 truncate_inode_pages_final(&inode->i_data); 5126 5127 btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); 5128 5129 /* 5130 * Keep looping until we have no more ranges in the io tree. 5131 * We can have ongoing bios started by readahead that have 5132 * their endio callback (extent_io.c:end_bio_extent_readpage) 5133 * still in progress (unlocked the pages in the bio but did not yet 5134 * unlocked the ranges in the io tree). Therefore this means some 5135 * ranges can still be locked and eviction started because before 5136 * submitting those bios, which are executed by a separate task (work 5137 * queue kthread), inode references (inode->i_count) were not taken 5138 * (which would be dropped in the end io callback of each bio). 5139 * Therefore here we effectively end up waiting for those bios and 5140 * anyone else holding locked ranges without having bumped the inode's 5141 * reference count - if we don't do it, when they access the inode's 5142 * io_tree to unlock a range it may be too late, leading to an 5143 * use-after-free issue. 
5144 */ 5145 spin_lock(&io_tree->lock); 5146 while (!RB_EMPTY_ROOT(&io_tree->state)) { 5147 struct extent_state *state; 5148 struct extent_state *cached_state = NULL; 5149 u64 start; 5150 u64 end; 5151 unsigned state_flags; 5152 5153 node = rb_first(&io_tree->state); 5154 state = rb_entry(node, struct extent_state, rb_node); 5155 start = state->start; 5156 end = state->end; 5157 state_flags = state->state; 5158 spin_unlock(&io_tree->lock); 5159 5160 lock_extent(io_tree, start, end, &cached_state); 5161 5162 /* 5163 * If still has DELALLOC flag, the extent didn't reach disk, 5164 * and its reserved space won't be freed by delayed_ref. 5165 * So we need to free its reserved space here. 5166 * (Refer to comment in btrfs_invalidate_folio, case 2) 5167 * 5168 * Note, end is the bytenr of last byte, so we need + 1 here. 5169 */ 5170 if (state_flags & EXTENT_DELALLOC) 5171 btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, 5172 end - start + 1, NULL); 5173 5174 clear_extent_bit(io_tree, start, end, 5175 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, 5176 &cached_state); 5177 5178 cond_resched(); 5179 spin_lock(&io_tree->lock); 5180 } 5181 spin_unlock(&io_tree->lock); 5182 } 5183 5184 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, 5185 struct btrfs_block_rsv *rsv) 5186 { 5187 struct btrfs_fs_info *fs_info = root->fs_info; 5188 struct btrfs_trans_handle *trans; 5189 u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1); 5190 int ret; 5191 5192 /* 5193 * Eviction should be taking place at some place safe because of our 5194 * delayed iputs. However the normal flushing code will run delayed 5195 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock. 5196 * 5197 * We reserve the delayed_refs_extra here again because we can't use 5198 * btrfs_start_transaction(root, 0) for the same deadlocky reason as 5199 * above. We reserve our extra bit here because we generate a ton of 5200 * delayed refs activity by truncating. 5201 * 5202 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can, 5203 * if we fail to make this reservation we can re-try without the 5204 * delayed_refs_extra so we can make some forward progress. 
5205 */ 5206 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra, 5207 BTRFS_RESERVE_FLUSH_EVICT); 5208 if (ret) { 5209 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size, 5210 BTRFS_RESERVE_FLUSH_EVICT); 5211 if (ret) { 5212 btrfs_warn(fs_info, 5213 "could not allocate space for delete; will truncate on mount"); 5214 return ERR_PTR(-ENOSPC); 5215 } 5216 delayed_refs_extra = 0; 5217 } 5218 5219 trans = btrfs_join_transaction(root); 5220 if (IS_ERR(trans)) 5221 return trans; 5222 5223 if (delayed_refs_extra) { 5224 trans->block_rsv = &fs_info->trans_block_rsv; 5225 trans->bytes_reserved = delayed_refs_extra; 5226 btrfs_block_rsv_migrate(rsv, trans->block_rsv, 5227 delayed_refs_extra, true); 5228 } 5229 return trans; 5230 } 5231 5232 void btrfs_evict_inode(struct inode *inode) 5233 { 5234 struct btrfs_fs_info *fs_info; 5235 struct btrfs_trans_handle *trans; 5236 struct btrfs_root *root = BTRFS_I(inode)->root; 5237 struct btrfs_block_rsv *rsv = NULL; 5238 int ret; 5239 5240 trace_btrfs_inode_evict(inode); 5241 5242 if (!root) { 5243 fsverity_cleanup_inode(inode); 5244 clear_inode(inode); 5245 return; 5246 } 5247 5248 fs_info = inode_to_fs_info(inode); 5249 evict_inode_truncate_pages(inode); 5250 5251 if (inode->i_nlink && 5252 ((btrfs_root_refs(&root->root_item) != 0 && 5253 btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) || 5254 btrfs_is_free_space_inode(BTRFS_I(inode)))) 5255 goto out; 5256 5257 if (is_bad_inode(inode)) 5258 goto out; 5259 5260 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) 5261 goto out; 5262 5263 if (inode->i_nlink > 0) { 5264 BUG_ON(btrfs_root_refs(&root->root_item) != 0 && 5265 btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID); 5266 goto out; 5267 } 5268 5269 /* 5270 * This makes sure the inode item in tree is uptodate and the space for 5271 * the inode update is released. 5272 */ 5273 ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); 5274 if (ret) 5275 goto out; 5276 5277 /* 5278 * This drops any pending insert or delete operations we have for this 5279 * inode. We could have a delayed dir index deletion queued up, but 5280 * we're removing the inode completely so that'll be taken care of in 5281 * the truncate. 5282 */ 5283 btrfs_kill_delayed_inode_items(BTRFS_I(inode)); 5284 5285 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); 5286 if (!rsv) 5287 goto out; 5288 rsv->size = btrfs_calc_metadata_size(fs_info, 1); 5289 rsv->failfast = true; 5290 5291 btrfs_i_size_write(BTRFS_I(inode), 0); 5292 5293 while (1) { 5294 struct btrfs_truncate_control control = { 5295 .inode = BTRFS_I(inode), 5296 .ino = btrfs_ino(BTRFS_I(inode)), 5297 .new_size = 0, 5298 .min_type = 0, 5299 }; 5300 5301 trans = evict_refill_and_join(root, rsv); 5302 if (IS_ERR(trans)) 5303 goto out; 5304 5305 trans->block_rsv = rsv; 5306 5307 ret = btrfs_truncate_inode_items(trans, root, &control); 5308 trans->block_rsv = &fs_info->trans_block_rsv; 5309 btrfs_end_transaction(trans); 5310 /* 5311 * We have not added new delayed items for our inode after we 5312 * have flushed its delayed items, so no need to throttle on 5313 * delayed items. However we have modified extent buffers. 5314 */ 5315 btrfs_btree_balance_dirty_nodelay(fs_info); 5316 if (ret && ret != -ENOSPC && ret != -EAGAIN) 5317 goto out; 5318 else if (!ret) 5319 break; 5320 } 5321 5322 /* 5323 * Errors here aren't a big deal, it just means we leave orphan items in 5324 * the tree. They will be cleaned up on the next mount. 
If the inode 5325 * number gets reused, cleanup deletes the orphan item without doing 5326 * anything, and unlink reuses the existing orphan item. 5327 * 5328 * If it turns out that we are dropping too many of these, we might want 5329 * to add a mechanism for retrying these after a commit. 5330 */ 5331 trans = evict_refill_and_join(root, rsv); 5332 if (!IS_ERR(trans)) { 5333 trans->block_rsv = rsv; 5334 btrfs_orphan_del(trans, BTRFS_I(inode)); 5335 trans->block_rsv = &fs_info->trans_block_rsv; 5336 btrfs_end_transaction(trans); 5337 } 5338 5339 out: 5340 btrfs_free_block_rsv(fs_info, rsv); 5341 /* 5342 * If we didn't successfully delete, the orphan item will still be in 5343 * the tree and we'll retry on the next mount. Again, we might also want 5344 * to retry these periodically in the future. 5345 */ 5346 btrfs_remove_delayed_node(BTRFS_I(inode)); 5347 fsverity_cleanup_inode(inode); 5348 clear_inode(inode); 5349 } 5350 5351 /* 5352 * Return the key found in the dir entry in the location pointer, fill @type 5353 * with BTRFS_FT_*, and return 0. 5354 * 5355 * If no dir entries were found, returns -ENOENT. 5356 * If found a corrupted location in dir entry, returns -EUCLEAN. 5357 */ 5358 static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, 5359 struct btrfs_key *location, u8 *type) 5360 { 5361 struct btrfs_dir_item *di; 5362 struct btrfs_path *path; 5363 struct btrfs_root *root = dir->root; 5364 int ret = 0; 5365 struct fscrypt_name fname; 5366 5367 path = btrfs_alloc_path(); 5368 if (!path) 5369 return -ENOMEM; 5370 5371 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); 5372 if (ret < 0) 5373 goto out; 5374 /* 5375 * fscrypt_setup_filename() should never return a positive value, but 5376 * gcc on sparc/parisc thinks it can, so assert that doesn't happen. 5377 */ 5378 ASSERT(ret == 0); 5379 5380 /* This needs to handle no-key deletions later on */ 5381 5382 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), 5383 &fname.disk_name, 0); 5384 if (IS_ERR_OR_NULL(di)) { 5385 ret = di ? PTR_ERR(di) : -ENOENT; 5386 goto out; 5387 } 5388 5389 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 5390 if (location->type != BTRFS_INODE_ITEM_KEY && 5391 location->type != BTRFS_ROOT_ITEM_KEY) { 5392 ret = -EUCLEAN; 5393 btrfs_warn(root->fs_info, 5394 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", 5395 __func__, fname.disk_name.name, btrfs_ino(dir), 5396 location->objectid, location->type, location->offset); 5397 } 5398 if (!ret) 5399 *type = btrfs_dir_ftype(path->nodes[0], di); 5400 out: 5401 fscrypt_free_filename(&fname); 5402 btrfs_free_path(path); 5403 return ret; 5404 } 5405 5406 /* 5407 * when we hit a tree root in a directory, the btrfs part of the inode 5408 * needs to be changed to reflect the root directory of the tree root. This 5409 * is kind of like crossing a mount point. 
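 *
 * For example, when a lookup walks from a directory into a subvolume, the dir
 * item stores a ROOT_ITEM key. This helper verifies the matching root ref and
 * name, grabs the new root and rewrites the location to the INODE_ITEM key of
 * that root's top-level directory.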
5410 */ 5411 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, 5412 struct btrfs_inode *dir, 5413 struct dentry *dentry, 5414 struct btrfs_key *location, 5415 struct btrfs_root **sub_root) 5416 { 5417 struct btrfs_path *path; 5418 struct btrfs_root *new_root; 5419 struct btrfs_root_ref *ref; 5420 struct extent_buffer *leaf; 5421 struct btrfs_key key; 5422 int ret; 5423 int err = 0; 5424 struct fscrypt_name fname; 5425 5426 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname); 5427 if (ret) 5428 return ret; 5429 5430 path = btrfs_alloc_path(); 5431 if (!path) { 5432 err = -ENOMEM; 5433 goto out; 5434 } 5435 5436 err = -ENOENT; 5437 key.objectid = btrfs_root_id(dir->root); 5438 key.type = BTRFS_ROOT_REF_KEY; 5439 key.offset = location->objectid; 5440 5441 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 5442 if (ret) { 5443 if (ret < 0) 5444 err = ret; 5445 goto out; 5446 } 5447 5448 leaf = path->nodes[0]; 5449 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 5450 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || 5451 btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len) 5452 goto out; 5453 5454 ret = memcmp_extent_buffer(leaf, fname.disk_name.name, 5455 (unsigned long)(ref + 1), fname.disk_name.len); 5456 if (ret) 5457 goto out; 5458 5459 btrfs_release_path(path); 5460 5461 new_root = btrfs_get_fs_root(fs_info, location->objectid, true); 5462 if (IS_ERR(new_root)) { 5463 err = PTR_ERR(new_root); 5464 goto out; 5465 } 5466 5467 *sub_root = new_root; 5468 location->objectid = btrfs_root_dirid(&new_root->root_item); 5469 location->type = BTRFS_INODE_ITEM_KEY; 5470 location->offset = 0; 5471 err = 0; 5472 out: 5473 btrfs_free_path(path); 5474 fscrypt_free_filename(&fname); 5475 return err; 5476 } 5477 5478 static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) 5479 { 5480 struct btrfs_root *root = inode->root; 5481 struct btrfs_inode *existing; 5482 const u64 ino = btrfs_ino(inode); 5483 int ret; 5484 5485 if (inode_unhashed(&inode->vfs_inode)) 5486 return 0; 5487 5488 if (prealloc) { 5489 ret = xa_reserve(&root->inodes, ino, GFP_NOFS); 5490 if (ret) 5491 return ret; 5492 } 5493 5494 existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); 5495 5496 if (xa_is_err(existing)) { 5497 ret = xa_err(existing); 5498 ASSERT(ret != -EINVAL); 5499 ASSERT(ret != -ENOMEM); 5500 return ret; 5501 } else if (existing) { 5502 WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); 5503 } 5504 5505 return 0; 5506 } 5507 5508 static void btrfs_del_inode_from_root(struct btrfs_inode *inode) 5509 { 5510 struct btrfs_root *root = inode->root; 5511 struct btrfs_inode *entry; 5512 bool empty = false; 5513 5514 xa_lock(&root->inodes); 5515 entry = __xa_erase(&root->inodes, btrfs_ino(inode)); 5516 if (entry == inode) 5517 empty = xa_empty(&root->inodes); 5518 xa_unlock(&root->inodes); 5519 5520 if (empty && btrfs_root_refs(&root->root_item) == 0) { 5521 xa_lock(&root->inodes); 5522 empty = xa_empty(&root->inodes); 5523 xa_unlock(&root->inodes); 5524 if (empty) 5525 btrfs_add_dead_root(root); 5526 } 5527 } 5528 5529 5530 static int btrfs_init_locked_inode(struct inode *inode, void *p) 5531 { 5532 struct btrfs_iget_args *args = p; 5533 5534 btrfs_set_inode_number(BTRFS_I(inode), args->ino); 5535 BTRFS_I(inode)->root = btrfs_grab_root(args->root); 5536 5537 if (args->root && args->root == args->root->fs_info->tree_root && 5538 args->ino != BTRFS_BTREE_INODE_OBJECTID) 5539 
set_bit(BTRFS_INODE_FREE_SPACE_INODE, 5540 &BTRFS_I(inode)->runtime_flags); 5541 return 0; 5542 } 5543 5544 static int btrfs_find_actor(struct inode *inode, void *opaque) 5545 { 5546 struct btrfs_iget_args *args = opaque; 5547 5548 return args->ino == btrfs_ino(BTRFS_I(inode)) && 5549 args->root == BTRFS_I(inode)->root; 5550 } 5551 5552 static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) 5553 { 5554 struct inode *inode; 5555 struct btrfs_iget_args args; 5556 unsigned long hashval = btrfs_inode_hash(ino, root); 5557 5558 args.ino = ino; 5559 args.root = root; 5560 5561 inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, 5562 btrfs_init_locked_inode, 5563 (void *)&args); 5564 return inode; 5565 } 5566 5567 /* 5568 * Get an inode object given its inode number and corresponding root. 5569 * Path can be preallocated to prevent recursing back to iget through 5570 * allocator. NULL is also valid but may require an additional allocation 5571 * later. 5572 */ 5573 struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, 5574 struct btrfs_path *path) 5575 { 5576 struct inode *inode; 5577 int ret; 5578 5579 inode = btrfs_iget_locked(ino, root); 5580 if (!inode) 5581 return ERR_PTR(-ENOMEM); 5582 5583 if (!(inode->i_state & I_NEW)) 5584 return inode; 5585 5586 ret = btrfs_read_locked_inode(inode, path); 5587 /* 5588 * ret > 0 can come from btrfs_search_slot called by 5589 * btrfs_read_locked_inode(), this means the inode item was not found. 5590 */ 5591 if (ret > 0) 5592 ret = -ENOENT; 5593 if (ret < 0) 5594 goto error; 5595 5596 ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); 5597 if (ret < 0) 5598 goto error; 5599 5600 unlock_new_inode(inode); 5601 5602 return inode; 5603 error: 5604 iget_failed(inode); 5605 return ERR_PTR(ret); 5606 } 5607 5608 struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) 5609 { 5610 return btrfs_iget_path(ino, root, NULL); 5611 } 5612 5613 static struct inode *new_simple_dir(struct inode *dir, 5614 struct btrfs_key *key, 5615 struct btrfs_root *root) 5616 { 5617 struct timespec64 ts; 5618 struct inode *inode = new_inode(dir->i_sb); 5619 5620 if (!inode) 5621 return ERR_PTR(-ENOMEM); 5622 5623 BTRFS_I(inode)->root = btrfs_grab_root(root); 5624 BTRFS_I(inode)->ref_root_id = key->objectid; 5625 set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags); 5626 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); 5627 5628 btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); 5629 /* 5630 * We only need lookup, the rest is read-only and there's no inode 5631 * associated with the dentry 5632 */ 5633 inode->i_op = &simple_dir_inode_operations; 5634 inode->i_opflags &= ~IOP_XATTR; 5635 inode->i_fop = &simple_dir_operations; 5636 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 5637 5638 ts = inode_set_ctime_current(inode); 5639 inode_set_mtime_to_ts(inode, ts); 5640 inode_set_atime_to_ts(inode, inode_get_atime(dir)); 5641 BTRFS_I(inode)->i_otime_sec = ts.tv_sec; 5642 BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; 5643 5644 inode->i_uid = dir->i_uid; 5645 inode->i_gid = dir->i_gid; 5646 5647 return inode; 5648 } 5649 5650 static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN); 5651 static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE); 5652 static_assert(BTRFS_FT_DIR == FT_DIR); 5653 static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV); 5654 static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV); 5655 static_assert(BTRFS_FT_FIFO == FT_FIFO); 5656 static_assert(BTRFS_FT_SOCK == FT_SOCK); 5657 static_assert(BTRFS_FT_SYMLINK == 
FT_SYMLINK); 5658 5659 static inline u8 btrfs_inode_type(struct inode *inode) 5660 { 5661 return fs_umode_to_ftype(inode->i_mode); 5662 } 5663 5664 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 5665 { 5666 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); 5667 struct inode *inode; 5668 struct btrfs_root *root = BTRFS_I(dir)->root; 5669 struct btrfs_root *sub_root = root; 5670 struct btrfs_key location = { 0 }; 5671 u8 di_type = 0; 5672 int ret = 0; 5673 5674 if (dentry->d_name.len > BTRFS_NAME_LEN) 5675 return ERR_PTR(-ENAMETOOLONG); 5676 5677 ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type); 5678 if (ret < 0) 5679 return ERR_PTR(ret); 5680 5681 if (location.type == BTRFS_INODE_ITEM_KEY) { 5682 inode = btrfs_iget(location.objectid, root); 5683 if (IS_ERR(inode)) 5684 return inode; 5685 5686 /* Do extra check against inode mode with di_type */ 5687 if (btrfs_inode_type(inode) != di_type) { 5688 btrfs_crit(fs_info, 5689 "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", 5690 inode->i_mode, btrfs_inode_type(inode), 5691 di_type); 5692 iput(inode); 5693 return ERR_PTR(-EUCLEAN); 5694 } 5695 return inode; 5696 } 5697 5698 ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, 5699 &location, &sub_root); 5700 if (ret < 0) { 5701 if (ret != -ENOENT) 5702 inode = ERR_PTR(ret); 5703 else 5704 inode = new_simple_dir(dir, &location, root); 5705 } else { 5706 inode = btrfs_iget(location.objectid, sub_root); 5707 btrfs_put_root(sub_root); 5708 5709 if (IS_ERR(inode)) 5710 return inode; 5711 5712 down_read(&fs_info->cleanup_work_sem); 5713 if (!sb_rdonly(inode->i_sb)) 5714 ret = btrfs_orphan_cleanup(sub_root); 5715 up_read(&fs_info->cleanup_work_sem); 5716 if (ret) { 5717 iput(inode); 5718 inode = ERR_PTR(ret); 5719 } 5720 } 5721 5722 return inode; 5723 } 5724 5725 static int btrfs_dentry_delete(const struct dentry *dentry) 5726 { 5727 struct btrfs_root *root; 5728 struct inode *inode = d_inode(dentry); 5729 5730 if (!inode && !IS_ROOT(dentry)) 5731 inode = d_inode(dentry->d_parent); 5732 5733 if (inode) { 5734 root = BTRFS_I(inode)->root; 5735 if (btrfs_root_refs(&root->root_item) == 0) 5736 return 1; 5737 5738 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 5739 return 1; 5740 } 5741 return 0; 5742 } 5743 5744 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 5745 unsigned int flags) 5746 { 5747 struct inode *inode = btrfs_lookup_dentry(dir, dentry); 5748 5749 if (inode == ERR_PTR(-ENOENT)) 5750 inode = NULL; 5751 return d_splice_alias(inode, dentry); 5752 } 5753 5754 /* 5755 * Find the highest existing sequence number in a directory and then set the 5756 * in-memory index_cnt variable to the first free sequence number. 
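 * (A directory with no DIR_INDEX items starts over at BTRFS_DIR_START_INDEX;
 * otherwise index_cnt becomes the highest existing index plus one.)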
5757 */ 5758 static int btrfs_set_inode_index_count(struct btrfs_inode *inode) 5759 { 5760 struct btrfs_root *root = inode->root; 5761 struct btrfs_key key, found_key; 5762 struct btrfs_path *path; 5763 struct extent_buffer *leaf; 5764 int ret; 5765 5766 key.objectid = btrfs_ino(inode); 5767 key.type = BTRFS_DIR_INDEX_KEY; 5768 key.offset = (u64)-1; 5769 5770 path = btrfs_alloc_path(); 5771 if (!path) 5772 return -ENOMEM; 5773 5774 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5775 if (ret < 0) 5776 goto out; 5777 /* FIXME: we should be able to handle this */ 5778 if (ret == 0) 5779 goto out; 5780 ret = 0; 5781 5782 if (path->slots[0] == 0) { 5783 inode->index_cnt = BTRFS_DIR_START_INDEX; 5784 goto out; 5785 } 5786 5787 path->slots[0]--; 5788 5789 leaf = path->nodes[0]; 5790 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5791 5792 if (found_key.objectid != btrfs_ino(inode) || 5793 found_key.type != BTRFS_DIR_INDEX_KEY) { 5794 inode->index_cnt = BTRFS_DIR_START_INDEX; 5795 goto out; 5796 } 5797 5798 inode->index_cnt = found_key.offset + 1; 5799 out: 5800 btrfs_free_path(path); 5801 return ret; 5802 } 5803 5804 static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) 5805 { 5806 int ret = 0; 5807 5808 btrfs_inode_lock(dir, 0); 5809 if (dir->index_cnt == (u64)-1) { 5810 ret = btrfs_inode_delayed_dir_index_count(dir); 5811 if (ret) { 5812 ret = btrfs_set_inode_index_count(dir); 5813 if (ret) 5814 goto out; 5815 } 5816 } 5817 5818 /* index_cnt is the index number of next new entry, so decrement it. */ 5819 *index = dir->index_cnt - 1; 5820 out: 5821 btrfs_inode_unlock(dir, 0); 5822 5823 return ret; 5824 } 5825 5826 /* 5827 * All this infrastructure exists because dir_emit can fault, and we are holding 5828 * the tree lock when doing readdir. For now just allocate a buffer and copy 5829 * our information into that, and then dir_emit from the buffer. This is 5830 * similar to what NFS does, only we don't keep the buffer around in pagecache 5831 * because I'm afraid I'll mess that up. Long term we need to make filldir do 5832 * copy_to_user_inatomic so we don't have to worry about page faulting under the 5833 * tree lock. 
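 *
 * Each buffered entry is a struct dir_entry header immediately followed by the
 * name bytes; btrfs_filldir() walks that packed layout and calls dir_emit()
 * after the search path (and thus the tree lock) has been released.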
5834 */ 5835 static int btrfs_opendir(struct inode *inode, struct file *file) 5836 { 5837 struct btrfs_file_private *private; 5838 u64 last_index; 5839 int ret; 5840 5841 ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index); 5842 if (ret) 5843 return ret; 5844 5845 private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); 5846 if (!private) 5847 return -ENOMEM; 5848 private->last_index = last_index; 5849 private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); 5850 if (!private->filldir_buf) { 5851 kfree(private); 5852 return -ENOMEM; 5853 } 5854 file->private_data = private; 5855 return 0; 5856 } 5857 5858 static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence) 5859 { 5860 struct btrfs_file_private *private = file->private_data; 5861 int ret; 5862 5863 ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)), 5864 &private->last_index); 5865 if (ret) 5866 return ret; 5867 5868 return generic_file_llseek(file, offset, whence); 5869 } 5870 5871 struct dir_entry { 5872 u64 ino; 5873 u64 offset; 5874 unsigned type; 5875 int name_len; 5876 }; 5877 5878 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) 5879 { 5880 while (entries--) { 5881 struct dir_entry *entry = addr; 5882 char *name = (char *)(entry + 1); 5883 5884 ctx->pos = get_unaligned(&entry->offset); 5885 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len), 5886 get_unaligned(&entry->ino), 5887 get_unaligned(&entry->type))) 5888 return 1; 5889 addr += sizeof(struct dir_entry) + 5890 get_unaligned(&entry->name_len); 5891 ctx->pos++; 5892 } 5893 return 0; 5894 } 5895 5896 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) 5897 { 5898 struct inode *inode = file_inode(file); 5899 struct btrfs_root *root = BTRFS_I(inode)->root; 5900 struct btrfs_file_private *private = file->private_data; 5901 struct btrfs_dir_item *di; 5902 struct btrfs_key key; 5903 struct btrfs_key found_key; 5904 struct btrfs_path *path; 5905 void *addr; 5906 LIST_HEAD(ins_list); 5907 LIST_HEAD(del_list); 5908 int ret; 5909 char *name_ptr; 5910 int name_len; 5911 int entries = 0; 5912 int total_len = 0; 5913 bool put = false; 5914 struct btrfs_key location; 5915 5916 if (!dir_emit_dots(file, ctx)) 5917 return 0; 5918 5919 path = btrfs_alloc_path(); 5920 if (!path) 5921 return -ENOMEM; 5922 5923 addr = private->filldir_buf; 5924 path->reada = READA_FORWARD; 5925 5926 put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index, 5927 &ins_list, &del_list); 5928 5929 again: 5930 key.type = BTRFS_DIR_INDEX_KEY; 5931 key.offset = ctx->pos; 5932 key.objectid = btrfs_ino(BTRFS_I(inode)); 5933 5934 btrfs_for_each_slot(root, &key, &found_key, path, ret) { 5935 struct dir_entry *entry; 5936 struct extent_buffer *leaf = path->nodes[0]; 5937 u8 ftype; 5938 5939 if (found_key.objectid != key.objectid) 5940 break; 5941 if (found_key.type != BTRFS_DIR_INDEX_KEY) 5942 break; 5943 if (found_key.offset < ctx->pos) 5944 continue; 5945 if (found_key.offset > private->last_index) 5946 break; 5947 if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) 5948 continue; 5949 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); 5950 name_len = btrfs_dir_name_len(leaf, di); 5951 if ((total_len + sizeof(struct dir_entry) + name_len) >= 5952 PAGE_SIZE) { 5953 btrfs_release_path(path); 5954 ret = btrfs_filldir(private->filldir_buf, entries, ctx); 5955 if (ret) 5956 goto nopos; 5957 addr = private->filldir_buf; 5958 entries = 0; 5959 total_len = 0; 5960 goto again; 5961 } 5962 5963 
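		/* Append this entry to the buffer: the fixed-size header first, then the name bytes read straight from the leaf. */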
ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di)); 5964 entry = addr; 5965 name_ptr = (char *)(entry + 1); 5966 read_extent_buffer(leaf, name_ptr, 5967 (unsigned long)(di + 1), name_len); 5968 put_unaligned(name_len, &entry->name_len); 5969 put_unaligned(fs_ftype_to_dtype(ftype), &entry->type); 5970 btrfs_dir_item_key_to_cpu(leaf, di, &location); 5971 put_unaligned(location.objectid, &entry->ino); 5972 put_unaligned(found_key.offset, &entry->offset); 5973 entries++; 5974 addr += sizeof(struct dir_entry) + name_len; 5975 total_len += sizeof(struct dir_entry) + name_len; 5976 } 5977 /* Catch error encountered during iteration */ 5978 if (ret < 0) 5979 goto err; 5980 5981 btrfs_release_path(path); 5982 5983 ret = btrfs_filldir(private->filldir_buf, entries, ctx); 5984 if (ret) 5985 goto nopos; 5986 5987 ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); 5988 if (ret) 5989 goto nopos; 5990 5991 /* 5992 * Stop new entries from being returned after we return the last 5993 * entry. 5994 * 5995 * New directory entries are assigned a strictly increasing 5996 * offset. This means that new entries created during readdir 5997 * are *guaranteed* to be seen in the future by that readdir. 5998 * This has broken buggy programs which operate on names as 5999 * they're returned by readdir. Until we re-use freed offsets 6000 * we have this hack to stop new entries from being returned 6001 * under the assumption that they'll never reach this huge 6002 * offset. 6003 * 6004 * This is being careful not to overflow 32bit loff_t unless the 6005 * last entry requires it because doing so has broken 32bit apps 6006 * in the past. 6007 */ 6008 if (ctx->pos >= INT_MAX) 6009 ctx->pos = LLONG_MAX; 6010 else 6011 ctx->pos = INT_MAX; 6012 nopos: 6013 ret = 0; 6014 err: 6015 if (put) 6016 btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list); 6017 btrfs_free_path(path); 6018 return ret; 6019 } 6020 6021 /* 6022 * This is somewhat expensive, updating the tree every time the 6023 * inode changes. But, it is most likely to find the inode in cache. 6024 * FIXME, needs more benchmarking...there are no reasons other than performance 6025 * to keep or drop this code. 6026 */ 6027 static int btrfs_dirty_inode(struct btrfs_inode *inode) 6028 { 6029 struct btrfs_root *root = inode->root; 6030 struct btrfs_fs_info *fs_info = root->fs_info; 6031 struct btrfs_trans_handle *trans; 6032 int ret; 6033 6034 if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags)) 6035 return 0; 6036 6037 trans = btrfs_join_transaction(root); 6038 if (IS_ERR(trans)) 6039 return PTR_ERR(trans); 6040 6041 ret = btrfs_update_inode(trans, inode); 6042 if (ret == -ENOSPC || ret == -EDQUOT) { 6043 /* whoops, lets try again with the full transaction */ 6044 btrfs_end_transaction(trans); 6045 trans = btrfs_start_transaction(root, 1); 6046 if (IS_ERR(trans)) 6047 return PTR_ERR(trans); 6048 6049 ret = btrfs_update_inode(trans, inode); 6050 } 6051 btrfs_end_transaction(trans); 6052 if (inode->delayed_node) 6053 btrfs_balance_delayed_items(fs_info); 6054 6055 return ret; 6056 } 6057 6058 /* 6059 * This is a copy of file_update_time. We need this so we can return error on 6060 * ENOSPC for updating the inode in the case of file write and mmap writes. 6061 */ 6062 static int btrfs_update_time(struct inode *inode, int flags) 6063 { 6064 struct btrfs_root *root = BTRFS_I(inode)->root; 6065 bool dirty; 6066 6067 if (btrfs_root_readonly(root)) 6068 return -EROFS; 6069 6070 dirty = inode_update_timestamps(inode, flags); 6071 return dirty ? 
btrfs_dirty_inode(BTRFS_I(inode)) : 0; 6072 } 6073 6074 /* 6075 * helper to find a free sequence number in a given directory. This current 6076 * code is very simple, later versions will do smarter things in the btree 6077 */ 6078 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index) 6079 { 6080 int ret = 0; 6081 6082 if (dir->index_cnt == (u64)-1) { 6083 ret = btrfs_inode_delayed_dir_index_count(dir); 6084 if (ret) { 6085 ret = btrfs_set_inode_index_count(dir); 6086 if (ret) 6087 return ret; 6088 } 6089 } 6090 6091 *index = dir->index_cnt; 6092 dir->index_cnt++; 6093 6094 return ret; 6095 } 6096 6097 static int btrfs_insert_inode_locked(struct inode *inode) 6098 { 6099 struct btrfs_iget_args args; 6100 6101 args.ino = btrfs_ino(BTRFS_I(inode)); 6102 args.root = BTRFS_I(inode)->root; 6103 6104 return insert_inode_locked4(inode, 6105 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), 6106 btrfs_find_actor, &args); 6107 } 6108 6109 int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, 6110 unsigned int *trans_num_items) 6111 { 6112 struct inode *dir = args->dir; 6113 struct inode *inode = args->inode; 6114 int ret; 6115 6116 if (!args->orphan) { 6117 ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0, 6118 &args->fname); 6119 if (ret) 6120 return ret; 6121 } 6122 6123 ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); 6124 if (ret) { 6125 fscrypt_free_filename(&args->fname); 6126 return ret; 6127 } 6128 6129 /* 1 to add inode item */ 6130 *trans_num_items = 1; 6131 /* 1 to add compression property */ 6132 if (BTRFS_I(dir)->prop_compress) 6133 (*trans_num_items)++; 6134 /* 1 to add default ACL xattr */ 6135 if (args->default_acl) 6136 (*trans_num_items)++; 6137 /* 1 to add access ACL xattr */ 6138 if (args->acl) 6139 (*trans_num_items)++; 6140 #ifdef CONFIG_SECURITY 6141 /* 1 to add LSM xattr */ 6142 if (dir->i_security) 6143 (*trans_num_items)++; 6144 #endif 6145 if (args->orphan) { 6146 /* 1 to add orphan item */ 6147 (*trans_num_items)++; 6148 } else { 6149 /* 6150 * 1 to add dir item 6151 * 1 to add dir index 6152 * 1 to update parent inode item 6153 * 6154 * No need for 1 unit for the inode ref item because it is 6155 * inserted in a batch together with the inode item at 6156 * btrfs_create_new_inode(). 6157 */ 6158 *trans_num_items += 3; 6159 } 6160 return 0; 6161 } 6162 6163 void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) 6164 { 6165 posix_acl_release(args->acl); 6166 posix_acl_release(args->default_acl); 6167 fscrypt_free_filename(&args->fname); 6168 } 6169 6170 /* 6171 * Inherit flags from the parent inode. 6172 * 6173 * Currently only the compression flags and the cow flags are inherited. 
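 *
 * For example, a file created in a directory that has NODATACOW set gets
 * NODATACOW itself and, if it is a regular file, NODATASUM as well; if the
 * parent has NOCOMPRESS set, that takes precedence over COMPRESS.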
6174 */ 6175 static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir) 6176 { 6177 unsigned int flags; 6178 6179 flags = dir->flags; 6180 6181 if (flags & BTRFS_INODE_NOCOMPRESS) { 6182 inode->flags &= ~BTRFS_INODE_COMPRESS; 6183 inode->flags |= BTRFS_INODE_NOCOMPRESS; 6184 } else if (flags & BTRFS_INODE_COMPRESS) { 6185 inode->flags &= ~BTRFS_INODE_NOCOMPRESS; 6186 inode->flags |= BTRFS_INODE_COMPRESS; 6187 } 6188 6189 if (flags & BTRFS_INODE_NODATACOW) { 6190 inode->flags |= BTRFS_INODE_NODATACOW; 6191 if (S_ISREG(inode->vfs_inode.i_mode)) 6192 inode->flags |= BTRFS_INODE_NODATASUM; 6193 } 6194 6195 btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); 6196 } 6197 6198 int btrfs_create_new_inode(struct btrfs_trans_handle *trans, 6199 struct btrfs_new_inode_args *args) 6200 { 6201 struct timespec64 ts; 6202 struct inode *dir = args->dir; 6203 struct inode *inode = args->inode; 6204 const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name; 6205 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); 6206 struct btrfs_root *root; 6207 struct btrfs_inode_item *inode_item; 6208 struct btrfs_path *path; 6209 u64 objectid; 6210 struct btrfs_inode_ref *ref; 6211 struct btrfs_key key[2]; 6212 u32 sizes[2]; 6213 struct btrfs_item_batch batch; 6214 unsigned long ptr; 6215 int ret; 6216 bool xa_reserved = false; 6217 6218 path = btrfs_alloc_path(); 6219 if (!path) 6220 return -ENOMEM; 6221 6222 if (!args->subvol) 6223 BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); 6224 root = BTRFS_I(inode)->root; 6225 6226 ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); 6227 if (ret) 6228 goto out; 6229 6230 ret = btrfs_get_free_objectid(root, &objectid); 6231 if (ret) 6232 goto out; 6233 btrfs_set_inode_number(BTRFS_I(inode), objectid); 6234 6235 ret = xa_reserve(&root->inodes, objectid, GFP_NOFS); 6236 if (ret) 6237 goto out; 6238 xa_reserved = true; 6239 6240 if (args->orphan) { 6241 /* 6242 * O_TMPFILE, set link count to 0, so that after this point, we 6243 * fill in an inode item with the correct link count. 6244 */ 6245 set_nlink(inode, 0); 6246 } else { 6247 trace_btrfs_inode_request(dir); 6248 6249 ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index); 6250 if (ret) 6251 goto out; 6252 } 6253 6254 if (S_ISDIR(inode->i_mode)) 6255 BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; 6256 6257 BTRFS_I(inode)->generation = trans->transid; 6258 inode->i_generation = BTRFS_I(inode)->generation; 6259 6260 /* 6261 * We don't have any capability xattrs set here yet, shortcut any 6262 * queries for the xattrs here. If we add them later via the inode 6263 * security init path or any other path this flag will be cleared. 6264 */ 6265 set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); 6266 6267 /* 6268 * Subvolumes don't inherit flags from their parent directory. 6269 * Originally this was probably by accident, but we probably can't 6270 * change it now without compatibility issues. 
6271 */ 6272 if (!args->subvol) 6273 btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); 6274 6275 if (S_ISREG(inode->i_mode)) { 6276 if (btrfs_test_opt(fs_info, NODATASUM)) 6277 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 6278 if (btrfs_test_opt(fs_info, NODATACOW)) 6279 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | 6280 BTRFS_INODE_NODATASUM; 6281 } 6282 6283 ret = btrfs_insert_inode_locked(inode); 6284 if (ret < 0) { 6285 if (!args->orphan) 6286 BTRFS_I(dir)->index_cnt--; 6287 goto out; 6288 } 6289 6290 /* 6291 * We could have gotten an inode number from somebody who was fsynced 6292 * and then removed in this same transaction, so let's just set full 6293 * sync since it will be a full sync anyway and this will blow away the 6294 * old info in the log. 6295 */ 6296 btrfs_set_inode_full_sync(BTRFS_I(inode)); 6297 6298 key[0].objectid = objectid; 6299 key[0].type = BTRFS_INODE_ITEM_KEY; 6300 key[0].offset = 0; 6301 6302 sizes[0] = sizeof(struct btrfs_inode_item); 6303 6304 if (!args->orphan) { 6305 /* 6306 * Start new inodes with an inode_ref. This is slightly more 6307 * efficient for small numbers of hard links since they will 6308 * be packed into one item. Extended refs will kick in if we 6309 * add more hard links than can fit in the ref item. 6310 */ 6311 key[1].objectid = objectid; 6312 key[1].type = BTRFS_INODE_REF_KEY; 6313 if (args->subvol) { 6314 key[1].offset = objectid; 6315 sizes[1] = 2 + sizeof(*ref); 6316 } else { 6317 key[1].offset = btrfs_ino(BTRFS_I(dir)); 6318 sizes[1] = name->len + sizeof(*ref); 6319 } 6320 } 6321 6322 batch.keys = &key[0]; 6323 batch.data_sizes = &sizes[0]; 6324 batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]); 6325 batch.nr = args->orphan ? 1 : 2; 6326 ret = btrfs_insert_empty_items(trans, root, path, &batch); 6327 if (ret != 0) { 6328 btrfs_abort_transaction(trans, ret); 6329 goto discard; 6330 } 6331 6332 ts = simple_inode_init_ts(inode); 6333 BTRFS_I(inode)->i_otime_sec = ts.tv_sec; 6334 BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; 6335 6336 /* 6337 * We're going to fill the inode item now, so at this point the inode 6338 * must be fully initialized. 6339 */ 6340 6341 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 6342 struct btrfs_inode_item); 6343 memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item, 6344 sizeof(*inode_item)); 6345 fill_inode_item(trans, path->nodes[0], inode_item, inode); 6346 6347 if (!args->orphan) { 6348 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 6349 struct btrfs_inode_ref); 6350 ptr = (unsigned long)(ref + 1); 6351 if (args->subvol) { 6352 btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2); 6353 btrfs_set_inode_ref_index(path->nodes[0], ref, 0); 6354 write_extent_buffer(path->nodes[0], "..", ptr, 2); 6355 } else { 6356 btrfs_set_inode_ref_name_len(path->nodes[0], ref, 6357 name->len); 6358 btrfs_set_inode_ref_index(path->nodes[0], ref, 6359 BTRFS_I(inode)->dir_index); 6360 write_extent_buffer(path->nodes[0], name->name, ptr, 6361 name->len); 6362 } 6363 } 6364 6365 btrfs_mark_buffer_dirty(trans, path->nodes[0]); 6366 /* 6367 * We don't need the path anymore, plus inheriting properties, adding 6368 * ACLs, security xattrs, orphan item or adding the link, will result in 6369 * allocating yet another path. So just free our path. 6370 */ 6371 btrfs_free_path(path); 6372 path = NULL; 6373 6374 if (args->subvol) { 6375 struct inode *parent; 6376 6377 /* 6378 * Subvolumes inherit properties from their parent subvolume, 6379 * not the directory they were created in. 
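 * That parent is the top-level directory of the parent subvolume, which is
 * why we iget() BTRFS_FIRST_FREE_OBJECTID from the parent directory's root
 * below instead of using the directory inode itself.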
6380 */ 6381 parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root); 6382 if (IS_ERR(parent)) { 6383 ret = PTR_ERR(parent); 6384 } else { 6385 ret = btrfs_inode_inherit_props(trans, inode, parent); 6386 iput(parent); 6387 } 6388 } else { 6389 ret = btrfs_inode_inherit_props(trans, inode, dir); 6390 } 6391 if (ret) { 6392 btrfs_err(fs_info, 6393 "error inheriting props for ino %llu (root %llu): %d", 6394 btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); 6395 } 6396 6397 /* 6398 * Subvolumes don't inherit ACLs or get passed to the LSM. This is 6399 * probably a bug. 6400 */ 6401 if (!args->subvol) { 6402 ret = btrfs_init_inode_security(trans, args); 6403 if (ret) { 6404 btrfs_abort_transaction(trans, ret); 6405 goto discard; 6406 } 6407 } 6408 6409 ret = btrfs_add_inode_to_root(BTRFS_I(inode), false); 6410 if (WARN_ON(ret)) { 6411 /* Shouldn't happen, we used xa_reserve() before. */ 6412 btrfs_abort_transaction(trans, ret); 6413 goto discard; 6414 } 6415 6416 trace_btrfs_inode_new(inode); 6417 btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); 6418 6419 btrfs_update_root_times(trans, root); 6420 6421 if (args->orphan) { 6422 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 6423 } else { 6424 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 6425 0, BTRFS_I(inode)->dir_index); 6426 } 6427 if (ret) { 6428 btrfs_abort_transaction(trans, ret); 6429 goto discard; 6430 } 6431 6432 return 0; 6433 6434 discard: 6435 /* 6436 * discard_new_inode() calls iput(), but the caller owns the reference 6437 * to the inode. 6438 */ 6439 ihold(inode); 6440 discard_new_inode(inode); 6441 out: 6442 if (xa_reserved) 6443 xa_release(&root->inodes, objectid); 6444 6445 btrfs_free_path(path); 6446 return ret; 6447 } 6448 6449 /* 6450 * Utility function to add 'inode' into 'parent_inode' with 6451 * a given name and a given sequence number. 6452 * If 'add_backref' is true, also insert a backref from the 6453 * inode to the parent directory.
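 *
 * If inserting the dir item fails with -EEXIST or -EOVERFLOW after the
 * backref (or root ref) was already inserted, the backref is deleted again
 * under fail_dir_item; any other failure aborts the transaction.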
6454 */ 6455 int btrfs_add_link(struct btrfs_trans_handle *trans, 6456 struct btrfs_inode *parent_inode, struct btrfs_inode *inode, 6457 const struct fscrypt_str *name, int add_backref, u64 index) 6458 { 6459 int ret = 0; 6460 struct btrfs_key key; 6461 struct btrfs_root *root = parent_inode->root; 6462 u64 ino = btrfs_ino(inode); 6463 u64 parent_ino = btrfs_ino(parent_inode); 6464 6465 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6466 memcpy(&key, &inode->root->root_key, sizeof(key)); 6467 } else { 6468 key.objectid = ino; 6469 key.type = BTRFS_INODE_ITEM_KEY; 6470 key.offset = 0; 6471 } 6472 6473 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6474 ret = btrfs_add_root_ref(trans, key.objectid, 6475 btrfs_root_id(root), parent_ino, 6476 index, name); 6477 } else if (add_backref) { 6478 ret = btrfs_insert_inode_ref(trans, root, name, 6479 ino, parent_ino, index); 6480 } 6481 6482 /* Nothing to clean up yet */ 6483 if (ret) 6484 return ret; 6485 6486 ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, 6487 btrfs_inode_type(&inode->vfs_inode), index); 6488 if (ret == -EEXIST || ret == -EOVERFLOW) 6489 goto fail_dir_item; 6490 else if (ret) { 6491 btrfs_abort_transaction(trans, ret); 6492 return ret; 6493 } 6494 6495 btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + 6496 name->len * 2); 6497 inode_inc_iversion(&parent_inode->vfs_inode); 6498 /* 6499 * If we are replaying a log tree, we do not want to update the mtime 6500 * and ctime of the parent directory with the current time, since the 6501 * log replay procedure is responsible for setting them to their correct 6502 * values (the ones it had when the fsync was done). 6503 */ 6504 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) 6505 inode_set_mtime_to_ts(&parent_inode->vfs_inode, 6506 inode_set_ctime_current(&parent_inode->vfs_inode)); 6507 6508 ret = btrfs_update_inode(trans, parent_inode); 6509 if (ret) 6510 btrfs_abort_transaction(trans, ret); 6511 return ret; 6512 6513 fail_dir_item: 6514 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6515 u64 local_index; 6516 int err; 6517 err = btrfs_del_root_ref(trans, key.objectid, 6518 btrfs_root_id(root), parent_ino, 6519 &local_index, name); 6520 if (err) 6521 btrfs_abort_transaction(trans, err); 6522 } else if (add_backref) { 6523 u64 local_index; 6524 int err; 6525 6526 err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, 6527 &local_index); 6528 if (err) 6529 btrfs_abort_transaction(trans, err); 6530 } 6531 6532 /* Return the original error code */ 6533 return ret; 6534 } 6535 6536 static int btrfs_create_common(struct inode *dir, struct dentry *dentry, 6537 struct inode *inode) 6538 { 6539 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); 6540 struct btrfs_root *root = BTRFS_I(dir)->root; 6541 struct btrfs_new_inode_args new_inode_args = { 6542 .dir = dir, 6543 .dentry = dentry, 6544 .inode = inode, 6545 }; 6546 unsigned int trans_num_items; 6547 struct btrfs_trans_handle *trans; 6548 int err; 6549 6550 err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); 6551 if (err) 6552 goto out_inode; 6553 6554 trans = btrfs_start_transaction(root, trans_num_items); 6555 if (IS_ERR(trans)) { 6556 err = PTR_ERR(trans); 6557 goto out_new_inode_args; 6558 } 6559 6560 err = btrfs_create_new_inode(trans, &new_inode_args); 6561 if (!err) 6562 d_instantiate_new(dentry, inode); 6563 6564 btrfs_end_transaction(trans); 6565 btrfs_btree_balance_dirty(fs_info); 6566 out_new_inode_args: 6567 btrfs_new_inode_args_destroy(&new_inode_args); 
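	/* On failure we still own the reference taken with new_inode(), so drop it; on success the new dentry now holds the inode. */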
6568 out_inode: 6569 if (err) 6570 iput(inode); 6571 return err; 6572 } 6573 6574 static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir, 6575 struct dentry *dentry, umode_t mode, dev_t rdev) 6576 { 6577 struct inode *inode; 6578 6579 inode = new_inode(dir->i_sb); 6580 if (!inode) 6581 return -ENOMEM; 6582 inode_init_owner(idmap, inode, dir, mode); 6583 inode->i_op = &btrfs_special_inode_operations; 6584 init_special_inode(inode, inode->i_mode, rdev); 6585 return btrfs_create_common(dir, dentry, inode); 6586 } 6587 6588 static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir, 6589 struct dentry *dentry, umode_t mode, bool excl) 6590 { 6591 struct inode *inode; 6592 6593 inode = new_inode(dir->i_sb); 6594 if (!inode) 6595 return -ENOMEM; 6596 inode_init_owner(idmap, inode, dir, mode); 6597 inode->i_fop = &btrfs_file_operations; 6598 inode->i_op = &btrfs_file_inode_operations; 6599 inode->i_mapping->a_ops = &btrfs_aops; 6600 return btrfs_create_common(dir, dentry, inode); 6601 } 6602 6603 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 6604 struct dentry *dentry) 6605 { 6606 struct btrfs_trans_handle *trans = NULL; 6607 struct btrfs_root *root = BTRFS_I(dir)->root; 6608 struct inode *inode = d_inode(old_dentry); 6609 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 6610 struct fscrypt_name fname; 6611 u64 index; 6612 int err; 6613 int drop_inode = 0; 6614 6615 /* do not allow sys_link's with other subvols of the same device */ 6616 if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root)) 6617 return -EXDEV; 6618 6619 if (inode->i_nlink >= BTRFS_LINK_MAX) 6620 return -EMLINK; 6621 6622 err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); 6623 if (err) 6624 goto fail; 6625 6626 err = btrfs_set_inode_index(BTRFS_I(dir), &index); 6627 if (err) 6628 goto fail; 6629 6630 /* 6631 * 2 items for inode and inode ref 6632 * 2 items for dir items 6633 * 1 item for parent inode 6634 * 1 item for orphan item deletion if O_TMPFILE 6635 */ 6636 trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); 6637 if (IS_ERR(trans)) { 6638 err = PTR_ERR(trans); 6639 trans = NULL; 6640 goto fail; 6641 } 6642 6643 /* There are several dir indexes for this inode, clear the cache. */ 6644 BTRFS_I(inode)->dir_index = 0ULL; 6645 inc_nlink(inode); 6646 inode_inc_iversion(inode); 6647 inode_set_ctime_current(inode); 6648 ihold(inode); 6649 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 6650 6651 err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 6652 &fname.disk_name, 1, index); 6653 6654 if (err) { 6655 drop_inode = 1; 6656 } else { 6657 struct dentry *parent = dentry->d_parent; 6658 6659 err = btrfs_update_inode(trans, BTRFS_I(inode)); 6660 if (err) 6661 goto fail; 6662 if (inode->i_nlink == 1) { 6663 /* 6664 * If new hard link count is 1, it's a file created 6665 * with open(2) O_TMPFILE flag. 
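 * Creating a link for it makes the inode permanent, so delete the
 * orphan item that was added when the temporary file was created.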
6666 */ 6667 err = btrfs_orphan_del(trans, BTRFS_I(inode)); 6668 if (err) 6669 goto fail; 6670 } 6671 d_instantiate(dentry, inode); 6672 btrfs_log_new_name(trans, old_dentry, NULL, 0, parent); 6673 } 6674 6675 fail: 6676 fscrypt_free_filename(&fname); 6677 if (trans) 6678 btrfs_end_transaction(trans); 6679 if (drop_inode) { 6680 inode_dec_link_count(inode); 6681 iput(inode); 6682 } 6683 btrfs_btree_balance_dirty(fs_info); 6684 return err; 6685 } 6686 6687 static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, 6688 struct dentry *dentry, umode_t mode) 6689 { 6690 struct inode *inode; 6691 6692 inode = new_inode(dir->i_sb); 6693 if (!inode) 6694 return -ENOMEM; 6695 inode_init_owner(idmap, inode, dir, S_IFDIR | mode); 6696 inode->i_op = &btrfs_dir_inode_operations; 6697 inode->i_fop = &btrfs_dir_file_operations; 6698 return btrfs_create_common(dir, dentry, inode); 6699 } 6700 6701 static noinline int uncompress_inline(struct btrfs_path *path, 6702 struct page *page, 6703 struct btrfs_file_extent_item *item) 6704 { 6705 int ret; 6706 struct extent_buffer *leaf = path->nodes[0]; 6707 char *tmp; 6708 size_t max_size; 6709 unsigned long inline_size; 6710 unsigned long ptr; 6711 int compress_type; 6712 6713 compress_type = btrfs_file_extent_compression(leaf, item); 6714 max_size = btrfs_file_extent_ram_bytes(leaf, item); 6715 inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); 6716 tmp = kmalloc(inline_size, GFP_NOFS); 6717 if (!tmp) 6718 return -ENOMEM; 6719 ptr = btrfs_file_extent_inline_start(item); 6720 6721 read_extent_buffer(leaf, tmp, ptr, inline_size); 6722 6723 max_size = min_t(unsigned long, PAGE_SIZE, max_size); 6724 ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); 6725 6726 /* 6727 * decompression code contains a memset to fill in any space between the end 6728 * of the uncompressed data and the end of max_size in case the decompressed 6729 * data ends up shorter than ram_bytes. That doesn't cover the hole between 6730 * the end of an inline extent and the beginning of the next block, so we 6731 * cover that region here. 6732 */ 6733 6734 if (max_size < PAGE_SIZE) 6735 memzero_page(page, max_size, PAGE_SIZE - max_size); 6736 kfree(tmp); 6737 return ret; 6738 } 6739 6740 static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, 6741 struct page *page) 6742 { 6743 struct btrfs_file_extent_item *fi; 6744 void *kaddr; 6745 size_t copy_size; 6746 6747 if (!page || PageUptodate(page)) 6748 return 0; 6749 6750 ASSERT(page_offset(page) == 0); 6751 6752 fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 6753 struct btrfs_file_extent_item); 6754 if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) 6755 return uncompress_inline(path, page, fi); 6756 6757 copy_size = min_t(u64, PAGE_SIZE, 6758 btrfs_file_extent_ram_bytes(path->nodes[0], fi)); 6759 kaddr = kmap_local_page(page); 6760 read_extent_buffer(path->nodes[0], kaddr, 6761 btrfs_file_extent_inline_start(fi), copy_size); 6762 kunmap_local(kaddr); 6763 if (copy_size < PAGE_SIZE) 6764 memzero_page(page, copy_size, PAGE_SIZE - copy_size); 6765 return 0; 6766 } 6767 6768 /* 6769 * Lookup the first extent overlapping a range in a file. 
6770 * 6771 * @inode: file to search in 6772 * @page: page to read extent data into if the extent is inline 6773 * @start: file offset 6774 * @len: length of range starting at @start 6775 * 6776 * Return the first &struct extent_map which overlaps the given range, reading 6777 * it from the B-tree and caching it if necessary. Note that there may be more 6778 * extents which overlap the given range after the returned extent_map. 6779 * 6780 * If @page is not NULL and the extent is inline, this also reads the extent 6781 * data directly into the page and marks the extent up to date in the io_tree. 6782 * 6783 * Return: ERR_PTR on error, non-NULL extent_map on success. 6784 */ 6785 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, 6786 struct page *page, u64 start, u64 len) 6787 { 6788 struct btrfs_fs_info *fs_info = inode->root->fs_info; 6789 int ret = 0; 6790 u64 extent_start = 0; 6791 u64 extent_end = 0; 6792 u64 objectid = btrfs_ino(inode); 6793 int extent_type = -1; 6794 struct btrfs_path *path = NULL; 6795 struct btrfs_root *root = inode->root; 6796 struct btrfs_file_extent_item *item; 6797 struct extent_buffer *leaf; 6798 struct btrfs_key found_key; 6799 struct extent_map *em = NULL; 6800 struct extent_map_tree *em_tree = &inode->extent_tree; 6801 6802 read_lock(&em_tree->lock); 6803 em = lookup_extent_mapping(em_tree, start, len); 6804 read_unlock(&em_tree->lock); 6805 6806 if (em) { 6807 if (em->start > start || em->start + em->len <= start) 6808 free_extent_map(em); 6809 else if (em->disk_bytenr == EXTENT_MAP_INLINE && page) 6810 free_extent_map(em); 6811 else 6812 goto out; 6813 } 6814 em = alloc_extent_map(); 6815 if (!em) { 6816 ret = -ENOMEM; 6817 goto out; 6818 } 6819 em->start = EXTENT_MAP_HOLE; 6820 em->disk_bytenr = EXTENT_MAP_HOLE; 6821 em->len = (u64)-1; 6822 6823 path = btrfs_alloc_path(); 6824 if (!path) { 6825 ret = -ENOMEM; 6826 goto out; 6827 } 6828 6829 /* Chances are we'll be called again, so go ahead and do readahead */ 6830 path->reada = READA_FORWARD; 6831 6832 /* 6833 * The same explanation in load_free_space_cache applies here as well, 6834 * we only read when we're loading the free space cache, and at that 6835 * point the commit_root has everything we need. 6836 */ 6837 if (btrfs_is_free_space_inode(inode)) { 6838 path->search_commit_root = 1; 6839 path->skip_locking = 1; 6840 } 6841 6842 ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); 6843 if (ret < 0) { 6844 goto out; 6845 } else if (ret > 0) { 6846 if (path->slots[0] == 0) 6847 goto not_found; 6848 path->slots[0]--; 6849 ret = 0; 6850 } 6851 6852 leaf = path->nodes[0]; 6853 item = btrfs_item_ptr(leaf, path->slots[0], 6854 struct btrfs_file_extent_item); 6855 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6856 if (found_key.objectid != objectid || 6857 found_key.type != BTRFS_EXTENT_DATA_KEY) { 6858 /* 6859 * If we backup past the first extent we want to move forward 6860 * and see if there is an extent in front of us, otherwise we'll 6861 * say there is a hole for our whole search range which can 6862 * cause problems. 
6863 */ 6864 extent_end = start; 6865 goto next; 6866 } 6867 6868 extent_type = btrfs_file_extent_type(leaf, item); 6869 extent_start = found_key.offset; 6870 extent_end = btrfs_file_extent_end(path); 6871 if (extent_type == BTRFS_FILE_EXTENT_REG || 6872 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 6873 /* Only regular file could have regular/prealloc extent */ 6874 if (!S_ISREG(inode->vfs_inode.i_mode)) { 6875 ret = -EUCLEAN; 6876 btrfs_crit(fs_info, 6877 "regular/prealloc extent found for non-regular inode %llu", 6878 btrfs_ino(inode)); 6879 goto out; 6880 } 6881 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, 6882 extent_start); 6883 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 6884 trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, 6885 path->slots[0], 6886 extent_start); 6887 } 6888 next: 6889 if (start >= extent_end) { 6890 path->slots[0]++; 6891 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 6892 ret = btrfs_next_leaf(root, path); 6893 if (ret < 0) 6894 goto out; 6895 else if (ret > 0) 6896 goto not_found; 6897 6898 leaf = path->nodes[0]; 6899 } 6900 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6901 if (found_key.objectid != objectid || 6902 found_key.type != BTRFS_EXTENT_DATA_KEY) 6903 goto not_found; 6904 if (start + len <= found_key.offset) 6905 goto not_found; 6906 if (start > found_key.offset) 6907 goto next; 6908 6909 /* New extent overlaps with existing one */ 6910 em->start = start; 6911 em->len = found_key.offset - start; 6912 em->disk_bytenr = EXTENT_MAP_HOLE; 6913 goto insert; 6914 } 6915 6916 btrfs_extent_item_to_extent_map(inode, path, item, em); 6917 6918 if (extent_type == BTRFS_FILE_EXTENT_REG || 6919 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 6920 goto insert; 6921 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 6922 /* 6923 * Inline extent can only exist at file offset 0. This is 6924 * ensured by tree-checker and inline extent creation path. 6925 * Thus all members representing file offsets should be zero. 6926 */ 6927 ASSERT(extent_start == 0); 6928 ASSERT(em->start == 0); 6929 6930 /* 6931 * btrfs_extent_item_to_extent_map() should have properly 6932 * initialized em members already. 6933 * 6934 * Other members are not utilized for inline extents. 6935 */ 6936 ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); 6937 ASSERT(em->len == fs_info->sectorsize); 6938 6939 ret = read_inline_extent(inode, path, page); 6940 if (ret < 0) 6941 goto out; 6942 goto insert; 6943 } 6944 not_found: 6945 em->start = start; 6946 em->len = len; 6947 em->disk_bytenr = EXTENT_MAP_HOLE; 6948 insert: 6949 ret = 0; 6950 btrfs_release_path(path); 6951 if (em->start > start || extent_map_end(em) <= start) { 6952 btrfs_err(fs_info, 6953 "bad extent! 
em: [%llu %llu] passed [%llu %llu]", 6954 em->start, em->len, start, len); 6955 ret = -EIO; 6956 goto out; 6957 } 6958 6959 write_lock(&em_tree->lock); 6960 ret = btrfs_add_extent_mapping(inode, &em, start, len); 6961 write_unlock(&em_tree->lock); 6962 out: 6963 btrfs_free_path(path); 6964 6965 trace_btrfs_get_extent(root, inode, em); 6966 6967 if (ret) { 6968 free_extent_map(em); 6969 return ERR_PTR(ret); 6970 } 6971 return em; 6972 } 6973 6974 static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) 6975 { 6976 struct btrfs_block_group *block_group; 6977 bool readonly = false; 6978 6979 block_group = btrfs_lookup_block_group(fs_info, bytenr); 6980 if (!block_group || block_group->ro) 6981 readonly = true; 6982 if (block_group) 6983 btrfs_put_block_group(block_group); 6984 return readonly; 6985 } 6986 6987 /* 6988 * Check if we can do nocow write into the range [@offset, @offset + @len) 6989 * 6990 * @offset: File offset 6991 * @len: The length to write, will be updated to the nocow writeable 6992 * range 6993 * @file_extent: (optional) If not NULL, the file extent found for the range is 6994 * copied here for the caller to do the nocow write with 6995 * @nowait: if true, set the search path to nowait mode so the lookup does not block 6996 * @strict: if true, omit optimizations that might force us into unnecessary 6997 * cow. e.g., don't trust generation number. 6998 * 6999 * Return: 7000 * >0 and update @len if we can do nocow write 7001 * 0 if we can't do nocow write 7002 * <0 if error happened 7003 * 7004 * NOTE: This only checks the file extents; the caller is responsible for waiting 7005 * for any ordered extents. 7006 */ 7007 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, 7008 struct btrfs_file_extent *file_extent, 7009 bool nowait, bool strict) 7010 { 7011 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 7012 struct can_nocow_file_extent_args nocow_args = { 0 }; 7013 struct btrfs_path *path; 7014 int ret; 7015 struct extent_buffer *leaf; 7016 struct btrfs_root *root = BTRFS_I(inode)->root; 7017 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 7018 struct btrfs_file_extent_item *fi; 7019 struct btrfs_key key; 7020 int found_type; 7021 7022 path = btrfs_alloc_path(); 7023 if (!path) 7024 return -ENOMEM; 7025 path->nowait = nowait; 7026 7027 ret = btrfs_lookup_file_extent(NULL, root, path, 7028 btrfs_ino(BTRFS_I(inode)), offset, 0); 7029 if (ret < 0) 7030 goto out; 7031 7032 if (ret == 1) { 7033 if (path->slots[0] == 0) { 7034 /* can't find the item, must cow */ 7035 ret = 0; 7036 goto out; 7037 } 7038 path->slots[0]--; 7039 } 7040 ret = 0; 7041 leaf = path->nodes[0]; 7042 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 7043 if (key.objectid != btrfs_ino(BTRFS_I(inode)) || 7044 key.type != BTRFS_EXTENT_DATA_KEY) { 7045 /* not our file or wrong item type, must cow */ 7046 goto out; 7047 } 7048 7049 if (key.offset > offset) { 7050 /* Wrong offset, must cow */ 7051 goto out; 7052 } 7053 7054 if (btrfs_file_extent_end(path) <= offset) 7055 goto out; 7056 7057 fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 7058 found_type = btrfs_file_extent_type(leaf, fi); 7059 7060 nocow_args.start = offset; 7061 nocow_args.end = offset + *len - 1; 7062 nocow_args.strict = strict; 7063 nocow_args.free_path = true; 7064 7065 ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); 7066 /* can_nocow_file_extent() has freed the path.
*/ 7067 path = NULL; 7068 7069 if (ret != 1) { 7070 /* Treat errors as not being able to NOCOW. */ 7071 ret = 0; 7072 goto out; 7073 } 7074 7075 ret = 0; 7076 if (btrfs_extent_readonly(fs_info, 7077 nocow_args.file_extent.disk_bytenr + 7078 nocow_args.file_extent.offset)) 7079 goto out; 7080 7081 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 7082 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 7083 u64 range_end; 7084 7085 range_end = round_up(offset + nocow_args.file_extent.num_bytes, 7086 root->fs_info->sectorsize) - 1; 7087 ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); 7088 if (ret) { 7089 ret = -EAGAIN; 7090 goto out; 7091 } 7092 } 7093 7094 if (file_extent) 7095 memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent)); 7096 7097 *len = nocow_args.file_extent.num_bytes; 7098 ret = 1; 7099 out: 7100 btrfs_free_path(path); 7101 return ret; 7102 } 7103 7104 /* The callers of this must take lock_extent() */ 7105 struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, 7106 const struct btrfs_file_extent *file_extent, 7107 int type) 7108 { 7109 struct extent_map *em; 7110 int ret; 7111 7112 /* 7113 * Note the missing NOCOW type. 7114 * 7115 * For pure NOCOW writes, we should not create an io extent map, but 7116 * just reusing the existing one. 7117 * Only PREALLOC writes (NOCOW write into preallocated range) can 7118 * create an io extent map. 7119 */ 7120 ASSERT(type == BTRFS_ORDERED_PREALLOC || 7121 type == BTRFS_ORDERED_COMPRESSED || 7122 type == BTRFS_ORDERED_REGULAR); 7123 7124 switch (type) { 7125 case BTRFS_ORDERED_PREALLOC: 7126 /* We're only referring part of a larger preallocated extent. */ 7127 ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); 7128 break; 7129 case BTRFS_ORDERED_REGULAR: 7130 /* COW results a new extent matching our file extent size. */ 7131 ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes); 7132 ASSERT(file_extent->ram_bytes == file_extent->num_bytes); 7133 7134 /* Since it's a new extent, we should not have any offset. */ 7135 ASSERT(file_extent->offset == 0); 7136 break; 7137 case BTRFS_ORDERED_COMPRESSED: 7138 /* Must be compressed. */ 7139 ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE); 7140 7141 /* 7142 * Encoded write can make us to refer to part of the 7143 * uncompressed extent. 7144 */ 7145 ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); 7146 break; 7147 } 7148 7149 em = alloc_extent_map(); 7150 if (!em) 7151 return ERR_PTR(-ENOMEM); 7152 7153 em->start = start; 7154 em->len = file_extent->num_bytes; 7155 em->disk_bytenr = file_extent->disk_bytenr; 7156 em->disk_num_bytes = file_extent->disk_num_bytes; 7157 em->ram_bytes = file_extent->ram_bytes; 7158 em->generation = -1; 7159 em->offset = file_extent->offset; 7160 em->flags |= EXTENT_FLAG_PINNED; 7161 if (type == BTRFS_ORDERED_COMPRESSED) 7162 extent_map_set_compression(em, file_extent->compression); 7163 7164 ret = btrfs_replace_extent_map_range(inode, em, true); 7165 if (ret) { 7166 free_extent_map(em); 7167 return ERR_PTR(ret); 7168 } 7169 7170 /* em got 2 refs now, callers needs to do free_extent_map once. */ 7171 return em; 7172 } 7173 7174 /* 7175 * For release_folio() and invalidate_folio() we have a race window where 7176 * folio_end_writeback() is called but the subpage spinlock is not yet released. 7177 * If we continue to release/invalidate the page, we could cause use-after-free 7178 * for subpage spinlock. So this function is to spin and wait for subpage 7179 * spinlock. 
7180 */ 7181 static void wait_subpage_spinlock(struct page *page) 7182 { 7183 struct btrfs_fs_info *fs_info = page_to_fs_info(page); 7184 struct folio *folio = page_folio(page); 7185 struct btrfs_subpage *subpage; 7186 7187 if (!btrfs_is_subpage(fs_info, page->mapping)) 7188 return; 7189 7190 ASSERT(folio_test_private(folio) && folio_get_private(folio)); 7191 subpage = folio_get_private(folio); 7192 7193 /* 7194 * This may look insane as we just acquire the spinlock and release it, 7195 * without doing anything. But we just want to make sure no one is 7196 * still holding the subpage spinlock. 7197 * And since the page is not dirty nor writeback, and we have page 7198 * locked, the only possible way to hold a spinlock is from the endio 7199 * function to clear page writeback. 7200 * 7201 * Here we just acquire the spinlock so that all existing callers 7202 * should exit and we're safe to release/invalidate the page. 7203 */ 7204 spin_lock_irq(&subpage->lock); 7205 spin_unlock_irq(&subpage->lock); 7206 } 7207 7208 static int btrfs_launder_folio(struct folio *folio) 7209 { 7210 return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), 7211 PAGE_SIZE, NULL); 7212 } 7213 7214 static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) 7215 { 7216 if (try_release_extent_mapping(&folio->page, gfp_flags)) { 7217 wait_subpage_spinlock(&folio->page); 7218 clear_page_extent_mapped(&folio->page); 7219 return true; 7220 } 7221 return false; 7222 } 7223 7224 static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) 7225 { 7226 if (folio_test_writeback(folio) || folio_test_dirty(folio)) 7227 return false; 7228 return __btrfs_release_folio(folio, gfp_flags); 7229 } 7230 7231 #ifdef CONFIG_MIGRATION 7232 static int btrfs_migrate_folio(struct address_space *mapping, 7233 struct folio *dst, struct folio *src, 7234 enum migrate_mode mode) 7235 { 7236 int ret = filemap_migrate_folio(mapping, dst, src, mode); 7237 7238 if (ret != MIGRATEPAGE_SUCCESS) 7239 return ret; 7240 7241 if (folio_test_ordered(src)) { 7242 folio_clear_ordered(src); 7243 folio_set_ordered(dst); 7244 } 7245 7246 return MIGRATEPAGE_SUCCESS; 7247 } 7248 #else 7249 #define btrfs_migrate_folio NULL 7250 #endif 7251 7252 static void btrfs_invalidate_folio(struct folio *folio, size_t offset, 7253 size_t length) 7254 { 7255 struct btrfs_inode *inode = folio_to_inode(folio); 7256 struct btrfs_fs_info *fs_info = inode->root->fs_info; 7257 struct extent_io_tree *tree = &inode->io_tree; 7258 struct extent_state *cached_state = NULL; 7259 u64 page_start = folio_pos(folio); 7260 u64 page_end = page_start + folio_size(folio) - 1; 7261 u64 cur; 7262 int inode_evicting = inode->vfs_inode.i_state & I_FREEING; 7263 7264 /* 7265 * We have folio locked so no new ordered extent can be created on this 7266 * page, nor bio can be submitted for this folio. 7267 * 7268 * But already submitted bio can still be finished on this folio. 7269 * Furthermore, endio function won't skip folio which has Ordered 7270 * (Private2) already cleared, so it's possible for endio and 7271 * invalidate_folio to do the same ordered extent accounting twice 7272 * on one folio. 7273 * 7274 * So here we wait for any submitted bios to finish, so that we won't 7275 * do double ordered extent accounting on the same folio. 
7276 */ 7277 folio_wait_writeback(folio); 7278 wait_subpage_spinlock(&folio->page); 7279 7280 /* 7281 * For subpage case, we have call sites like 7282 * btrfs_punch_hole_lock_range() which passes range not aligned to 7283 * sectorsize. 7284 * If the range doesn't cover the full folio, we don't need to and 7285 * shouldn't clear page extent mapped, as folio->private can still 7286 * record subpage dirty bits for other part of the range. 7287 * 7288 * For cases that invalidate the full folio even the range doesn't 7289 * cover the full folio, like invalidating the last folio, we're 7290 * still safe to wait for ordered extent to finish. 7291 */ 7292 if (!(offset == 0 && length == folio_size(folio))) { 7293 btrfs_release_folio(folio, GFP_NOFS); 7294 return; 7295 } 7296 7297 if (!inode_evicting) 7298 lock_extent(tree, page_start, page_end, &cached_state); 7299 7300 cur = page_start; 7301 while (cur < page_end) { 7302 struct btrfs_ordered_extent *ordered; 7303 u64 range_end; 7304 u32 range_len; 7305 u32 extra_flags = 0; 7306 7307 ordered = btrfs_lookup_first_ordered_range(inode, cur, 7308 page_end + 1 - cur); 7309 if (!ordered) { 7310 range_end = page_end; 7311 /* 7312 * No ordered extent covering this range, we are safe 7313 * to delete all extent states in the range. 7314 */ 7315 extra_flags = EXTENT_CLEAR_ALL_BITS; 7316 goto next; 7317 } 7318 if (ordered->file_offset > cur) { 7319 /* 7320 * There is a range between [cur, oe->file_offset) not 7321 * covered by any ordered extent. 7322 * We are safe to delete all extent states, and handle 7323 * the ordered extent in the next iteration. 7324 */ 7325 range_end = ordered->file_offset - 1; 7326 extra_flags = EXTENT_CLEAR_ALL_BITS; 7327 goto next; 7328 } 7329 7330 range_end = min(ordered->file_offset + ordered->num_bytes - 1, 7331 page_end); 7332 ASSERT(range_end + 1 - cur < U32_MAX); 7333 range_len = range_end + 1 - cur; 7334 if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) { 7335 /* 7336 * If Ordered (Private2) is cleared, it means endio has 7337 * already been executed for the range. 7338 * We can't delete the extent states as 7339 * btrfs_finish_ordered_io() may still use some of them. 7340 */ 7341 goto next; 7342 } 7343 btrfs_folio_clear_ordered(fs_info, folio, cur, range_len); 7344 7345 /* 7346 * IO on this page will never be started, so we need to account 7347 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW 7348 * here, must leave that up for the ordered extent completion. 7349 * 7350 * This will also unlock the range for incoming 7351 * btrfs_finish_ordered_io(). 7352 */ 7353 if (!inode_evicting) 7354 clear_extent_bit(tree, cur, range_end, 7355 EXTENT_DELALLOC | 7356 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 7357 EXTENT_DEFRAG, &cached_state); 7358 7359 spin_lock_irq(&inode->ordered_tree_lock); 7360 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 7361 ordered->truncated_len = min(ordered->truncated_len, 7362 cur - ordered->file_offset); 7363 spin_unlock_irq(&inode->ordered_tree_lock); 7364 7365 /* 7366 * If the ordered extent has finished, we're safe to delete all 7367 * the extent states of the range, otherwise 7368 * btrfs_finish_ordered_io() will get executed by endio for 7369 * other pages, so we can't delete extent states. 7370 */ 7371 if (btrfs_dec_test_ordered_pending(inode, &ordered, 7372 cur, range_end + 1 - cur)) { 7373 btrfs_finish_ordered_io(ordered); 7374 /* 7375 * The ordered extent has finished, now we're again 7376 * safe to delete all extent states of the range. 
7377 */ 7378 extra_flags = EXTENT_CLEAR_ALL_BITS; 7379 } 7380 next: 7381 if (ordered) 7382 btrfs_put_ordered_extent(ordered); 7383 /* 7384 * Qgroup reserved space handler 7385 * Sector(s) here will be either: 7386 * 7387 * 1) Already written to disk or bio already finished 7388 * Then its QGROUP_RESERVED bit in io_tree is already cleared. 7389 * Qgroup will be handled by its qgroup_record then. 7390 * btrfs_qgroup_free_data() call will do nothing here. 7391 * 7392 * 2) Not written to disk yet 7393 * Then btrfs_qgroup_free_data() call will clear the 7394 * QGROUP_RESERVED bit of its io_tree, and free the qgroup 7395 * reserved data space. 7396 * Since the IO will never happen for this page. 7397 */ 7398 btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); 7399 if (!inode_evicting) { 7400 clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | 7401 EXTENT_DELALLOC | EXTENT_UPTODATE | 7402 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG | 7403 extra_flags, &cached_state); 7404 } 7405 cur = range_end + 1; 7406 } 7407 /* 7408 * We have iterated through all ordered extents of the page, the page 7409 * should not have Ordered (Private2) anymore, or the above iteration 7410 * did something wrong. 7411 */ 7412 ASSERT(!folio_test_ordered(folio)); 7413 btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); 7414 if (!inode_evicting) 7415 __btrfs_release_folio(folio, GFP_NOFS); 7416 clear_page_extent_mapped(&folio->page); 7417 } 7418 7419 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) 7420 { 7421 struct btrfs_truncate_control control = { 7422 .inode = inode, 7423 .ino = btrfs_ino(inode), 7424 .min_type = BTRFS_EXTENT_DATA_KEY, 7425 .clear_extent_range = true, 7426 }; 7427 struct btrfs_root *root = inode->root; 7428 struct btrfs_fs_info *fs_info = root->fs_info; 7429 struct btrfs_block_rsv *rsv; 7430 int ret; 7431 struct btrfs_trans_handle *trans; 7432 u64 mask = fs_info->sectorsize - 1; 7433 const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); 7434 7435 if (!skip_writeback) { 7436 ret = btrfs_wait_ordered_range(inode, 7437 inode->vfs_inode.i_size & (~mask), 7438 (u64)-1); 7439 if (ret) 7440 return ret; 7441 } 7442 7443 /* 7444 * Yes ladies and gentlemen, this is indeed ugly. We have a couple of 7445 * things going on here: 7446 * 7447 * 1) We need to reserve space to update our inode. 7448 * 7449 * 2) We need to have something to cache all the space that is going to 7450 * be free'd up by the truncate operation, but also have some slack 7451 * space reserved in case it uses space during the truncate (thank you 7452 * very much snapshotting). 7453 * 7454 * And we need these to be separate. The fact is we can use a lot of 7455 * space doing the truncate, and we have no earthly idea how much space 7456 * we will use, so we need the truncate reservation to be separate so it 7457 * doesn't end up using space reserved for updating the inode. We also 7458 * need to be able to stop the transaction and start a new one, which 7459 * means we need to be able to update the inode several times, and we 7460 * have no idea of knowing how many times that will be, so we can't just 7461 * reserve 1 item for the entirety of the operation, so that has to be 7462 * done separately as well. 7463 * 7464 * So that leaves us with 7465 * 7466 * 1) rsv - for the truncate reservation, which we will steal from the 7467 * transaction reservation. 7468 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for 7469 * updating the inode. 
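	 *
	 * As a concrete illustration of the flow below: btrfs_start_transaction()
	 * is asked for 2 metadata units, btrfs_block_rsv_migrate() then moves
	 * one unit (min_size) into rsv for the truncate work, and the one
	 * remaining unit stays in fs_info->trans_block_rsv for the inode
	 * update.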
7470 */ 7471 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); 7472 if (!rsv) 7473 return -ENOMEM; 7474 rsv->size = min_size; 7475 rsv->failfast = true; 7476 7477 /* 7478 * 1 for the truncate slack space 7479 * 1 for updating the inode. 7480 */ 7481 trans = btrfs_start_transaction(root, 2); 7482 if (IS_ERR(trans)) { 7483 ret = PTR_ERR(trans); 7484 goto out; 7485 } 7486 7487 /* Migrate the slack space for the truncate to our reserve */ 7488 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, 7489 min_size, false); 7490 /* 7491 * We have reserved 2 metadata units when we started the transaction and 7492 * min_size matches 1 unit, so this should never fail, but if it does, 7493 * it's not critical we just fail truncation. 7494 */ 7495 if (WARN_ON(ret)) { 7496 btrfs_end_transaction(trans); 7497 goto out; 7498 } 7499 7500 trans->block_rsv = rsv; 7501 7502 while (1) { 7503 struct extent_state *cached_state = NULL; 7504 const u64 new_size = inode->vfs_inode.i_size; 7505 const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); 7506 7507 control.new_size = new_size; 7508 lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); 7509 /* 7510 * We want to drop from the next block forward in case this new 7511 * size is not block aligned since we will be keeping the last 7512 * block of the extent just the way it is. 7513 */ 7514 btrfs_drop_extent_map_range(inode, 7515 ALIGN(new_size, fs_info->sectorsize), 7516 (u64)-1, false); 7517 7518 ret = btrfs_truncate_inode_items(trans, root, &control); 7519 7520 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); 7521 btrfs_inode_safe_disk_i_size_write(inode, control.last_size); 7522 7523 unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); 7524 7525 trans->block_rsv = &fs_info->trans_block_rsv; 7526 if (ret != -ENOSPC && ret != -EAGAIN) 7527 break; 7528 7529 ret = btrfs_update_inode(trans, inode); 7530 if (ret) 7531 break; 7532 7533 btrfs_end_transaction(trans); 7534 btrfs_btree_balance_dirty(fs_info); 7535 7536 trans = btrfs_start_transaction(root, 2); 7537 if (IS_ERR(trans)) { 7538 ret = PTR_ERR(trans); 7539 trans = NULL; 7540 break; 7541 } 7542 7543 btrfs_block_rsv_release(fs_info, rsv, -1, NULL); 7544 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, 7545 rsv, min_size, false); 7546 /* 7547 * We have reserved 2 metadata units when we started the 7548 * transaction and min_size matches 1 unit, so this should never 7549 * fail, but if it does, it's not critical we just fail truncation. 7550 */ 7551 if (WARN_ON(ret)) 7552 break; 7553 7554 trans->block_rsv = rsv; 7555 } 7556 7557 /* 7558 * We can't call btrfs_truncate_block inside a trans handle as we could 7559 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we 7560 * know we've truncated everything except the last little bit, and can 7561 * do btrfs_truncate_block and then update the disk_i_size. 
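	 * (For example, truncating to a size that is not sector aligned leaves
	 * a partial tail block; btrfs_truncate_inode_items() signals that with
	 * BTRFS_NEED_TRUNCATE_BLOCK so the tail can be zeroed here, outside of
	 * a transaction handle.)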
7562 */ 7563 if (ret == BTRFS_NEED_TRUNCATE_BLOCK) { 7564 btrfs_end_transaction(trans); 7565 btrfs_btree_balance_dirty(fs_info); 7566 7567 ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); 7568 if (ret) 7569 goto out; 7570 trans = btrfs_start_transaction(root, 1); 7571 if (IS_ERR(trans)) { 7572 ret = PTR_ERR(trans); 7573 goto out; 7574 } 7575 btrfs_inode_safe_disk_i_size_write(inode, 0); 7576 } 7577 7578 if (trans) { 7579 int ret2; 7580 7581 trans->block_rsv = &fs_info->trans_block_rsv; 7582 ret2 = btrfs_update_inode(trans, inode); 7583 if (ret2 && !ret) 7584 ret = ret2; 7585 7586 ret2 = btrfs_end_transaction(trans); 7587 if (ret2 && !ret) 7588 ret = ret2; 7589 btrfs_btree_balance_dirty(fs_info); 7590 } 7591 out: 7592 btrfs_free_block_rsv(fs_info, rsv); 7593 /* 7594 * So if we truncate and then write and fsync we normally would just 7595 * write the extents that changed, which is a problem if we need to 7596 * first truncate that entire inode. So set this flag so we write out 7597 * all of the extents in the inode to the sync log so we're completely 7598 * safe. 7599 * 7600 * If no extents were dropped or trimmed we don't need to force the next 7601 * fsync to truncate all the inode's items from the log and re-log them 7602 * all. This means the truncate operation did not change the file size, 7603 * or changed it to a smaller size but there was only an implicit hole 7604 * between the old i_size and the new i_size, and there were no prealloc 7605 * extents beyond i_size to drop. 7606 */ 7607 if (control.extents_found > 0) 7608 btrfs_set_inode_full_sync(inode); 7609 7610 return ret; 7611 } 7612 7613 struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap, 7614 struct inode *dir) 7615 { 7616 struct inode *inode; 7617 7618 inode = new_inode(dir->i_sb); 7619 if (inode) { 7620 /* 7621 * Subvolumes don't inherit the sgid bit or the parent's gid if 7622 * the parent's sgid bit is set. This is probably a bug. 7623 */ 7624 inode_init_owner(idmap, inode, NULL, 7625 S_IFDIR | (~current_umask() & S_IRWXUGO)); 7626 inode->i_op = &btrfs_dir_inode_operations; 7627 inode->i_fop = &btrfs_dir_file_operations; 7628 } 7629 return inode; 7630 } 7631 7632 struct inode *btrfs_alloc_inode(struct super_block *sb) 7633 { 7634 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 7635 struct btrfs_inode *ei; 7636 struct inode *inode; 7637 7638 ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); 7639 if (!ei) 7640 return NULL; 7641 7642 ei->root = NULL; 7643 ei->generation = 0; 7644 ei->last_trans = 0; 7645 ei->last_sub_trans = 0; 7646 ei->logged_trans = 0; 7647 ei->delalloc_bytes = 0; 7648 ei->new_delalloc_bytes = 0; 7649 ei->defrag_bytes = 0; 7650 ei->disk_i_size = 0; 7651 ei->flags = 0; 7652 ei->ro_flags = 0; 7653 /* 7654 * ->index_cnt will be properly initialized later when creating a new 7655 * inode (btrfs_create_new_inode()) or when reading an existing inode 7656 * from disk (btrfs_read_locked_inode()). 
7657 */ 7658 ei->csum_bytes = 0; 7659 ei->dir_index = 0; 7660 ei->last_unlink_trans = 0; 7661 ei->last_reflink_trans = 0; 7662 ei->last_log_commit = 0; 7663 7664 spin_lock_init(&ei->lock); 7665 ei->outstanding_extents = 0; 7666 if (sb->s_magic != BTRFS_TEST_MAGIC) 7667 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, 7668 BTRFS_BLOCK_RSV_DELALLOC); 7669 ei->runtime_flags = 0; 7670 ei->prop_compress = BTRFS_COMPRESS_NONE; 7671 ei->defrag_compress = BTRFS_COMPRESS_NONE; 7672 7673 ei->delayed_node = NULL; 7674 7675 ei->i_otime_sec = 0; 7676 ei->i_otime_nsec = 0; 7677 7678 inode = &ei->vfs_inode; 7679 extent_map_tree_init(&ei->extent_tree); 7680 7681 /* This io tree sets the valid inode. */ 7682 extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); 7683 ei->io_tree.inode = ei; 7684 7685 ei->file_extent_tree = NULL; 7686 7687 mutex_init(&ei->log_mutex); 7688 spin_lock_init(&ei->ordered_tree_lock); 7689 ei->ordered_tree = RB_ROOT; 7690 ei->ordered_tree_last = NULL; 7691 INIT_LIST_HEAD(&ei->delalloc_inodes); 7692 INIT_LIST_HEAD(&ei->delayed_iput); 7693 init_rwsem(&ei->i_mmap_lock); 7694 7695 return inode; 7696 } 7697 7698 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 7699 void btrfs_test_destroy_inode(struct inode *inode) 7700 { 7701 btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); 7702 kfree(BTRFS_I(inode)->file_extent_tree); 7703 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 7704 } 7705 #endif 7706 7707 void btrfs_free_inode(struct inode *inode) 7708 { 7709 kfree(BTRFS_I(inode)->file_extent_tree); 7710 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 7711 } 7712 7713 void btrfs_destroy_inode(struct inode *vfs_inode) 7714 { 7715 struct btrfs_ordered_extent *ordered; 7716 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 7717 struct btrfs_root *root = inode->root; 7718 bool freespace_inode; 7719 7720 WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); 7721 WARN_ON(vfs_inode->i_data.nrpages); 7722 WARN_ON(inode->block_rsv.reserved); 7723 WARN_ON(inode->block_rsv.size); 7724 WARN_ON(inode->outstanding_extents); 7725 if (!S_ISDIR(vfs_inode->i_mode)) { 7726 WARN_ON(inode->delalloc_bytes); 7727 WARN_ON(inode->new_delalloc_bytes); 7728 WARN_ON(inode->csum_bytes); 7729 } 7730 if (!root || !btrfs_is_data_reloc_root(root)) 7731 WARN_ON(inode->defrag_bytes); 7732 7733 /* 7734 * This can happen where we create an inode, but somebody else also 7735 * created the same inode and we need to destroy the one we already 7736 * created. 7737 */ 7738 if (!root) 7739 return; 7740 7741 /* 7742 * If this is a free space inode do not take the ordered extents lockdep 7743 * map. 
7744 */ 7745 freespace_inode = btrfs_is_free_space_inode(inode); 7746 7747 while (1) { 7748 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 7749 if (!ordered) 7750 break; 7751 else { 7752 btrfs_err(root->fs_info, 7753 "found ordered extent %llu %llu on inode cleanup", 7754 ordered->file_offset, ordered->num_bytes); 7755 7756 if (!freespace_inode) 7757 btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent); 7758 7759 btrfs_remove_ordered_extent(inode, ordered); 7760 btrfs_put_ordered_extent(ordered); 7761 btrfs_put_ordered_extent(ordered); 7762 } 7763 } 7764 btrfs_qgroup_check_reserved_leak(inode); 7765 btrfs_del_inode_from_root(inode); 7766 btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); 7767 btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); 7768 btrfs_put_root(inode->root); 7769 } 7770 7771 int btrfs_drop_inode(struct inode *inode) 7772 { 7773 struct btrfs_root *root = BTRFS_I(inode)->root; 7774 7775 if (root == NULL) 7776 return 1; 7777 7778 /* the snap/subvol tree is on deleting */ 7779 if (btrfs_root_refs(&root->root_item) == 0) 7780 return 1; 7781 else 7782 return generic_drop_inode(inode); 7783 } 7784 7785 static void init_once(void *foo) 7786 { 7787 struct btrfs_inode *ei = foo; 7788 7789 inode_init_once(&ei->vfs_inode); 7790 } 7791 7792 void __cold btrfs_destroy_cachep(void) 7793 { 7794 /* 7795 * Make sure all delayed rcu free inodes are flushed before we 7796 * destroy cache. 7797 */ 7798 rcu_barrier(); 7799 kmem_cache_destroy(btrfs_inode_cachep); 7800 } 7801 7802 int __init btrfs_init_cachep(void) 7803 { 7804 btrfs_inode_cachep = kmem_cache_create("btrfs_inode", 7805 sizeof(struct btrfs_inode), 0, 7806 SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, 7807 init_once); 7808 if (!btrfs_inode_cachep) 7809 return -ENOMEM; 7810 7811 return 0; 7812 } 7813 7814 static int btrfs_getattr(struct mnt_idmap *idmap, 7815 const struct path *path, struct kstat *stat, 7816 u32 request_mask, unsigned int flags) 7817 { 7818 u64 delalloc_bytes; 7819 u64 inode_bytes; 7820 struct inode *inode = d_inode(path->dentry); 7821 u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize; 7822 u32 bi_flags = BTRFS_I(inode)->flags; 7823 u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; 7824 7825 stat->result_mask |= STATX_BTIME; 7826 stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec; 7827 stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec; 7828 if (bi_flags & BTRFS_INODE_APPEND) 7829 stat->attributes |= STATX_ATTR_APPEND; 7830 if (bi_flags & BTRFS_INODE_COMPRESS) 7831 stat->attributes |= STATX_ATTR_COMPRESSED; 7832 if (bi_flags & BTRFS_INODE_IMMUTABLE) 7833 stat->attributes |= STATX_ATTR_IMMUTABLE; 7834 if (bi_flags & BTRFS_INODE_NODUMP) 7835 stat->attributes |= STATX_ATTR_NODUMP; 7836 if (bi_ro_flags & BTRFS_INODE_RO_VERITY) 7837 stat->attributes |= STATX_ATTR_VERITY; 7838 7839 stat->attributes_mask |= (STATX_ATTR_APPEND | 7840 STATX_ATTR_COMPRESSED | 7841 STATX_ATTR_IMMUTABLE | 7842 STATX_ATTR_NODUMP); 7843 7844 generic_fillattr(idmap, request_mask, inode, stat); 7845 stat->dev = BTRFS_I(inode)->root->anon_dev; 7846 7847 stat->subvol = BTRFS_I(inode)->root->root_key.objectid; 7848 stat->result_mask |= STATX_SUBVOL; 7849 7850 spin_lock(&BTRFS_I(inode)->lock); 7851 delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; 7852 inode_bytes = inode_get_bytes(inode); 7853 spin_unlock(&BTRFS_I(inode)->lock); 7854 stat->blocks = (ALIGN(inode_bytes, blocksize) + 7855 ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT; 7856 return 0; 7857 } 7858 7859 static int btrfs_rename_exchange(struct inode *old_dir, 7860 struct 
dentry *old_dentry, 7861 struct inode *new_dir, 7862 struct dentry *new_dentry) 7863 { 7864 struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); 7865 struct btrfs_trans_handle *trans; 7866 unsigned int trans_num_items; 7867 struct btrfs_root *root = BTRFS_I(old_dir)->root; 7868 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 7869 struct inode *new_inode = new_dentry->d_inode; 7870 struct inode *old_inode = old_dentry->d_inode; 7871 struct btrfs_rename_ctx old_rename_ctx; 7872 struct btrfs_rename_ctx new_rename_ctx; 7873 u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); 7874 u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); 7875 u64 old_idx = 0; 7876 u64 new_idx = 0; 7877 int ret; 7878 int ret2; 7879 bool need_abort = false; 7880 struct fscrypt_name old_fname, new_fname; 7881 struct fscrypt_str *old_name, *new_name; 7882 7883 /* 7884 * For non-subvolumes allow exchange only within one subvolume, in the 7885 * same inode namespace. Two subvolumes (represented as directory) can 7886 * be exchanged as they're a logical link and have a fixed inode number. 7887 */ 7888 if (root != dest && 7889 (old_ino != BTRFS_FIRST_FREE_OBJECTID || 7890 new_ino != BTRFS_FIRST_FREE_OBJECTID)) 7891 return -EXDEV; 7892 7893 ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); 7894 if (ret) 7895 return ret; 7896 7897 ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); 7898 if (ret) { 7899 fscrypt_free_filename(&old_fname); 7900 return ret; 7901 } 7902 7903 old_name = &old_fname.disk_name; 7904 new_name = &new_fname.disk_name; 7905 7906 /* close the race window with snapshot create/destroy ioctl */ 7907 if (old_ino == BTRFS_FIRST_FREE_OBJECTID || 7908 new_ino == BTRFS_FIRST_FREE_OBJECTID) 7909 down_read(&fs_info->subvol_sem); 7910 7911 /* 7912 * For each inode: 7913 * 1 to remove old dir item 7914 * 1 to remove old dir index 7915 * 1 to add new dir item 7916 * 1 to add new dir index 7917 * 1 to update parent inode 7918 * 7919 * If the parents are the same, we only need to account for one 7920 */ 7921 trans_num_items = (old_dir == new_dir ? 9 : 10); 7922 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 7923 /* 7924 * 1 to remove old root ref 7925 * 1 to remove old root backref 7926 * 1 to add new root ref 7927 * 1 to add new root backref 7928 */ 7929 trans_num_items += 4; 7930 } else { 7931 /* 7932 * 1 to update inode item 7933 * 1 to remove old inode ref 7934 * 1 to add new inode ref 7935 */ 7936 trans_num_items += 3; 7937 } 7938 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) 7939 trans_num_items += 4; 7940 else 7941 trans_num_items += 3; 7942 trans = btrfs_start_transaction(root, trans_num_items); 7943 if (IS_ERR(trans)) { 7944 ret = PTR_ERR(trans); 7945 goto out_notrans; 7946 } 7947 7948 if (dest != root) { 7949 ret = btrfs_record_root_in_trans(trans, dest); 7950 if (ret) 7951 goto out_fail; 7952 } 7953 7954 /* 7955 * We need to find a free sequence number both in the source and 7956 * in the destination directory for the exchange. 7957 */ 7958 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx); 7959 if (ret) 7960 goto out_fail; 7961 ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx); 7962 if (ret) 7963 goto out_fail; 7964 7965 BTRFS_I(old_inode)->dir_index = 0ULL; 7966 BTRFS_I(new_inode)->dir_index = 0ULL; 7967 7968 /* Reference for the source. */ 7969 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 7970 /* force full log commit if subvolume involved. 
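		 * (Renaming a subvolume updates root refs and backrefs rather
		 * than inode refs; the log tree does not track those, hence
		 * the fallback to a full transaction commit on fsync.)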
*/ 7971 btrfs_set_log_full_commit(trans); 7972 } else { 7973 ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino, 7974 btrfs_ino(BTRFS_I(new_dir)), 7975 old_idx); 7976 if (ret) 7977 goto out_fail; 7978 need_abort = true; 7979 } 7980 7981 /* And now for the dest. */ 7982 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { 7983 /* force full log commit if subvolume involved. */ 7984 btrfs_set_log_full_commit(trans); 7985 } else { 7986 ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino, 7987 btrfs_ino(BTRFS_I(old_dir)), 7988 new_idx); 7989 if (ret) { 7990 if (need_abort) 7991 btrfs_abort_transaction(trans, ret); 7992 goto out_fail; 7993 } 7994 } 7995 7996 /* Update inode version and ctime/mtime. */ 7997 inode_inc_iversion(old_dir); 7998 inode_inc_iversion(new_dir); 7999 inode_inc_iversion(old_inode); 8000 inode_inc_iversion(new_inode); 8001 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); 8002 8003 if (old_dentry->d_parent != new_dentry->d_parent) { 8004 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), 8005 BTRFS_I(old_inode), true); 8006 btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), 8007 BTRFS_I(new_inode), true); 8008 } 8009 8010 /* src is a subvolume */ 8011 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 8012 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); 8013 } else { /* src is an inode */ 8014 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), 8015 BTRFS_I(old_dentry->d_inode), 8016 old_name, &old_rename_ctx); 8017 if (!ret) 8018 ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); 8019 } 8020 if (ret) { 8021 btrfs_abort_transaction(trans, ret); 8022 goto out_fail; 8023 } 8024 8025 /* dest is a subvolume */ 8026 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { 8027 ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); 8028 } else { /* dest is an inode */ 8029 ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), 8030 BTRFS_I(new_dentry->d_inode), 8031 new_name, &new_rename_ctx); 8032 if (!ret) 8033 ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); 8034 } 8035 if (ret) { 8036 btrfs_abort_transaction(trans, ret); 8037 goto out_fail; 8038 } 8039 8040 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), 8041 new_name, 0, old_idx); 8042 if (ret) { 8043 btrfs_abort_transaction(trans, ret); 8044 goto out_fail; 8045 } 8046 8047 ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), 8048 old_name, 0, new_idx); 8049 if (ret) { 8050 btrfs_abort_transaction(trans, ret); 8051 goto out_fail; 8052 } 8053 8054 if (old_inode->i_nlink == 1) 8055 BTRFS_I(old_inode)->dir_index = old_idx; 8056 if (new_inode->i_nlink == 1) 8057 BTRFS_I(new_inode)->dir_index = new_idx; 8058 8059 /* 8060 * Now pin the logs of the roots. We do it to ensure that no other task 8061 * can sync the logs while we are in progress with the rename, because 8062 * that could result in an inconsistency in case any of the inodes that 8063 * are part of this rename operation were logged before. 8064 */ 8065 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) 8066 btrfs_pin_log_trans(root); 8067 if (new_ino != BTRFS_FIRST_FREE_OBJECTID) 8068 btrfs_pin_log_trans(dest); 8069 8070 /* Do the log updates for all inodes. */ 8071 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) 8072 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), 8073 old_rename_ctx.index, new_dentry->d_parent); 8074 if (new_ino != BTRFS_FIRST_FREE_OBJECTID) 8075 btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), 8076 new_rename_ctx.index, old_dentry->d_parent); 8077 8078 /* Now unpin the logs. 
*/ 8079 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) 8080 btrfs_end_log_trans(root); 8081 if (new_ino != BTRFS_FIRST_FREE_OBJECTID) 8082 btrfs_end_log_trans(dest); 8083 out_fail: 8084 ret2 = btrfs_end_transaction(trans); 8085 ret = ret ? ret : ret2; 8086 out_notrans: 8087 if (new_ino == BTRFS_FIRST_FREE_OBJECTID || 8088 old_ino == BTRFS_FIRST_FREE_OBJECTID) 8089 up_read(&fs_info->subvol_sem); 8090 8091 fscrypt_free_filename(&new_fname); 8092 fscrypt_free_filename(&old_fname); 8093 return ret; 8094 } 8095 8096 static struct inode *new_whiteout_inode(struct mnt_idmap *idmap, 8097 struct inode *dir) 8098 { 8099 struct inode *inode; 8100 8101 inode = new_inode(dir->i_sb); 8102 if (inode) { 8103 inode_init_owner(idmap, inode, dir, 8104 S_IFCHR | WHITEOUT_MODE); 8105 inode->i_op = &btrfs_special_inode_operations; 8106 init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); 8107 } 8108 return inode; 8109 } 8110 8111 static int btrfs_rename(struct mnt_idmap *idmap, 8112 struct inode *old_dir, struct dentry *old_dentry, 8113 struct inode *new_dir, struct dentry *new_dentry, 8114 unsigned int flags) 8115 { 8116 struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); 8117 struct btrfs_new_inode_args whiteout_args = { 8118 .dir = old_dir, 8119 .dentry = old_dentry, 8120 }; 8121 struct btrfs_trans_handle *trans; 8122 unsigned int trans_num_items; 8123 struct btrfs_root *root = BTRFS_I(old_dir)->root; 8124 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 8125 struct inode *new_inode = d_inode(new_dentry); 8126 struct inode *old_inode = d_inode(old_dentry); 8127 struct btrfs_rename_ctx rename_ctx; 8128 u64 index = 0; 8129 int ret; 8130 int ret2; 8131 u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); 8132 struct fscrypt_name old_fname, new_fname; 8133 8134 if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 8135 return -EPERM; 8136 8137 /* we only allow rename subvolume link between subvolumes */ 8138 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 8139 return -EXDEV; 8140 8141 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || 8142 (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID)) 8143 return -ENOTEMPTY; 8144 8145 if (S_ISDIR(old_inode->i_mode) && new_inode && 8146 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 8147 return -ENOTEMPTY; 8148 8149 ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); 8150 if (ret) 8151 return ret; 8152 8153 ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); 8154 if (ret) { 8155 fscrypt_free_filename(&old_fname); 8156 return ret; 8157 } 8158 8159 /* check for collisions, even if the name isn't there */ 8160 ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name); 8161 if (ret) { 8162 if (ret == -EEXIST) { 8163 /* we shouldn't get 8164 * eexist without a new_inode */ 8165 if (WARN_ON(!new_inode)) { 8166 goto out_fscrypt_names; 8167 } 8168 } else { 8169 /* maybe -EOVERFLOW */ 8170 goto out_fscrypt_names; 8171 } 8172 } 8173 ret = 0; 8174 8175 /* 8176 * we're using rename to replace one file with another. 
Start IO on it 8177 * now so we don't add too much work to the end of the transaction 8178 */ 8179 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) 8180 filemap_flush(old_inode->i_mapping); 8181 8182 if (flags & RENAME_WHITEOUT) { 8183 whiteout_args.inode = new_whiteout_inode(idmap, old_dir); 8184 if (!whiteout_args.inode) { 8185 ret = -ENOMEM; 8186 goto out_fscrypt_names; 8187 } 8188 ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items); 8189 if (ret) 8190 goto out_whiteout_inode; 8191 } else { 8192 /* 1 to update the old parent inode. */ 8193 trans_num_items = 1; 8194 } 8195 8196 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 8197 /* Close the race window with snapshot create/destroy ioctl */ 8198 down_read(&fs_info->subvol_sem); 8199 /* 8200 * 1 to remove old root ref 8201 * 1 to remove old root backref 8202 * 1 to add new root ref 8203 * 1 to add new root backref 8204 */ 8205 trans_num_items += 4; 8206 } else { 8207 /* 8208 * 1 to update inode 8209 * 1 to remove old inode ref 8210 * 1 to add new inode ref 8211 */ 8212 trans_num_items += 3; 8213 } 8214 /* 8215 * 1 to remove old dir item 8216 * 1 to remove old dir index 8217 * 1 to add new dir item 8218 * 1 to add new dir index 8219 */ 8220 trans_num_items += 4; 8221 /* 1 to update new parent inode if it's not the same as the old parent */ 8222 if (new_dir != old_dir) 8223 trans_num_items++; 8224 if (new_inode) { 8225 /* 8226 * 1 to update inode 8227 * 1 to remove inode ref 8228 * 1 to remove dir item 8229 * 1 to remove dir index 8230 * 1 to possibly add orphan item 8231 */ 8232 trans_num_items += 5; 8233 } 8234 trans = btrfs_start_transaction(root, trans_num_items); 8235 if (IS_ERR(trans)) { 8236 ret = PTR_ERR(trans); 8237 goto out_notrans; 8238 } 8239 8240 if (dest != root) { 8241 ret = btrfs_record_root_in_trans(trans, dest); 8242 if (ret) 8243 goto out_fail; 8244 } 8245 8246 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index); 8247 if (ret) 8248 goto out_fail; 8249 8250 BTRFS_I(old_inode)->dir_index = 0ULL; 8251 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 8252 /* force full log commit if subvolume involved. 
*/ 8253 btrfs_set_log_full_commit(trans); 8254 } else { 8255 ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name, 8256 old_ino, btrfs_ino(BTRFS_I(new_dir)), 8257 index); 8258 if (ret) 8259 goto out_fail; 8260 } 8261 8262 inode_inc_iversion(old_dir); 8263 inode_inc_iversion(new_dir); 8264 inode_inc_iversion(old_inode); 8265 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); 8266 8267 if (old_dentry->d_parent != new_dentry->d_parent) 8268 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), 8269 BTRFS_I(old_inode), true); 8270 8271 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 8272 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); 8273 } else { 8274 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), 8275 BTRFS_I(d_inode(old_dentry)), 8276 &old_fname.disk_name, &rename_ctx); 8277 if (!ret) 8278 ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); 8279 } 8280 if (ret) { 8281 btrfs_abort_transaction(trans, ret); 8282 goto out_fail; 8283 } 8284 8285 if (new_inode) { 8286 inode_inc_iversion(new_inode); 8287 if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == 8288 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 8289 ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); 8290 BUG_ON(new_inode->i_nlink == 0); 8291 } else { 8292 ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), 8293 BTRFS_I(d_inode(new_dentry)), 8294 &new_fname.disk_name); 8295 } 8296 if (!ret && new_inode->i_nlink == 0) 8297 ret = btrfs_orphan_add(trans, 8298 BTRFS_I(d_inode(new_dentry))); 8299 if (ret) { 8300 btrfs_abort_transaction(trans, ret); 8301 goto out_fail; 8302 } 8303 } 8304 8305 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), 8306 &new_fname.disk_name, 0, index); 8307 if (ret) { 8308 btrfs_abort_transaction(trans, ret); 8309 goto out_fail; 8310 } 8311 8312 if (old_inode->i_nlink == 1) 8313 BTRFS_I(old_inode)->dir_index = index; 8314 8315 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) 8316 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), 8317 rename_ctx.index, new_dentry->d_parent); 8318 8319 if (flags & RENAME_WHITEOUT) { 8320 ret = btrfs_create_new_inode(trans, &whiteout_args); 8321 if (ret) { 8322 btrfs_abort_transaction(trans, ret); 8323 goto out_fail; 8324 } else { 8325 unlock_new_inode(whiteout_args.inode); 8326 iput(whiteout_args.inode); 8327 whiteout_args.inode = NULL; 8328 } 8329 } 8330 out_fail: 8331 ret2 = btrfs_end_transaction(trans); 8332 ret = ret ? 
ret : ret2; 8333 out_notrans: 8334 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 8335 up_read(&fs_info->subvol_sem); 8336 if (flags & RENAME_WHITEOUT) 8337 btrfs_new_inode_args_destroy(&whiteout_args); 8338 out_whiteout_inode: 8339 if (flags & RENAME_WHITEOUT) 8340 iput(whiteout_args.inode); 8341 out_fscrypt_names: 8342 fscrypt_free_filename(&old_fname); 8343 fscrypt_free_filename(&new_fname); 8344 return ret; 8345 } 8346 8347 static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, 8348 struct dentry *old_dentry, struct inode *new_dir, 8349 struct dentry *new_dentry, unsigned int flags) 8350 { 8351 int ret; 8352 8353 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 8354 return -EINVAL; 8355 8356 if (flags & RENAME_EXCHANGE) 8357 ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir, 8358 new_dentry); 8359 else 8360 ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir, 8361 new_dentry, flags); 8362 8363 btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); 8364 8365 return ret; 8366 } 8367 8368 struct btrfs_delalloc_work { 8369 struct inode *inode; 8370 struct completion completion; 8371 struct list_head list; 8372 struct btrfs_work work; 8373 }; 8374 8375 static void btrfs_run_delalloc_work(struct btrfs_work *work) 8376 { 8377 struct btrfs_delalloc_work *delalloc_work; 8378 struct inode *inode; 8379 8380 delalloc_work = container_of(work, struct btrfs_delalloc_work, 8381 work); 8382 inode = delalloc_work->inode; 8383 filemap_flush(inode->i_mapping); 8384 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 8385 &BTRFS_I(inode)->runtime_flags)) 8386 filemap_flush(inode->i_mapping); 8387 8388 iput(inode); 8389 complete(&delalloc_work->completion); 8390 } 8391 8392 static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode) 8393 { 8394 struct btrfs_delalloc_work *work; 8395 8396 work = kmalloc(sizeof(*work), GFP_NOFS); 8397 if (!work) 8398 return NULL; 8399 8400 init_completion(&work->completion); 8401 INIT_LIST_HEAD(&work->list); 8402 work->inode = inode; 8403 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL); 8404 8405 return work; 8406 } 8407 8408 /* 8409 * some fairly slow code that needs optimization. This walks the list 8410 * of all the inodes with pending delalloc and forces them to disk. 
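 *
 * When wbc->nr_to_write is LONG_MAX this is treated as a full flush: each
 * inode is handed to the flush_workers workqueue. Otherwise writeback is
 * started inline and we stop once nr_to_write has been consumed.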
8411 */ 8412 static int start_delalloc_inodes(struct btrfs_root *root, 8413 struct writeback_control *wbc, bool snapshot, 8414 bool in_reclaim_context) 8415 { 8416 struct btrfs_inode *binode; 8417 struct inode *inode; 8418 struct btrfs_delalloc_work *work, *next; 8419 LIST_HEAD(works); 8420 LIST_HEAD(splice); 8421 int ret = 0; 8422 bool full_flush = wbc->nr_to_write == LONG_MAX; 8423 8424 mutex_lock(&root->delalloc_mutex); 8425 spin_lock(&root->delalloc_lock); 8426 list_splice_init(&root->delalloc_inodes, &splice); 8427 while (!list_empty(&splice)) { 8428 binode = list_entry(splice.next, struct btrfs_inode, 8429 delalloc_inodes); 8430 8431 list_move_tail(&binode->delalloc_inodes, 8432 &root->delalloc_inodes); 8433 8434 if (in_reclaim_context && 8435 test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) 8436 continue; 8437 8438 inode = igrab(&binode->vfs_inode); 8439 if (!inode) { 8440 cond_resched_lock(&root->delalloc_lock); 8441 continue; 8442 } 8443 spin_unlock(&root->delalloc_lock); 8444 8445 if (snapshot) 8446 set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, 8447 &binode->runtime_flags); 8448 if (full_flush) { 8449 work = btrfs_alloc_delalloc_work(inode); 8450 if (!work) { 8451 iput(inode); 8452 ret = -ENOMEM; 8453 goto out; 8454 } 8455 list_add_tail(&work->list, &works); 8456 btrfs_queue_work(root->fs_info->flush_workers, 8457 &work->work); 8458 } else { 8459 ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); 8460 btrfs_add_delayed_iput(BTRFS_I(inode)); 8461 if (ret || wbc->nr_to_write <= 0) 8462 goto out; 8463 } 8464 cond_resched(); 8465 spin_lock(&root->delalloc_lock); 8466 } 8467 spin_unlock(&root->delalloc_lock); 8468 8469 out: 8470 list_for_each_entry_safe(work, next, &works, list) { 8471 list_del_init(&work->list); 8472 wait_for_completion(&work->completion); 8473 kfree(work); 8474 } 8475 8476 if (!list_empty(&splice)) { 8477 spin_lock(&root->delalloc_lock); 8478 list_splice_tail(&splice, &root->delalloc_inodes); 8479 spin_unlock(&root->delalloc_lock); 8480 } 8481 mutex_unlock(&root->delalloc_mutex); 8482 return ret; 8483 } 8484 8485 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) 8486 { 8487 struct writeback_control wbc = { 8488 .nr_to_write = LONG_MAX, 8489 .sync_mode = WB_SYNC_NONE, 8490 .range_start = 0, 8491 .range_end = LLONG_MAX, 8492 }; 8493 struct btrfs_fs_info *fs_info = root->fs_info; 8494 8495 if (BTRFS_FS_ERROR(fs_info)) 8496 return -EROFS; 8497 8498 return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); 8499 } 8500 8501 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, 8502 bool in_reclaim_context) 8503 { 8504 struct writeback_control wbc = { 8505 .nr_to_write = nr, 8506 .sync_mode = WB_SYNC_NONE, 8507 .range_start = 0, 8508 .range_end = LLONG_MAX, 8509 }; 8510 struct btrfs_root *root; 8511 LIST_HEAD(splice); 8512 int ret; 8513 8514 if (BTRFS_FS_ERROR(fs_info)) 8515 return -EROFS; 8516 8517 mutex_lock(&fs_info->delalloc_root_mutex); 8518 spin_lock(&fs_info->delalloc_root_lock); 8519 list_splice_init(&fs_info->delalloc_roots, &splice); 8520 while (!list_empty(&splice)) { 8521 /* 8522 * Reset nr_to_write here so we know that we're doing a full 8523 * flush. 
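		 * (Each start_delalloc_inodes() call below consumes
		 * wbc.nr_to_write, so when the caller asked for everything,
		 * i.e. nr == LONG_MAX, we restore it for every root.)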
8524 */ 8525 if (nr == LONG_MAX) 8526 wbc.nr_to_write = LONG_MAX; 8527 8528 root = list_first_entry(&splice, struct btrfs_root, 8529 delalloc_root); 8530 root = btrfs_grab_root(root); 8531 BUG_ON(!root); 8532 list_move_tail(&root->delalloc_root, 8533 &fs_info->delalloc_roots); 8534 spin_unlock(&fs_info->delalloc_root_lock); 8535 8536 ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); 8537 btrfs_put_root(root); 8538 if (ret < 0 || wbc.nr_to_write <= 0) 8539 goto out; 8540 spin_lock(&fs_info->delalloc_root_lock); 8541 } 8542 spin_unlock(&fs_info->delalloc_root_lock); 8543 8544 ret = 0; 8545 out: 8546 if (!list_empty(&splice)) { 8547 spin_lock(&fs_info->delalloc_root_lock); 8548 list_splice_tail(&splice, &fs_info->delalloc_roots); 8549 spin_unlock(&fs_info->delalloc_root_lock); 8550 } 8551 mutex_unlock(&fs_info->delalloc_root_mutex); 8552 return ret; 8553 } 8554 8555 static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, 8556 struct dentry *dentry, const char *symname) 8557 { 8558 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); 8559 struct btrfs_trans_handle *trans; 8560 struct btrfs_root *root = BTRFS_I(dir)->root; 8561 struct btrfs_path *path; 8562 struct btrfs_key key; 8563 struct inode *inode; 8564 struct btrfs_new_inode_args new_inode_args = { 8565 .dir = dir, 8566 .dentry = dentry, 8567 }; 8568 unsigned int trans_num_items; 8569 int err; 8570 int name_len; 8571 int datasize; 8572 unsigned long ptr; 8573 struct btrfs_file_extent_item *ei; 8574 struct extent_buffer *leaf; 8575 8576 name_len = strlen(symname); 8577 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) 8578 return -ENAMETOOLONG; 8579 8580 inode = new_inode(dir->i_sb); 8581 if (!inode) 8582 return -ENOMEM; 8583 inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO); 8584 inode->i_op = &btrfs_symlink_inode_operations; 8585 inode_nohighmem(inode); 8586 inode->i_mapping->a_ops = &btrfs_aops; 8587 btrfs_i_size_write(BTRFS_I(inode), name_len); 8588 inode_set_bytes(inode, name_len); 8589 8590 new_inode_args.inode = inode; 8591 err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); 8592 if (err) 8593 goto out_inode; 8594 /* 1 additional item for the inline extent */ 8595 trans_num_items++; 8596 8597 trans = btrfs_start_transaction(root, trans_num_items); 8598 if (IS_ERR(trans)) { 8599 err = PTR_ERR(trans); 8600 goto out_new_inode_args; 8601 } 8602 8603 err = btrfs_create_new_inode(trans, &new_inode_args); 8604 if (err) 8605 goto out; 8606 8607 path = btrfs_alloc_path(); 8608 if (!path) { 8609 err = -ENOMEM; 8610 btrfs_abort_transaction(trans, err); 8611 discard_new_inode(inode); 8612 inode = NULL; 8613 goto out; 8614 } 8615 key.objectid = btrfs_ino(BTRFS_I(inode)); 8616 key.offset = 0; 8617 key.type = BTRFS_EXTENT_DATA_KEY; 8618 datasize = btrfs_file_extent_calc_inline_size(name_len); 8619 err = btrfs_insert_empty_item(trans, root, path, &key, 8620 datasize); 8621 if (err) { 8622 btrfs_abort_transaction(trans, err); 8623 btrfs_free_path(path); 8624 discard_new_inode(inode); 8625 inode = NULL; 8626 goto out; 8627 } 8628 leaf = path->nodes[0]; 8629 ei = btrfs_item_ptr(leaf, path->slots[0], 8630 struct btrfs_file_extent_item); 8631 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 8632 btrfs_set_file_extent_type(leaf, ei, 8633 BTRFS_FILE_EXTENT_INLINE); 8634 btrfs_set_file_extent_encryption(leaf, ei, 0); 8635 btrfs_set_file_extent_compression(leaf, ei, 0); 8636 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 8637 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); 8638 
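	/*
	 * The symlink target is stored as an inline file extent. name_len was
	 * checked against BTRFS_MAX_INLINE_DATA_SIZE() above and the item was
	 * sized with btrfs_file_extent_calc_inline_size(name_len), so the name
	 * is known to fit.
	 */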
8639 ptr = btrfs_file_extent_inline_start(ei); 8640 write_extent_buffer(leaf, symname, ptr, name_len); 8641 btrfs_mark_buffer_dirty(trans, leaf); 8642 btrfs_free_path(path); 8643 8644 d_instantiate_new(dentry, inode); 8645 err = 0; 8646 out: 8647 btrfs_end_transaction(trans); 8648 btrfs_btree_balance_dirty(fs_info); 8649 out_new_inode_args: 8650 btrfs_new_inode_args_destroy(&new_inode_args); 8651 out_inode: 8652 if (err) 8653 iput(inode); 8654 return err; 8655 } 8656 8657 static struct btrfs_trans_handle *insert_prealloc_file_extent( 8658 struct btrfs_trans_handle *trans_in, 8659 struct btrfs_inode *inode, 8660 struct btrfs_key *ins, 8661 u64 file_offset) 8662 { 8663 struct btrfs_file_extent_item stack_fi; 8664 struct btrfs_replace_extent_info extent_info; 8665 struct btrfs_trans_handle *trans = trans_in; 8666 struct btrfs_path *path; 8667 u64 start = ins->objectid; 8668 u64 len = ins->offset; 8669 u64 qgroup_released = 0; 8670 int ret; 8671 8672 memset(&stack_fi, 0, sizeof(stack_fi)); 8673 8674 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC); 8675 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start); 8676 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len); 8677 btrfs_set_stack_file_extent_num_bytes(&stack_fi, len); 8678 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len); 8679 btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); 8680 /* Encryption and other encoding is reserved and all 0 */ 8681 8682 ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released); 8683 if (ret < 0) 8684 return ERR_PTR(ret); 8685 8686 if (trans) { 8687 ret = insert_reserved_file_extent(trans, inode, 8688 file_offset, &stack_fi, 8689 true, qgroup_released); 8690 if (ret) 8691 goto free_qgroup; 8692 return trans; 8693 } 8694 8695 extent_info.disk_offset = start; 8696 extent_info.disk_len = len; 8697 extent_info.data_offset = 0; 8698 extent_info.data_len = len; 8699 extent_info.file_offset = file_offset; 8700 extent_info.extent_buf = (char *)&stack_fi; 8701 extent_info.is_new_extent = true; 8702 extent_info.update_times = true; 8703 extent_info.qgroup_reserved = qgroup_released; 8704 extent_info.insertions = 0; 8705 8706 path = btrfs_alloc_path(); 8707 if (!path) { 8708 ret = -ENOMEM; 8709 goto free_qgroup; 8710 } 8711 8712 ret = btrfs_replace_file_extents(inode, path, file_offset, 8713 file_offset + len - 1, &extent_info, 8714 &trans); 8715 btrfs_free_path(path); 8716 if (ret) 8717 goto free_qgroup; 8718 return trans; 8719 8720 free_qgroup: 8721 /* 8722 * We have released qgroup data range at the beginning of the function, 8723 * and normally qgroup_released bytes will be freed when committing 8724 * transaction. 8725 * But if we error out early, we have to free what we have released 8726 * or we leak qgroup data reservation. 
 */
	btrfs_qgroup_free_refroot(inode->root->fs_info,
				  btrfs_root_id(inode->root), qgroup_released,
				  BTRFS_QGROUP_RSV_DATA);
	return ERR_PTR(ret);
}

static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
				       u64 start, u64 num_bytes, u64 min_size,
				       loff_t actual_len, u64 *alloc_hint,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key ins;
	u64 cur_offset = start;
	u64 clear_offset = start;
	u64 i_size;
	u64 cur_bytes;
	u64 last_alloc = (u64)-1;
	int ret = 0;
	bool own_trans = true;
	u64 end = start + num_bytes - 1;

	if (trans)
		own_trans = false;
	while (num_bytes > 0) {
		cur_bytes = min_t(u64, num_bytes, SZ_256M);
		cur_bytes = max(cur_bytes, min_size);
		/*
		 * If we are severely fragmented we could end up with really
		 * small allocations, so if the allocator is returning small
		 * chunks let's make its job easier by only searching for
		 * chunks of that size.
		 */
		cur_bytes = min(cur_bytes, last_alloc);
		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
					   min_size, 0, *alloc_hint, &ins, 1, 0);
		if (ret)
			break;

		/*
		 * We've reserved this space, and thus converted it from
		 * ->bytes_may_use to ->bytes_reserved. For any error that
		 * happens from here on out we only need to clear our
		 * reservation for the remaining unreserved area, so advance
		 * our clear_offset by our extent size.
		 */
		clear_offset += ins.offset;

		last_alloc = ins.offset;
		trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
						    &ins, cur_offset);
		/*
		 * Now that we inserted the prealloc extent we can finally
		 * decrement the number of reservations in the block group.
		 * If we did it before, we could race with relocation and have
		 * relocation miss the reserved extent, making it fail later.
8786 */ 8787 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 8788 if (IS_ERR(trans)) { 8789 ret = PTR_ERR(trans); 8790 btrfs_free_reserved_extent(fs_info, ins.objectid, 8791 ins.offset, 0); 8792 break; 8793 } 8794 8795 em = alloc_extent_map(); 8796 if (!em) { 8797 btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset, 8798 cur_offset + ins.offset - 1, false); 8799 btrfs_set_inode_full_sync(BTRFS_I(inode)); 8800 goto next; 8801 } 8802 8803 em->start = cur_offset; 8804 em->len = ins.offset; 8805 em->disk_bytenr = ins.objectid; 8806 em->offset = 0; 8807 em->disk_num_bytes = ins.offset; 8808 em->ram_bytes = ins.offset; 8809 em->flags |= EXTENT_FLAG_PREALLOC; 8810 em->generation = trans->transid; 8811 8812 ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); 8813 free_extent_map(em); 8814 next: 8815 num_bytes -= ins.offset; 8816 cur_offset += ins.offset; 8817 *alloc_hint = ins.objectid + ins.offset; 8818 8819 inode_inc_iversion(inode); 8820 inode_set_ctime_current(inode); 8821 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 8822 if (!(mode & FALLOC_FL_KEEP_SIZE) && 8823 (actual_len > inode->i_size) && 8824 (cur_offset > inode->i_size)) { 8825 if (cur_offset > actual_len) 8826 i_size = actual_len; 8827 else 8828 i_size = cur_offset; 8829 i_size_write(inode, i_size); 8830 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 8831 } 8832 8833 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 8834 8835 if (ret) { 8836 btrfs_abort_transaction(trans, ret); 8837 if (own_trans) 8838 btrfs_end_transaction(trans); 8839 break; 8840 } 8841 8842 if (own_trans) { 8843 btrfs_end_transaction(trans); 8844 trans = NULL; 8845 } 8846 } 8847 if (clear_offset < end) 8848 btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, 8849 end - clear_offset + 1); 8850 return ret; 8851 } 8852 8853 int btrfs_prealloc_file_range(struct inode *inode, int mode, 8854 u64 start, u64 num_bytes, u64 min_size, 8855 loff_t actual_len, u64 *alloc_hint) 8856 { 8857 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 8858 min_size, actual_len, alloc_hint, 8859 NULL); 8860 } 8861 8862 int btrfs_prealloc_file_range_trans(struct inode *inode, 8863 struct btrfs_trans_handle *trans, int mode, 8864 u64 start, u64 num_bytes, u64 min_size, 8865 loff_t actual_len, u64 *alloc_hint) 8866 { 8867 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 8868 min_size, actual_len, alloc_hint, trans); 8869 } 8870 8871 static int btrfs_permission(struct mnt_idmap *idmap, 8872 struct inode *inode, int mask) 8873 { 8874 struct btrfs_root *root = BTRFS_I(inode)->root; 8875 umode_t mode = inode->i_mode; 8876 8877 if (mask & MAY_WRITE && 8878 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { 8879 if (btrfs_root_readonly(root)) 8880 return -EROFS; 8881 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) 8882 return -EACCES; 8883 } 8884 return generic_permission(idmap, inode, mask); 8885 } 8886 8887 static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, 8888 struct file *file, umode_t mode) 8889 { 8890 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); 8891 struct btrfs_trans_handle *trans; 8892 struct btrfs_root *root = BTRFS_I(dir)->root; 8893 struct inode *inode; 8894 struct btrfs_new_inode_args new_inode_args = { 8895 .dir = dir, 8896 .dentry = file->f_path.dentry, 8897 .orphan = true, 8898 }; 8899 unsigned int trans_num_items; 8900 int ret; 8901 8902 inode = new_inode(dir->i_sb); 8903 if (!inode) 8904 return -ENOMEM; 8905 inode_init_owner(idmap, inode, dir, mode); 8906 inode->i_fop 
= &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;
	inode->i_mapping->a_ops = &btrfs_aops;

	new_inode_args.inode = inode;
	ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
	if (ret)
		goto out_inode;

	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_new_inode_args;
	}

	ret = btrfs_create_new_inode(trans, &new_inode_args);

	/*
	 * We set number of links to 0 in btrfs_create_new_inode(), and here we
	 * set it to 1 because d_tmpfile() will issue a warning if the count is
	 * 0, through:
	 *
	 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
	 */
	set_nlink(inode, 1);

	if (!ret) {
		d_tmpfile(file, inode);
		unlock_new_inode(inode);
		mark_inode_dirty(inode);
	}

	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
	btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
	if (ret)
		iput(inode);
	return finish_open_simple(file, ret);
}

void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;
	u32 len;

	ASSERT(end + 1 - start <= U32_MAX);
	len = end + 1 - start;
	while (index <= end_index) {
		page = find_get_page(inode->vfs_inode.i_mapping, index);
		ASSERT(page); /* Pages should be in the extent_io_tree */

		/* This is for data, which doesn't support large folios yet. */
		ASSERT(folio_order(page_folio(page)) == 0);
		btrfs_folio_set_writeback(fs_info, page_folio(page), start, len);
		put_page(page);
		index++;
	}
}

int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
					     int compress_type)
{
	switch (compress_type) {
	case BTRFS_COMPRESS_NONE:
		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
	case BTRFS_COMPRESS_ZLIB:
		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
	case BTRFS_COMPRESS_LZO:
		/*
		 * The LZO format depends on the sector size. 64K is the maximum
		 * sector size that we support.
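		 * For example, a filesystem with a 16K sector size has
		 * sectorsize_bits == 14, so the value returned below is
		 * BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 2, i.e.
		 * BTRFS_ENCODED_IO_COMPRESSION_LZO_16K.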
8982 */ 8983 if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K) 8984 return -EINVAL; 8985 return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 8986 (fs_info->sectorsize_bits - 12); 8987 case BTRFS_COMPRESS_ZSTD: 8988 return BTRFS_ENCODED_IO_COMPRESSION_ZSTD; 8989 default: 8990 return -EUCLEAN; 8991 } 8992 } 8993 8994 static ssize_t btrfs_encoded_read_inline( 8995 struct kiocb *iocb, 8996 struct iov_iter *iter, u64 start, 8997 u64 lockend, 8998 struct extent_state **cached_state, 8999 u64 extent_start, size_t count, 9000 struct btrfs_ioctl_encoded_io_args *encoded, 9001 bool *unlocked) 9002 { 9003 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 9004 struct btrfs_root *root = inode->root; 9005 struct btrfs_fs_info *fs_info = root->fs_info; 9006 struct extent_io_tree *io_tree = &inode->io_tree; 9007 struct btrfs_path *path; 9008 struct extent_buffer *leaf; 9009 struct btrfs_file_extent_item *item; 9010 u64 ram_bytes; 9011 unsigned long ptr; 9012 void *tmp; 9013 ssize_t ret; 9014 9015 path = btrfs_alloc_path(); 9016 if (!path) { 9017 ret = -ENOMEM; 9018 goto out; 9019 } 9020 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), 9021 extent_start, 0); 9022 if (ret) { 9023 if (ret > 0) { 9024 /* The extent item disappeared? */ 9025 ret = -EIO; 9026 } 9027 goto out; 9028 } 9029 leaf = path->nodes[0]; 9030 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 9031 9032 ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); 9033 ptr = btrfs_file_extent_inline_start(item); 9034 9035 encoded->len = min_t(u64, extent_start + ram_bytes, 9036 inode->vfs_inode.i_size) - iocb->ki_pos; 9037 ret = btrfs_encoded_io_compression_from_extent(fs_info, 9038 btrfs_file_extent_compression(leaf, item)); 9039 if (ret < 0) 9040 goto out; 9041 encoded->compression = ret; 9042 if (encoded->compression) { 9043 size_t inline_size; 9044 9045 inline_size = btrfs_file_extent_inline_item_len(leaf, 9046 path->slots[0]); 9047 if (inline_size > count) { 9048 ret = -ENOBUFS; 9049 goto out; 9050 } 9051 count = inline_size; 9052 encoded->unencoded_len = ram_bytes; 9053 encoded->unencoded_offset = iocb->ki_pos - extent_start; 9054 } else { 9055 count = min_t(u64, count, encoded->len); 9056 encoded->len = count; 9057 encoded->unencoded_len = count; 9058 ptr += iocb->ki_pos - extent_start; 9059 } 9060 9061 tmp = kmalloc(count, GFP_NOFS); 9062 if (!tmp) { 9063 ret = -ENOMEM; 9064 goto out; 9065 } 9066 read_extent_buffer(leaf, tmp, ptr, count); 9067 btrfs_release_path(path); 9068 unlock_extent(io_tree, start, lockend, cached_state); 9069 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 9070 *unlocked = true; 9071 9072 ret = copy_to_iter(tmp, count, iter); 9073 if (ret != count) 9074 ret = -EFAULT; 9075 kfree(tmp); 9076 out: 9077 btrfs_free_path(path); 9078 return ret; 9079 } 9080 9081 struct btrfs_encoded_read_private { 9082 wait_queue_head_t wait; 9083 atomic_t pending; 9084 blk_status_t status; 9085 }; 9086 9087 static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) 9088 { 9089 struct btrfs_encoded_read_private *priv = bbio->private; 9090 9091 if (bbio->bio.bi_status) { 9092 /* 9093 * The memory barrier implied by the atomic_dec_return() here 9094 * pairs with the memory barrier implied by the 9095 * atomic_dec_return() or io_wait_event() in 9096 * btrfs_encoded_read_regular_fill_pages() to ensure that this 9097 * write is observed before the load of status in 9098 * btrfs_encoded_read_regular_fill_pages(). 
9099 */ 9100 WRITE_ONCE(priv->status, bbio->bio.bi_status); 9101 } 9102 if (!atomic_dec_return(&priv->pending)) 9103 wake_up(&priv->wait); 9104 bio_put(&bbio->bio); 9105 } 9106 9107 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, 9108 u64 file_offset, u64 disk_bytenr, 9109 u64 disk_io_size, struct page **pages) 9110 { 9111 struct btrfs_fs_info *fs_info = inode->root->fs_info; 9112 struct btrfs_encoded_read_private priv = { 9113 .pending = ATOMIC_INIT(1), 9114 }; 9115 unsigned long i = 0; 9116 struct btrfs_bio *bbio; 9117 9118 init_waitqueue_head(&priv.wait); 9119 9120 bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, 9121 btrfs_encoded_read_endio, &priv); 9122 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; 9123 bbio->inode = inode; 9124 9125 do { 9126 size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); 9127 9128 if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { 9129 atomic_inc(&priv.pending); 9130 btrfs_submit_bio(bbio, 0); 9131 9132 bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, 9133 btrfs_encoded_read_endio, &priv); 9134 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; 9135 bbio->inode = inode; 9136 continue; 9137 } 9138 9139 i++; 9140 disk_bytenr += bytes; 9141 disk_io_size -= bytes; 9142 } while (disk_io_size); 9143 9144 atomic_inc(&priv.pending); 9145 btrfs_submit_bio(bbio, 0); 9146 9147 if (atomic_dec_return(&priv.pending)) 9148 io_wait_event(priv.wait, !atomic_read(&priv.pending)); 9149 /* See btrfs_encoded_read_endio() for ordering. */ 9150 return blk_status_to_errno(READ_ONCE(priv.status)); 9151 } 9152 9153 static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, 9154 struct iov_iter *iter, 9155 u64 start, u64 lockend, 9156 struct extent_state **cached_state, 9157 u64 disk_bytenr, u64 disk_io_size, 9158 size_t count, bool compressed, 9159 bool *unlocked) 9160 { 9161 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 9162 struct extent_io_tree *io_tree = &inode->io_tree; 9163 struct page **pages; 9164 unsigned long nr_pages, i; 9165 u64 cur; 9166 size_t page_offset; 9167 ssize_t ret; 9168 9169 nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); 9170 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); 9171 if (!pages) 9172 return -ENOMEM; 9173 ret = btrfs_alloc_page_array(nr_pages, pages, false); 9174 if (ret) { 9175 ret = -ENOMEM; 9176 goto out; 9177 } 9178 9179 ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, 9180 disk_io_size, pages); 9181 if (ret) 9182 goto out; 9183 9184 unlock_extent(io_tree, start, lockend, cached_state); 9185 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 9186 *unlocked = true; 9187 9188 if (compressed) { 9189 i = 0; 9190 page_offset = 0; 9191 } else { 9192 i = (iocb->ki_pos - start) >> PAGE_SHIFT; 9193 page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1); 9194 } 9195 cur = 0; 9196 while (cur < count) { 9197 size_t bytes = min_t(size_t, count - cur, 9198 PAGE_SIZE - page_offset); 9199 9200 if (copy_page_to_iter(pages[i], page_offset, bytes, 9201 iter) != bytes) { 9202 ret = -EFAULT; 9203 goto out; 9204 } 9205 i++; 9206 cur += bytes; 9207 page_offset = 0; 9208 } 9209 ret = count; 9210 out: 9211 for (i = 0; i < nr_pages; i++) { 9212 if (pages[i]) 9213 __free_page(pages[i]); 9214 } 9215 kfree(pages); 9216 return ret; 9217 } 9218 9219 ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, 9220 struct btrfs_ioctl_encoded_io_args *encoded) 9221 { 9222 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 9223 struct 
btrfs_fs_info *fs_info = inode->root->fs_info; 9224 struct extent_io_tree *io_tree = &inode->io_tree; 9225 ssize_t ret; 9226 size_t count = iov_iter_count(iter); 9227 u64 start, lockend, disk_bytenr, disk_io_size; 9228 struct extent_state *cached_state = NULL; 9229 struct extent_map *em; 9230 bool unlocked = false; 9231 9232 file_accessed(iocb->ki_filp); 9233 9234 btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); 9235 9236 if (iocb->ki_pos >= inode->vfs_inode.i_size) { 9237 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 9238 return 0; 9239 } 9240 start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); 9241 /* 9242 * We don't know how long the extent containing iocb->ki_pos is, but if 9243 * it's compressed we know that it won't be longer than this. 9244 */ 9245 lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; 9246 9247 for (;;) { 9248 struct btrfs_ordered_extent *ordered; 9249 9250 ret = btrfs_wait_ordered_range(inode, start, 9251 lockend - start + 1); 9252 if (ret) 9253 goto out_unlock_inode; 9254 lock_extent(io_tree, start, lockend, &cached_state); 9255 ordered = btrfs_lookup_ordered_range(inode, start, 9256 lockend - start + 1); 9257 if (!ordered) 9258 break; 9259 btrfs_put_ordered_extent(ordered); 9260 unlock_extent(io_tree, start, lockend, &cached_state); 9261 cond_resched(); 9262 } 9263 9264 em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); 9265 if (IS_ERR(em)) { 9266 ret = PTR_ERR(em); 9267 goto out_unlock_extent; 9268 } 9269 9270 if (em->disk_bytenr == EXTENT_MAP_INLINE) { 9271 u64 extent_start = em->start; 9272 9273 /* 9274 * For inline extents we get everything we need out of the 9275 * extent item. 9276 */ 9277 free_extent_map(em); 9278 em = NULL; 9279 ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, 9280 &cached_state, extent_start, 9281 count, encoded, &unlocked); 9282 goto out; 9283 } 9284 9285 /* 9286 * We only want to return up to EOF even if the extent extends beyond 9287 * that. 9288 */ 9289 encoded->len = min_t(u64, extent_map_end(em), 9290 inode->vfs_inode.i_size) - iocb->ki_pos; 9291 if (em->disk_bytenr == EXTENT_MAP_HOLE || 9292 (em->flags & EXTENT_FLAG_PREALLOC)) { 9293 disk_bytenr = EXTENT_MAP_HOLE; 9294 count = min_t(u64, count, encoded->len); 9295 encoded->len = count; 9296 encoded->unencoded_len = count; 9297 } else if (extent_map_is_compressed(em)) { 9298 disk_bytenr = em->disk_bytenr; 9299 /* 9300 * Bail if the buffer isn't large enough to return the whole 9301 * compressed extent. 9302 */ 9303 if (em->disk_num_bytes > count) { 9304 ret = -ENOBUFS; 9305 goto out_em; 9306 } 9307 disk_io_size = em->disk_num_bytes; 9308 count = em->disk_num_bytes; 9309 encoded->unencoded_len = em->ram_bytes; 9310 encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); 9311 ret = btrfs_encoded_io_compression_from_extent(fs_info, 9312 extent_map_compression(em)); 9313 if (ret < 0) 9314 goto out_em; 9315 encoded->compression = ret; 9316 } else { 9317 disk_bytenr = extent_map_block_start(em) + (start - em->start); 9318 if (encoded->len > count) 9319 encoded->len = count; 9320 /* 9321 * Don't read beyond what we locked. This also limits the page 9322 * allocations that we'll do. 
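* disk_io_size below therefore spans from the sector-aligned start to whichever of lockend + 1 or the end of the requested range comes first, and is rounded up to a sector boundary before the read is issued.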
9323 */ 9324 disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; 9325 count = start + disk_io_size - iocb->ki_pos; 9326 encoded->len = count; 9327 encoded->unencoded_len = count; 9328 disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); 9329 } 9330 free_extent_map(em); 9331 em = NULL; 9332 9333 if (disk_bytenr == EXTENT_MAP_HOLE) { 9334 unlock_extent(io_tree, start, lockend, &cached_state); 9335 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 9336 unlocked = true; 9337 ret = iov_iter_zero(count, iter); 9338 if (ret != count) 9339 ret = -EFAULT; 9340 } else { 9341 ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, 9342 &cached_state, disk_bytenr, 9343 disk_io_size, count, 9344 encoded->compression, 9345 &unlocked); 9346 } 9347 9348 out: 9349 if (ret >= 0) 9350 iocb->ki_pos += encoded->len; 9351 out_em: 9352 free_extent_map(em); 9353 out_unlock_extent: 9354 if (!unlocked) 9355 unlock_extent(io_tree, start, lockend, &cached_state); 9356 out_unlock_inode: 9357 if (!unlocked) 9358 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 9359 return ret; 9360 } 9361 9362 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, 9363 const struct btrfs_ioctl_encoded_io_args *encoded) 9364 { 9365 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 9366 struct btrfs_root *root = inode->root; 9367 struct btrfs_fs_info *fs_info = root->fs_info; 9368 struct extent_io_tree *io_tree = &inode->io_tree; 9369 struct extent_changeset *data_reserved = NULL; 9370 struct extent_state *cached_state = NULL; 9371 struct btrfs_ordered_extent *ordered; 9372 struct btrfs_file_extent file_extent; 9373 int compression; 9374 size_t orig_count; 9375 u64 start, end; 9376 u64 num_bytes, ram_bytes, disk_num_bytes; 9377 unsigned long nr_folios, i; 9378 struct folio **folios; 9379 struct btrfs_key ins; 9380 bool extent_reserved = false; 9381 struct extent_map *em; 9382 ssize_t ret; 9383 9384 switch (encoded->compression) { 9385 case BTRFS_ENCODED_IO_COMPRESSION_ZLIB: 9386 compression = BTRFS_COMPRESS_ZLIB; 9387 break; 9388 case BTRFS_ENCODED_IO_COMPRESSION_ZSTD: 9389 compression = BTRFS_COMPRESS_ZSTD; 9390 break; 9391 case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K: 9392 case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K: 9393 case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K: 9394 case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K: 9395 case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K: 9396 /* The sector size must match for LZO. */ 9397 if (encoded->compression - 9398 BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 != 9399 fs_info->sectorsize_bits) 9400 return -EINVAL; 9401 compression = BTRFS_COMPRESS_LZO; 9402 break; 9403 default: 9404 return -EINVAL; 9405 } 9406 if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) 9407 return -EINVAL; 9408 9409 /* 9410 * Compressed extents should always have checksums, so error out if we 9411 * have a NOCOW file or inode was created while mounted with NODATASUM. 9412 */ 9413 if (inode->flags & BTRFS_INODE_NODATASUM) 9414 return -EINVAL; 9415 9416 orig_count = iov_iter_count(from); 9417 9418 /* The extent size must be sane. */ 9419 if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED || 9420 orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0) 9421 return -EINVAL; 9422 9423 /* 9424 * The compressed data must be smaller than the decompressed data. 
9425 * 9426 * It's of course possible for data to compress to larger or the same 9427 * size, but the buffered I/O path falls back to no compression for such 9428 * data, and we don't want to break any assumptions by creating these 9429 * extents. 9430 * 9431 * Note that this is less strict than the current check we have that the 9432 * compressed data must be at least one sector smaller than the 9433 * decompressed data. We only want to enforce the weaker requirement 9434 * from old kernels that it is at least one byte smaller. 9435 */ 9436 if (orig_count >= encoded->unencoded_len) 9437 return -EINVAL; 9438 9439 /* The extent must start on a sector boundary. */ 9440 start = iocb->ki_pos; 9441 if (!IS_ALIGNED(start, fs_info->sectorsize)) 9442 return -EINVAL; 9443 9444 /* 9445 * The extent must end on a sector boundary. However, we allow a write 9446 * which ends at or extends i_size to have an unaligned length; we round 9447 * up the extent size and set i_size to the unaligned end. 9448 */ 9449 if (start + encoded->len < inode->vfs_inode.i_size && 9450 !IS_ALIGNED(start + encoded->len, fs_info->sectorsize)) 9451 return -EINVAL; 9452 9453 /* Finally, the offset in the unencoded data must be sector-aligned. */ 9454 if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize)) 9455 return -EINVAL; 9456 9457 num_bytes = ALIGN(encoded->len, fs_info->sectorsize); 9458 ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize); 9459 end = start + num_bytes - 1; 9460 9461 /* 9462 * If the extent cannot be inline, the compressed data on disk must be 9463 * sector-aligned. For convenience, we extend it with zeroes if it 9464 * isn't. 9465 */ 9466 disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); 9467 nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); 9468 folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT); 9469 if (!folios) 9470 return -ENOMEM; 9471 for (i = 0; i < nr_folios; i++) { 9472 size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); 9473 char *kaddr; 9474 9475 folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0); 9476 if (!folios[i]) { 9477 ret = -ENOMEM; 9478 goto out_folios; 9479 } 9480 kaddr = kmap_local_folio(folios[i], 0); 9481 if (copy_from_iter(kaddr, bytes, from) != bytes) { 9482 kunmap_local(kaddr); 9483 ret = -EFAULT; 9484 goto out_folios; 9485 } 9486 if (bytes < PAGE_SIZE) 9487 memset(kaddr + bytes, 0, PAGE_SIZE - bytes); 9488 kunmap_local(kaddr); 9489 } 9490 9491 for (;;) { 9492 struct btrfs_ordered_extent *ordered; 9493 9494 ret = btrfs_wait_ordered_range(inode, start, num_bytes); 9495 if (ret) 9496 goto out_folios; 9497 ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, 9498 start >> PAGE_SHIFT, 9499 end >> PAGE_SHIFT); 9500 if (ret) 9501 goto out_folios; 9502 lock_extent(io_tree, start, end, &cached_state); 9503 ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); 9504 if (!ordered && 9505 !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) 9506 break; 9507 if (ordered) 9508 btrfs_put_ordered_extent(ordered); 9509 unlock_extent(io_tree, start, end, &cached_state); 9510 cond_resched(); 9511 } 9512 9513 /* 9514 * We don't use the higher-level delalloc space functions because our 9515 * num_bytes and disk_num_bytes are different. 
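* Below, data space is reserved for disk_num_bytes (the compressed bytes that actually hit the disk), while the qgroup reservation covers num_bytes (the logical file range being written).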
9516 */ 9517 ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes); 9518 if (ret) 9519 goto out_unlock; 9520 ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); 9521 if (ret) 9522 goto out_free_data_space; 9523 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes, 9524 false); 9525 if (ret) 9526 goto out_qgroup_free_data; 9527 9528 /* Try an inline extent first. */ 9529 if (encoded->unencoded_len == encoded->len && 9530 encoded->unencoded_offset == 0 && 9531 can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { 9532 ret = __cow_file_range_inline(inode, start, encoded->len, 9533 orig_count, compression, folios[0], 9534 true); 9535 if (ret <= 0) { 9536 if (ret == 0) 9537 ret = orig_count; 9538 goto out_delalloc_release; 9539 } 9540 } 9541 9542 ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, 9543 disk_num_bytes, 0, 0, &ins, 1, 1); 9544 if (ret) 9545 goto out_delalloc_release; 9546 extent_reserved = true; 9547 9548 file_extent.disk_bytenr = ins.objectid; 9549 file_extent.disk_num_bytes = ins.offset; 9550 file_extent.num_bytes = num_bytes; 9551 file_extent.ram_bytes = ram_bytes; 9552 file_extent.offset = encoded->unencoded_offset; 9553 file_extent.compression = compression; 9554 em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); 9555 if (IS_ERR(em)) { 9556 ret = PTR_ERR(em); 9557 goto out_free_reserved; 9558 } 9559 free_extent_map(em); 9560 9561 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, 9562 (1 << BTRFS_ORDERED_ENCODED) | 9563 (1 << BTRFS_ORDERED_COMPRESSED)); 9564 if (IS_ERR(ordered)) { 9565 btrfs_drop_extent_map_range(inode, start, end, false); 9566 ret = PTR_ERR(ordered); 9567 goto out_free_reserved; 9568 } 9569 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 9570 9571 if (start + encoded->len > inode->vfs_inode.i_size) 9572 i_size_write(&inode->vfs_inode, start + encoded->len); 9573 9574 unlock_extent(io_tree, start, end, &cached_state); 9575 9576 btrfs_delalloc_release_extents(inode, num_bytes); 9577 9578 btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false); 9579 ret = orig_count; 9580 goto out; 9581 9582 out_free_reserved: 9583 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 9584 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); 9585 out_delalloc_release: 9586 btrfs_delalloc_release_extents(inode, num_bytes); 9587 btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); 9588 out_qgroup_free_data: 9589 if (ret < 0) 9590 btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL); 9591 out_free_data_space: 9592 /* 9593 * If btrfs_reserve_extent() succeeded, then we already decremented 9594 * bytes_may_use. 9595 */ 9596 if (!extent_reserved) 9597 btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); 9598 out_unlock: 9599 unlock_extent(io_tree, start, end, &cached_state); 9600 out_folios: 9601 for (i = 0; i < nr_folios; i++) { 9602 if (folios[i]) 9603 folio_put(folios[i]); 9604 } 9605 kvfree(folios); 9606 out: 9607 if (ret >= 0) 9608 iocb->ki_pos += encoded->len; 9609 return ret; 9610 } 9611 9612 #ifdef CONFIG_SWAP 9613 /* 9614 * Add an entry indicating a block group or device which is pinned by a 9615 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a 9616 * negative errno on failure. 
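* Entries live in fs_info->swapfile_pins, an rbtree ordered by (ptr, inode). For block group entries, bg_extent_count tracks how many swap extents of this file fall inside the block group.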
9617 */ 9618 static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, 9619 bool is_block_group) 9620 { 9621 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 9622 struct btrfs_swapfile_pin *sp, *entry; 9623 struct rb_node **p; 9624 struct rb_node *parent = NULL; 9625 9626 sp = kmalloc(sizeof(*sp), GFP_NOFS); 9627 if (!sp) 9628 return -ENOMEM; 9629 sp->ptr = ptr; 9630 sp->inode = inode; 9631 sp->is_block_group = is_block_group; 9632 sp->bg_extent_count = 1; 9633 9634 spin_lock(&fs_info->swapfile_pins_lock); 9635 p = &fs_info->swapfile_pins.rb_node; 9636 while (*p) { 9637 parent = *p; 9638 entry = rb_entry(parent, struct btrfs_swapfile_pin, node); 9639 if (sp->ptr < entry->ptr || 9640 (sp->ptr == entry->ptr && sp->inode < entry->inode)) { 9641 p = &(*p)->rb_left; 9642 } else if (sp->ptr > entry->ptr || 9643 (sp->ptr == entry->ptr && sp->inode > entry->inode)) { 9644 p = &(*p)->rb_right; 9645 } else { 9646 if (is_block_group) 9647 entry->bg_extent_count++; 9648 spin_unlock(&fs_info->swapfile_pins_lock); 9649 kfree(sp); 9650 return 1; 9651 } 9652 } 9653 rb_link_node(&sp->node, parent, p); 9654 rb_insert_color(&sp->node, &fs_info->swapfile_pins); 9655 spin_unlock(&fs_info->swapfile_pins_lock); 9656 return 0; 9657 } 9658 9659 /* Free all of the entries pinned by this swapfile. */ 9660 static void btrfs_free_swapfile_pins(struct inode *inode) 9661 { 9662 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 9663 struct btrfs_swapfile_pin *sp; 9664 struct rb_node *node, *next; 9665 9666 spin_lock(&fs_info->swapfile_pins_lock); 9667 node = rb_first(&fs_info->swapfile_pins); 9668 while (node) { 9669 next = rb_next(node); 9670 sp = rb_entry(node, struct btrfs_swapfile_pin, node); 9671 if (sp->inode == inode) { 9672 rb_erase(&sp->node, &fs_info->swapfile_pins); 9673 if (sp->is_block_group) { 9674 btrfs_dec_block_group_swap_extents(sp->ptr, 9675 sp->bg_extent_count); 9676 btrfs_put_block_group(sp->ptr); 9677 } 9678 kfree(sp); 9679 } 9680 node = next; 9681 } 9682 spin_unlock(&fs_info->swapfile_pins_lock); 9683 } 9684 9685 struct btrfs_swap_info { 9686 u64 start; 9687 u64 block_start; 9688 u64 block_len; 9689 u64 lowest_ppage; 9690 u64 highest_ppage; 9691 unsigned long nr_pages; 9692 int nr_extents; 9693 }; 9694 9695 static int btrfs_add_swap_extent(struct swap_info_struct *sis, 9696 struct btrfs_swap_info *bsi) 9697 { 9698 unsigned long nr_pages; 9699 unsigned long max_pages; 9700 u64 first_ppage, first_ppage_reported, next_ppage; 9701 int ret; 9702 9703 /* 9704 * Our swapfile may have had its size extended after the swap header was 9705 * written. In that case activating the swapfile should not go beyond 9706 * the max size set in the swap header. 
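* The number of pages added below is clamped to what is left of sis->max, and for the extent at file offset 0 the first physical page (the one holding the swap header) is excluded when recording the lowest page of the span.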
9707 */ 9708 if (bsi->nr_pages >= sis->max) 9709 return 0; 9710 9711 max_pages = sis->max - bsi->nr_pages; 9712 first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; 9713 next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; 9714 9715 if (first_ppage >= next_ppage) 9716 return 0; 9717 nr_pages = next_ppage - first_ppage; 9718 nr_pages = min(nr_pages, max_pages); 9719 9720 first_ppage_reported = first_ppage; 9721 if (bsi->start == 0) 9722 first_ppage_reported++; 9723 if (bsi->lowest_ppage > first_ppage_reported) 9724 bsi->lowest_ppage = first_ppage_reported; 9725 if (bsi->highest_ppage < (next_ppage - 1)) 9726 bsi->highest_ppage = next_ppage - 1; 9727 9728 ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); 9729 if (ret < 0) 9730 return ret; 9731 bsi->nr_extents += ret; 9732 bsi->nr_pages += nr_pages; 9733 return 0; 9734 } 9735 9736 static void btrfs_swap_deactivate(struct file *file) 9737 { 9738 struct inode *inode = file_inode(file); 9739 9740 btrfs_free_swapfile_pins(inode); 9741 atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); 9742 } 9743 9744 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, 9745 sector_t *span) 9746 { 9747 struct inode *inode = file_inode(file); 9748 struct btrfs_root *root = BTRFS_I(inode)->root; 9749 struct btrfs_fs_info *fs_info = root->fs_info; 9750 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 9751 struct extent_state *cached_state = NULL; 9752 struct extent_map *em = NULL; 9753 struct btrfs_chunk_map *map = NULL; 9754 struct btrfs_device *device = NULL; 9755 struct btrfs_swap_info bsi = { 9756 .lowest_ppage = (sector_t)-1ULL, 9757 }; 9758 int ret = 0; 9759 u64 isize; 9760 u64 start; 9761 9762 /* 9763 * If the swap file was just created, make sure delalloc is done. If the 9764 * file changes again after this, the user is doing something stupid and 9765 * we don't really care. 9766 */ 9767 ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); 9768 if (ret) 9769 return ret; 9770 9771 /* 9772 * The inode is locked, so these flags won't change after we check them. 9773 */ 9774 if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { 9775 btrfs_warn(fs_info, "swapfile must not be compressed"); 9776 return -EINVAL; 9777 } 9778 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { 9779 btrfs_warn(fs_info, "swapfile must not be copy-on-write"); 9780 return -EINVAL; 9781 } 9782 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 9783 btrfs_warn(fs_info, "swapfile must not be checksummed"); 9784 return -EINVAL; 9785 } 9786 9787 /* 9788 * Balance or device remove/replace/resize can move stuff around from 9789 * under us. The exclop protection makes sure they aren't running/won't 9790 * run concurrently while we are mapping the swap extents, and 9791 * fs_info->swapfile_pins prevents them from running while the swap 9792 * file is active and moving the extents. Note that this also prevents 9793 * a concurrent device add which isn't actually necessary, but it's not 9794 * really worth the trouble to allow it. 9795 */ 9796 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { 9797 btrfs_warn(fs_info, 9798 "cannot activate swapfile while exclusive operation is running"); 9799 return -EBUSY; 9800 } 9801 9802 /* 9803 * Prevent snapshot creation while we are activating the swap file. 9804 * We do not want to race with snapshot creation. 
If snapshot creation 9805 * already started before we bumped nr_swapfiles from 0 to 1 and 9806 * completes before the first write into the swap file after it is 9807 * activated, then that write would fall back to COW. 9808 */ 9809 if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) { 9810 btrfs_exclop_finish(fs_info); 9811 btrfs_warn(fs_info, 9812 "cannot activate swapfile because snapshot creation is in progress"); 9813 return -EINVAL; 9814 } 9815 /* 9816 * Snapshots can create extents which require COW even if NODATACOW is 9817 * set. We use this counter to prevent snapshots. We must increment it 9818 * before walking the extents because we don't want a concurrent 9819 * snapshot to run after we've already checked the extents. 9820 * 9821 * It is possible that the subvolume is marked for deletion but not yet 9822 * removed. To prevent this race, we check the root status before 9823 * activating the swapfile. 9824 */ 9825 spin_lock(&root->root_item_lock); 9826 if (btrfs_root_dead(root)) { 9827 spin_unlock(&root->root_item_lock); 9828 9829 btrfs_exclop_finish(fs_info); 9830 btrfs_warn(fs_info, 9831 "cannot activate swapfile because subvolume %llu is being deleted", 9832 btrfs_root_id(root)); 9833 return -EPERM; 9834 } 9835 atomic_inc(&root->nr_swapfiles); 9836 spin_unlock(&root->root_item_lock); 9837 9838 isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); 9839 9840 lock_extent(io_tree, 0, isize - 1, &cached_state); 9841 start = 0; 9842 while (start < isize) { 9843 u64 logical_block_start, physical_block_start; 9844 struct btrfs_block_group *bg; 9845 u64 len = isize - start; 9846 9847 em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); 9848 if (IS_ERR(em)) { 9849 ret = PTR_ERR(em); 9850 goto out; 9851 } 9852 9853 if (em->disk_bytenr == EXTENT_MAP_HOLE) { 9854 btrfs_warn(fs_info, "swapfile must not have holes"); 9855 ret = -EINVAL; 9856 goto out; 9857 } 9858 if (em->disk_bytenr == EXTENT_MAP_INLINE) { 9859 /* 9860 * It's unlikely we'll ever actually find ourselves 9861 * here, as a file small enough to fit inline won't be 9862 * big enough to store more than the swap header, but in 9863 * case something changes in the future, let's catch it 9864 * here rather than later.
9865 */ 9866 btrfs_warn(fs_info, "swapfile must not be inline"); 9867 ret = -EINVAL; 9868 goto out; 9869 } 9870 if (extent_map_is_compressed(em)) { 9871 btrfs_warn(fs_info, "swapfile must not be compressed"); 9872 ret = -EINVAL; 9873 goto out; 9874 } 9875 9876 logical_block_start = extent_map_block_start(em) + (start - em->start); 9877 len = min(len, em->len - (start - em->start)); 9878 free_extent_map(em); 9879 em = NULL; 9880 9881 ret = can_nocow_extent(inode, start, &len, NULL, false, true); 9882 if (ret < 0) { 9883 goto out; 9884 } else if (ret) { 9885 ret = 0; 9886 } else { 9887 btrfs_warn(fs_info, 9888 "swapfile must not be copy-on-write"); 9889 ret = -EINVAL; 9890 goto out; 9891 } 9892 9893 map = btrfs_get_chunk_map(fs_info, logical_block_start, len); 9894 if (IS_ERR(map)) { 9895 ret = PTR_ERR(map); 9896 goto out; 9897 } 9898 9899 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 9900 btrfs_warn(fs_info, 9901 "swapfile must have single data profile"); 9902 ret = -EINVAL; 9903 goto out; 9904 } 9905 9906 if (device == NULL) { 9907 device = map->stripes[0].dev; 9908 ret = btrfs_add_swapfile_pin(inode, device, false); 9909 if (ret == 1) 9910 ret = 0; 9911 else if (ret) 9912 goto out; 9913 } else if (device != map->stripes[0].dev) { 9914 btrfs_warn(fs_info, "swapfile must be on one device"); 9915 ret = -EINVAL; 9916 goto out; 9917 } 9918 9919 physical_block_start = (map->stripes[0].physical + 9920 (logical_block_start - map->start)); 9921 len = min(len, map->chunk_len - (logical_block_start - map->start)); 9922 btrfs_free_chunk_map(map); 9923 map = NULL; 9924 9925 bg = btrfs_lookup_block_group(fs_info, logical_block_start); 9926 if (!bg) { 9927 btrfs_warn(fs_info, 9928 "could not find block group containing swapfile"); 9929 ret = -EINVAL; 9930 goto out; 9931 } 9932 9933 if (!btrfs_inc_block_group_swap_extents(bg)) { 9934 btrfs_warn(fs_info, 9935 "block group for swapfile at %llu is read-only%s", 9936 bg->start, 9937 atomic_read(&fs_info->scrubs_running) ? 
9938 " (scrub running)" : ""); 9939 btrfs_put_block_group(bg); 9940 ret = -EINVAL; 9941 goto out; 9942 } 9943 9944 ret = btrfs_add_swapfile_pin(inode, bg, true); 9945 if (ret) { 9946 btrfs_put_block_group(bg); 9947 if (ret == 1) 9948 ret = 0; 9949 else 9950 goto out; 9951 } 9952 9953 if (bsi.block_len && 9954 bsi.block_start + bsi.block_len == physical_block_start) { 9955 bsi.block_len += len; 9956 } else { 9957 if (bsi.block_len) { 9958 ret = btrfs_add_swap_extent(sis, &bsi); 9959 if (ret) 9960 goto out; 9961 } 9962 bsi.start = start; 9963 bsi.block_start = physical_block_start; 9964 bsi.block_len = len; 9965 } 9966 9967 start += len; 9968 } 9969 9970 if (bsi.block_len) 9971 ret = btrfs_add_swap_extent(sis, &bsi); 9972 9973 out: 9974 if (!IS_ERR_OR_NULL(em)) 9975 free_extent_map(em); 9976 if (!IS_ERR_OR_NULL(map)) 9977 btrfs_free_chunk_map(map); 9978 9979 unlock_extent(io_tree, 0, isize - 1, &cached_state); 9980 9981 if (ret) 9982 btrfs_swap_deactivate(file); 9983 9984 btrfs_drew_write_unlock(&root->snapshot_lock); 9985 9986 btrfs_exclop_finish(fs_info); 9987 9988 if (ret) 9989 return ret; 9990 9991 if (device) 9992 sis->bdev = device->bdev; 9993 *span = bsi.highest_ppage - bsi.lowest_ppage + 1; 9994 sis->max = bsi.nr_pages; 9995 sis->pages = bsi.nr_pages - 1; 9996 sis->highest_bit = bsi.nr_pages - 1; 9997 return bsi.nr_extents; 9998 } 9999 #else 10000 static void btrfs_swap_deactivate(struct file *file) 10001 { 10002 } 10003 10004 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, 10005 sector_t *span) 10006 { 10007 return -EOPNOTSUPP; 10008 } 10009 #endif 10010 10011 /* 10012 * Update the number of bytes used in the VFS' inode. When we replace extents in 10013 * a range (clone, dedupe, fallocate's zero range), we must update the number of 10014 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls 10015 * always get a correct value. 10016 */ 10017 void btrfs_update_inode_bytes(struct btrfs_inode *inode, 10018 const u64 add_bytes, 10019 const u64 del_bytes) 10020 { 10021 if (add_bytes == del_bytes) 10022 return; 10023 10024 spin_lock(&inode->lock); 10025 if (del_bytes > 0) 10026 inode_sub_bytes(&inode->vfs_inode, del_bytes); 10027 if (add_bytes > 0) 10028 inode_add_bytes(&inode->vfs_inode, add_bytes); 10029 spin_unlock(&inode->lock); 10030 } 10031 10032 /* 10033 * Verify that there are no ordered extents for a given file range. 10034 * 10035 * @inode: The target inode. 10036 * @start: Start offset of the file range, should be sector size aligned. 10037 * @end: End offset (inclusive) of the file range, its value +1 should be 10038 * sector size aligned. 10039 * 10040 * This should typically be used for cases where we locked an inode's VFS lock in 10041 * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode, 10042 * we have flushed all delalloc in the range, we have waited for all ordered 10043 * extents in the range to complete and finally we have locked the file range in 10044 * the inode's io_tree. 
10045 */ 10046 void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end) 10047 { 10048 struct btrfs_root *root = inode->root; 10049 struct btrfs_ordered_extent *ordered; 10050 10051 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) 10052 return; 10053 10054 ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start); 10055 if (ordered) { 10056 btrfs_err(root->fs_info, 10057 "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", 10058 start, end, btrfs_ino(inode), btrfs_root_id(root), 10059 ordered->file_offset, 10060 ordered->file_offset + ordered->num_bytes - 1); 10061 btrfs_put_ordered_extent(ordered); 10062 } 10063 10064 ASSERT(ordered == NULL); 10065 } 10066 10067 /* 10068 * Find the first inode with a minimum number. 10069 * 10070 * @root: The root to search for. 10071 * @min_ino: The minimum inode number. 10072 * 10073 * Find the first inode in the @root with a number >= @min_ino and return it. 10074 * Returns NULL if no such inode found. 10075 */ 10076 struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino) 10077 { 10078 struct btrfs_inode *inode; 10079 unsigned long from = min_ino; 10080 10081 xa_lock(&root->inodes); 10082 while (true) { 10083 inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); 10084 if (!inode) 10085 break; 10086 if (igrab(&inode->vfs_inode)) 10087 break; 10088 10089 from = btrfs_ino(inode) + 1; 10090 cond_resched_lock(&root->inodes.xa_lock); 10091 } 10092 xa_unlock(&root->inodes); 10093 10094 return inode; 10095 } 10096 10097 static const struct inode_operations btrfs_dir_inode_operations = { 10098 .getattr = btrfs_getattr, 10099 .lookup = btrfs_lookup, 10100 .create = btrfs_create, 10101 .unlink = btrfs_unlink, 10102 .link = btrfs_link, 10103 .mkdir = btrfs_mkdir, 10104 .rmdir = btrfs_rmdir, 10105 .rename = btrfs_rename2, 10106 .symlink = btrfs_symlink, 10107 .setattr = btrfs_setattr, 10108 .mknod = btrfs_mknod, 10109 .listxattr = btrfs_listxattr, 10110 .permission = btrfs_permission, 10111 .get_inode_acl = btrfs_get_acl, 10112 .set_acl = btrfs_set_acl, 10113 .update_time = btrfs_update_time, 10114 .tmpfile = btrfs_tmpfile, 10115 .fileattr_get = btrfs_fileattr_get, 10116 .fileattr_set = btrfs_fileattr_set, 10117 }; 10118 10119 static const struct file_operations btrfs_dir_file_operations = { 10120 .llseek = btrfs_dir_llseek, 10121 .read = generic_read_dir, 10122 .iterate_shared = btrfs_real_readdir, 10123 .open = btrfs_opendir, 10124 .unlocked_ioctl = btrfs_ioctl, 10125 #ifdef CONFIG_COMPAT 10126 .compat_ioctl = btrfs_compat_ioctl, 10127 #endif 10128 .release = btrfs_release_file, 10129 .fsync = btrfs_sync_file, 10130 }; 10131 10132 /* 10133 * btrfs doesn't support the bmap operation because swapfiles 10134 * use bmap to make a mapping of extents in the file. They assume 10135 * these extents won't change over the life of the file and they 10136 * use the bmap result to do IO directly to the drive. 10137 * 10138 * the btrfs bmap call would return logical addresses that aren't 10139 * suitable for IO and they also will change frequently as COW 10140 * operations happen. So, swapfile + btrfs == corruption. 10141 * 10142 * For now we're avoiding this by dropping bmap. 
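* Swap files are instead supported via the swap_activate/swap_deactivate callbacks below, which map the swap extents up front and pin the underlying block groups and device while the swapfile is active.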
10143 */ 10144 static const struct address_space_operations btrfs_aops = { 10145 .read_folio = btrfs_read_folio, 10146 .writepages = btrfs_writepages, 10147 .readahead = btrfs_readahead, 10148 .invalidate_folio = btrfs_invalidate_folio, 10149 .launder_folio = btrfs_launder_folio, 10150 .release_folio = btrfs_release_folio, 10151 .migrate_folio = btrfs_migrate_folio, 10152 .dirty_folio = filemap_dirty_folio, 10153 .error_remove_folio = generic_error_remove_folio, 10154 .swap_activate = btrfs_swap_activate, 10155 .swap_deactivate = btrfs_swap_deactivate, 10156 }; 10157 10158 static const struct inode_operations btrfs_file_inode_operations = { 10159 .getattr = btrfs_getattr, 10160 .setattr = btrfs_setattr, 10161 .listxattr = btrfs_listxattr, 10162 .permission = btrfs_permission, 10163 .fiemap = btrfs_fiemap, 10164 .get_inode_acl = btrfs_get_acl, 10165 .set_acl = btrfs_set_acl, 10166 .update_time = btrfs_update_time, 10167 .fileattr_get = btrfs_fileattr_get, 10168 .fileattr_set = btrfs_fileattr_set, 10169 }; 10170 static const struct inode_operations btrfs_special_inode_operations = { 10171 .getattr = btrfs_getattr, 10172 .setattr = btrfs_setattr, 10173 .permission = btrfs_permission, 10174 .listxattr = btrfs_listxattr, 10175 .get_inode_acl = btrfs_get_acl, 10176 .set_acl = btrfs_set_acl, 10177 .update_time = btrfs_update_time, 10178 }; 10179 static const struct inode_operations btrfs_symlink_inode_operations = { 10180 .get_link = page_get_link, 10181 .getattr = btrfs_getattr, 10182 .setattr = btrfs_setattr, 10183 .permission = btrfs_permission, 10184 .listxattr = btrfs_listxattr, 10185 .update_time = btrfs_update_time, 10186 }; 10187 10188 const struct dentry_operations btrfs_dentry_operations = { 10189 .d_delete = btrfs_dentry_delete, 10190 }; 10191