1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <crypto/hash.h> 7 #include <linux/kernel.h> 8 #include <linux/bio.h> 9 #include <linux/blk-cgroup.h> 10 #include <linux/file.h> 11 #include <linux/fs.h> 12 #include <linux/fs_struct.h> 13 #include <linux/pagemap.h> 14 #include <linux/highmem.h> 15 #include <linux/time.h> 16 #include <linux/init.h> 17 #include <linux/string.h> 18 #include <linux/backing-dev.h> 19 #include <linux/writeback.h> 20 #include <linux/compat.h> 21 #include <linux/xattr.h> 22 #include <linux/posix_acl.h> 23 #include <linux/falloc.h> 24 #include <linux/slab.h> 25 #include <linux/ratelimit.h> 26 #include <linux/btrfs.h> 27 #include <linux/blkdev.h> 28 #include <linux/posix_acl_xattr.h> 29 #include <linux/uio.h> 30 #include <linux/magic.h> 31 #include <linux/iversion.h> 32 #include <linux/swap.h> 33 #include <linux/migrate.h> 34 #include <linux/sched/mm.h> 35 #include <linux/iomap.h> 36 #include <linux/unaligned.h> 37 #include <linux/fsverity.h> 38 #include "misc.h" 39 #include "ctree.h" 40 #include "disk-io.h" 41 #include "transaction.h" 42 #include "btrfs_inode.h" 43 #include "ordered-data.h" 44 #include "xattr.h" 45 #include "tree-log.h" 46 #include "bio.h" 47 #include "compression.h" 48 #include "locking.h" 49 #include "props.h" 50 #include "qgroup.h" 51 #include "delalloc-space.h" 52 #include "block-group.h" 53 #include "space-info.h" 54 #include "zoned.h" 55 #include "subpage.h" 56 #include "inode-item.h" 57 #include "fs.h" 58 #include "accessors.h" 59 #include "extent-tree.h" 60 #include "root-tree.h" 61 #include "defrag.h" 62 #include "dir-item.h" 63 #include "file-item.h" 64 #include "uuid-tree.h" 65 #include "ioctl.h" 66 #include "file.h" 67 #include "acl.h" 68 #include "relocation.h" 69 #include "verity.h" 70 #include "super.h" 71 #include "orphan.h" 72 #include "backref.h" 73 #include "raid-stripe-tree.h" 74 #include "fiemap.h" 75 76 #define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0) 77 #define COW_FILE_RANGE_NO_INLINE (1UL << 1) 78 79 struct btrfs_iget_args { 80 u64 ino; 81 struct btrfs_root *root; 82 }; 83 84 struct btrfs_rename_ctx { 85 /* Output field. Stores the index number of the old directory entry. */ 86 u64 index; 87 }; 88 89 /* 90 * Used by data_reloc_print_warning_inode() to pass needed info for filename 91 * resolution and output of error message. 92 */ 93 struct data_reloc_warn { 94 struct btrfs_path path; 95 struct btrfs_fs_info *fs_info; 96 u64 extent_item_size; 97 u64 logical; 98 int mirror_num; 99 }; 100 101 /* 102 * For the file_extent_tree, we want to hold the inode lock when we lookup and 103 * update the disk_i_size, but lockdep will complain because our io_tree we hold 104 * the tree lock and get the inode lock when setting delalloc. These two things 105 * are unrelated, so make a class for the file_extent_tree so we don't get the 106 * two locking patterns mixed up. 
107 */ 108 static struct lock_class_key file_extent_tree_class; 109 110 static const struct inode_operations btrfs_dir_inode_operations; 111 static const struct inode_operations btrfs_symlink_inode_operations; 112 static const struct inode_operations btrfs_special_inode_operations; 113 static const struct inode_operations btrfs_file_inode_operations; 114 static const struct address_space_operations btrfs_aops; 115 static const struct file_operations btrfs_dir_file_operations; 116 117 static struct kmem_cache *btrfs_inode_cachep; 118 119 static int btrfs_setsize(struct inode *inode, struct iattr *attr); 120 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); 121 122 static noinline int run_delalloc_cow(struct btrfs_inode *inode, 123 struct folio *locked_folio, u64 start, 124 u64 end, struct writeback_control *wbc, 125 bool pages_dirty); 126 127 static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, 128 u64 root, void *warn_ctx) 129 { 130 struct data_reloc_warn *warn = warn_ctx; 131 struct btrfs_fs_info *fs_info = warn->fs_info; 132 struct extent_buffer *eb; 133 struct btrfs_inode_item *inode_item; 134 struct inode_fs_paths *ipath = NULL; 135 struct btrfs_root *local_root; 136 struct btrfs_key key; 137 unsigned int nofs_flag; 138 u32 nlink; 139 int ret; 140 141 local_root = btrfs_get_fs_root(fs_info, root, true); 142 if (IS_ERR(local_root)) { 143 ret = PTR_ERR(local_root); 144 goto err; 145 } 146 147 /* This makes the path point to (inum INODE_ITEM ioff). */ 148 key.objectid = inum; 149 key.type = BTRFS_INODE_ITEM_KEY; 150 key.offset = 0; 151 152 ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0); 153 if (ret) { 154 btrfs_put_root(local_root); 155 btrfs_release_path(&warn->path); 156 goto err; 157 } 158 159 eb = warn->path.nodes[0]; 160 inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item); 161 nlink = btrfs_inode_nlink(eb, inode_item); 162 btrfs_release_path(&warn->path); 163 164 nofs_flag = memalloc_nofs_save(); 165 ipath = init_ipath(4096, local_root, &warn->path); 166 memalloc_nofs_restore(nofs_flag); 167 if (IS_ERR(ipath)) { 168 btrfs_put_root(local_root); 169 ret = PTR_ERR(ipath); 170 ipath = NULL; 171 /* 172 * -ENOMEM, not a critical error, just output an generic error 173 * without filename. 174 */ 175 btrfs_warn(fs_info, 176 "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu", 177 warn->logical, warn->mirror_num, root, inum, offset); 178 return ret; 179 } 180 ret = paths_from_inode(inum, ipath); 181 if (ret < 0) { 182 btrfs_put_root(local_root); 183 goto err; 184 } 185 186 /* 187 * We deliberately ignore the bit ipath might have been too small to 188 * hold all of the paths here 189 */ 190 for (int i = 0; i < ipath->fspath->elem_cnt; i++) { 191 btrfs_warn(fs_info, 192 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)", 193 warn->logical, warn->mirror_num, root, inum, offset, 194 fs_info->sectorsize, nlink, 195 (char *)(unsigned long)ipath->fspath->val[i]); 196 } 197 198 btrfs_put_root(local_root); 199 free_ipath(ipath); 200 return 0; 201 202 err: 203 btrfs_warn(fs_info, 204 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d", 205 warn->logical, warn->mirror_num, root, inum, offset, ret); 206 207 free_ipath(ipath); 208 return ret; 209 } 210 211 /* 212 * Do extra user-friendly error output (e.g. lookup all the affected files). 
213 * 214 * Return true if we succeeded doing the backref lookup. 215 * Return false if such lookup failed, and has to fallback to the old error message. 216 */ 217 static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off, 218 const u8 *csum, const u8 *csum_expected, 219 int mirror_num) 220 { 221 struct btrfs_fs_info *fs_info = inode->root->fs_info; 222 struct btrfs_path path = { 0 }; 223 struct btrfs_key found_key = { 0 }; 224 struct extent_buffer *eb; 225 struct btrfs_extent_item *ei; 226 const u32 csum_size = fs_info->csum_size; 227 u64 logical; 228 u64 flags; 229 u32 item_size; 230 int ret; 231 232 mutex_lock(&fs_info->reloc_mutex); 233 logical = btrfs_get_reloc_bg_bytenr(fs_info); 234 mutex_unlock(&fs_info->reloc_mutex); 235 236 if (logical == U64_MAX) { 237 btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); 238 btrfs_warn_rl(fs_info, 239 "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", 240 btrfs_root_id(inode->root), btrfs_ino(inode), file_off, 241 CSUM_FMT_VALUE(csum_size, csum), 242 CSUM_FMT_VALUE(csum_size, csum_expected), 243 mirror_num); 244 return; 245 } 246 247 logical += file_off; 248 btrfs_warn_rl(fs_info, 249 "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", 250 btrfs_root_id(inode->root), 251 btrfs_ino(inode), file_off, logical, 252 CSUM_FMT_VALUE(csum_size, csum), 253 CSUM_FMT_VALUE(csum_size, csum_expected), 254 mirror_num); 255 256 ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); 257 if (ret < 0) { 258 btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d", 259 logical, ret); 260 return; 261 } 262 eb = path.nodes[0]; 263 ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item); 264 item_size = btrfs_item_size(eb, path.slots[0]); 265 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 266 unsigned long ptr = 0; 267 u64 ref_root; 268 u8 ref_level; 269 270 while (true) { 271 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, 272 item_size, &ref_root, 273 &ref_level); 274 if (ret < 0) { 275 btrfs_warn_rl(fs_info, 276 "failed to resolve tree backref for logical %llu: %d", 277 logical, ret); 278 break; 279 } 280 if (ret > 0) 281 break; 282 283 btrfs_warn_rl(fs_info, 284 "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu", 285 logical, mirror_num, 286 (ref_level ? "node" : "leaf"), 287 ref_level, ref_root); 288 } 289 btrfs_release_path(&path); 290 } else { 291 struct btrfs_backref_walk_ctx ctx = { 0 }; 292 struct data_reloc_warn reloc_warn = { 0 }; 293 294 btrfs_release_path(&path); 295 296 ctx.bytenr = found_key.objectid; 297 ctx.extent_item_pos = logical - found_key.objectid; 298 ctx.fs_info = fs_info; 299 300 reloc_warn.logical = logical; 301 reloc_warn.extent_item_size = found_key.offset; 302 reloc_warn.mirror_num = mirror_num; 303 reloc_warn.fs_info = fs_info; 304 305 iterate_extent_inodes(&ctx, true, 306 data_reloc_print_warning_inode, &reloc_warn); 307 } 308 } 309 310 static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, 311 u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) 312 { 313 struct btrfs_root *root = inode->root; 314 const u32 csum_size = root->fs_info->csum_size; 315 316 /* For data reloc tree, it's better to do a backref lookup instead. 
*/ 317 if (btrfs_is_data_reloc_root(root)) 318 return print_data_reloc_error(inode, logical_start, csum, 319 csum_expected, mirror_num); 320 321 /* Output without objectid, which is more meaningful */ 322 if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) { 323 btrfs_warn_rl(root->fs_info, 324 "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", 325 btrfs_root_id(root), btrfs_ino(inode), 326 logical_start, 327 CSUM_FMT_VALUE(csum_size, csum), 328 CSUM_FMT_VALUE(csum_size, csum_expected), 329 mirror_num); 330 } else { 331 btrfs_warn_rl(root->fs_info, 332 "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", 333 btrfs_root_id(root), btrfs_ino(inode), 334 logical_start, 335 CSUM_FMT_VALUE(csum_size, csum), 336 CSUM_FMT_VALUE(csum_size, csum_expected), 337 mirror_num); 338 } 339 } 340 341 /* 342 * Lock inode i_rwsem based on arguments passed. 343 * 344 * ilock_flags can have the following bit set: 345 * 346 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode 347 * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt 348 * return -EAGAIN 349 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock 350 */ 351 int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) 352 { 353 if (ilock_flags & BTRFS_ILOCK_SHARED) { 354 if (ilock_flags & BTRFS_ILOCK_TRY) { 355 if (!inode_trylock_shared(&inode->vfs_inode)) 356 return -EAGAIN; 357 else 358 return 0; 359 } 360 inode_lock_shared(&inode->vfs_inode); 361 } else { 362 if (ilock_flags & BTRFS_ILOCK_TRY) { 363 if (!inode_trylock(&inode->vfs_inode)) 364 return -EAGAIN; 365 else 366 return 0; 367 } 368 inode_lock(&inode->vfs_inode); 369 } 370 if (ilock_flags & BTRFS_ILOCK_MMAP) 371 down_write(&inode->i_mmap_lock); 372 return 0; 373 } 374 375 /* 376 * Unlock inode i_rwsem. 377 * 378 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() 379 * to decide whether the lock acquired is shared or exclusive. 380 */ 381 void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) 382 { 383 if (ilock_flags & BTRFS_ILOCK_MMAP) 384 up_write(&inode->i_mmap_lock); 385 if (ilock_flags & BTRFS_ILOCK_SHARED) 386 inode_unlock_shared(&inode->vfs_inode); 387 else 388 inode_unlock(&inode->vfs_inode); 389 } 390 391 /* 392 * Cleanup all submitted ordered extents in specified range to handle errors 393 * from the btrfs_run_delalloc_range() callback. 394 * 395 * NOTE: caller must ensure that when an error happens, it can not call 396 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING 397 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata 398 * to be released, which we want to happen only when finishing the ordered 399 * extent (btrfs_finish_ordered_io()). 400 */ 401 static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, 402 u64 offset, u64 bytes) 403 { 404 pgoff_t index = offset >> PAGE_SHIFT; 405 const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT; 406 struct folio *folio; 407 408 while (index <= end_index) { 409 folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); 410 if (IS_ERR(folio)) { 411 index++; 412 continue; 413 } 414 415 index = folio_next_index(folio); 416 /* 417 * Here we just clear all Ordered bits for every page in the 418 * range, then btrfs_mark_ordered_io_finished() will handle 419 * the ordered extent accounting for the range. 
420 */ 421 btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio, 422 offset, bytes); 423 folio_put(folio); 424 } 425 426 return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); 427 } 428 429 static int btrfs_dirty_inode(struct btrfs_inode *inode); 430 431 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 432 struct btrfs_new_inode_args *args) 433 { 434 int ret; 435 436 if (args->default_acl) { 437 ret = __btrfs_set_acl(trans, args->inode, args->default_acl, 438 ACL_TYPE_DEFAULT); 439 if (ret) 440 return ret; 441 } 442 if (args->acl) { 443 ret = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); 444 if (ret) 445 return ret; 446 } 447 if (!args->default_acl && !args->acl) 448 cache_no_acl(args->inode); 449 return btrfs_xattr_security_init(trans, args->inode, args->dir, 450 &args->dentry->d_name); 451 } 452 453 /* 454 * this does all the hard work for inserting an inline extent into 455 * the btree. The caller should have done a btrfs_drop_extents so that 456 * no overlapping inline items exist in the btree 457 */ 458 static int insert_inline_extent(struct btrfs_trans_handle *trans, 459 struct btrfs_path *path, 460 struct btrfs_inode *inode, bool extent_inserted, 461 size_t size, size_t compressed_size, 462 int compress_type, 463 struct folio *compressed_folio, 464 bool update_i_size) 465 { 466 struct btrfs_root *root = inode->root; 467 struct extent_buffer *leaf; 468 const u32 sectorsize = trans->fs_info->sectorsize; 469 char *kaddr; 470 unsigned long ptr; 471 struct btrfs_file_extent_item *ei; 472 int ret; 473 size_t cur_size = size; 474 u64 i_size; 475 476 /* 477 * The decompressed size must still be no larger than a sector. Under 478 * heavy race, we can have size == 0 passed in, but that shouldn't be a 479 * big deal and we can continue the insertion. 480 */ 481 ASSERT(size <= sectorsize); 482 483 /* 484 * The compressed size also needs to be no larger than a sector. 485 * That's also why we only need one page as the parameter. 
486 */ 487 if (compressed_folio) 488 ASSERT(compressed_size <= sectorsize); 489 else 490 ASSERT(compressed_size == 0); 491 492 if (compressed_size && compressed_folio) 493 cur_size = compressed_size; 494 495 if (!extent_inserted) { 496 struct btrfs_key key; 497 size_t datasize; 498 499 key.objectid = btrfs_ino(inode); 500 key.type = BTRFS_EXTENT_DATA_KEY; 501 key.offset = 0; 502 503 datasize = btrfs_file_extent_calc_inline_size(cur_size); 504 ret = btrfs_insert_empty_item(trans, root, path, &key, 505 datasize); 506 if (ret) 507 goto fail; 508 } 509 leaf = path->nodes[0]; 510 ei = btrfs_item_ptr(leaf, path->slots[0], 511 struct btrfs_file_extent_item); 512 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 513 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); 514 btrfs_set_file_extent_encryption(leaf, ei, 0); 515 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 516 btrfs_set_file_extent_ram_bytes(leaf, ei, size); 517 ptr = btrfs_file_extent_inline_start(ei); 518 519 if (compress_type != BTRFS_COMPRESS_NONE) { 520 kaddr = kmap_local_folio(compressed_folio, 0); 521 write_extent_buffer(leaf, kaddr, ptr, compressed_size); 522 kunmap_local(kaddr); 523 524 btrfs_set_file_extent_compression(leaf, ei, 525 compress_type); 526 } else { 527 struct folio *folio; 528 529 folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0); 530 ASSERT(!IS_ERR(folio)); 531 btrfs_set_file_extent_compression(leaf, ei, 0); 532 kaddr = kmap_local_folio(folio, 0); 533 write_extent_buffer(leaf, kaddr, ptr, size); 534 kunmap_local(kaddr); 535 folio_put(folio); 536 } 537 btrfs_release_path(path); 538 539 /* 540 * We align size to sectorsize for inline extents just for simplicity 541 * sake. 542 */ 543 ret = btrfs_inode_set_file_extent_range(inode, 0, 544 ALIGN(size, root->fs_info->sectorsize)); 545 if (ret) 546 goto fail; 547 548 /* 549 * We're an inline extent, so nobody can extend the file past i_size 550 * without locking a page we already have locked. 551 * 552 * We must do any i_size and inode updates before we unlock the pages. 553 * Otherwise we could end up racing with unlink. 554 */ 555 i_size = i_size_read(&inode->vfs_inode); 556 if (update_i_size && size > i_size) { 557 i_size_write(&inode->vfs_inode, size); 558 i_size = size; 559 } 560 inode->disk_i_size = i_size; 561 562 fail: 563 return ret; 564 } 565 566 static bool can_cow_file_range_inline(struct btrfs_inode *inode, 567 u64 offset, u64 size, 568 size_t compressed_size) 569 { 570 struct btrfs_fs_info *fs_info = inode->root->fs_info; 571 u64 data_len = (compressed_size ?: size); 572 573 /* Inline extents must start at offset 0. */ 574 if (offset != 0) 575 return false; 576 577 /* Inline extents are limited to sectorsize. */ 578 if (size > fs_info->sectorsize) 579 return false; 580 581 /* We do not allow a non-compressed extent to be as large as block size. */ 582 if (data_len >= fs_info->sectorsize) 583 return false; 584 585 /* We cannot exceed the maximum inline data size. */ 586 if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) 587 return false; 588 589 /* We cannot exceed the user specified max_inline size. */ 590 if (data_len > fs_info->max_inline) 591 return false; 592 593 /* Inline extents must be the entirety of the file. */ 594 if (size < i_size_read(&inode->vfs_inode)) 595 return false; 596 597 return true; 598 } 599 600 /* 601 * conditionally insert an inline extent into the file. This 602 * does the checks required to make sure the data is small enough 603 * to fit as an inline extent. 
604 * 605 * If being used directly, you must have already checked we're allowed to cow 606 * the range by getting true from can_cow_file_range_inline(). 607 */ 608 static noinline int __cow_file_range_inline(struct btrfs_inode *inode, 609 u64 size, size_t compressed_size, 610 int compress_type, 611 struct folio *compressed_folio, 612 bool update_i_size) 613 { 614 struct btrfs_drop_extents_args drop_args = { 0 }; 615 struct btrfs_root *root = inode->root; 616 struct btrfs_fs_info *fs_info = root->fs_info; 617 struct btrfs_trans_handle *trans; 618 u64 data_len = (compressed_size ?: size); 619 int ret; 620 struct btrfs_path *path; 621 622 path = btrfs_alloc_path(); 623 if (!path) 624 return -ENOMEM; 625 626 trans = btrfs_join_transaction(root); 627 if (IS_ERR(trans)) { 628 btrfs_free_path(path); 629 return PTR_ERR(trans); 630 } 631 trans->block_rsv = &inode->block_rsv; 632 633 drop_args.path = path; 634 drop_args.start = 0; 635 drop_args.end = fs_info->sectorsize; 636 drop_args.drop_cache = true; 637 drop_args.replace_extent = true; 638 drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); 639 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 640 if (unlikely(ret)) { 641 btrfs_abort_transaction(trans, ret); 642 goto out; 643 } 644 645 ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, 646 size, compressed_size, compress_type, 647 compressed_folio, update_i_size); 648 if (unlikely(ret && ret != -ENOSPC)) { 649 btrfs_abort_transaction(trans, ret); 650 goto out; 651 } else if (ret == -ENOSPC) { 652 ret = 1; 653 goto out; 654 } 655 656 btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); 657 ret = btrfs_update_inode(trans, inode); 658 if (unlikely(ret && ret != -ENOSPC)) { 659 btrfs_abort_transaction(trans, ret); 660 goto out; 661 } else if (ret == -ENOSPC) { 662 ret = 1; 663 goto out; 664 } 665 666 btrfs_set_inode_full_sync(inode); 667 out: 668 /* 669 * Don't forget to free the reserved space, as for inlined extent 670 * it won't count as data extent, free them directly here. 671 * And at reserve time, it's always aligned to page size, so 672 * just free one page here. 673 */ 674 btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL); 675 btrfs_free_path(path); 676 btrfs_end_transaction(trans); 677 return ret; 678 } 679 680 static noinline int cow_file_range_inline(struct btrfs_inode *inode, 681 struct folio *locked_folio, 682 u64 offset, u64 end, 683 size_t compressed_size, 684 int compress_type, 685 struct folio *compressed_folio, 686 bool update_i_size) 687 { 688 struct extent_state *cached = NULL; 689 unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | 690 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED; 691 u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); 692 int ret; 693 694 if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) 695 return 1; 696 697 btrfs_lock_extent(&inode->io_tree, offset, end, &cached); 698 ret = __cow_file_range_inline(inode, size, compressed_size, 699 compress_type, compressed_folio, 700 update_i_size); 701 if (ret > 0) { 702 btrfs_unlock_extent(&inode->io_tree, offset, end, &cached); 703 return ret; 704 } 705 706 /* 707 * In the successful case (ret == 0 here), cow_file_range will return 1. 708 * 709 * Quite a bit further up the callstack in extent_writepage(), ret == 1 710 * is treated as a short circuited success and does not unlock the folio, 711 * so we must do it here. 
712 * 713 * In the failure case, the locked_folio does get unlocked by 714 * btrfs_folio_end_all_writers, which asserts that it is still locked 715 * at that point, so we must *not* unlock it here. 716 * 717 * The other two callsites in compress_file_range do not have a 718 * locked_folio, so they are not relevant to this logic. 719 */ 720 if (ret == 0) 721 locked_folio = NULL; 722 723 extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached, 724 clear_flags, PAGE_UNLOCK | 725 PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); 726 return ret; 727 } 728 729 struct async_extent { 730 u64 start; 731 u64 ram_size; 732 u64 compressed_size; 733 struct folio **folios; 734 unsigned long nr_folios; 735 int compress_type; 736 struct list_head list; 737 }; 738 739 struct async_chunk { 740 struct btrfs_inode *inode; 741 struct folio *locked_folio; 742 u64 start; 743 u64 end; 744 blk_opf_t write_flags; 745 struct list_head extents; 746 struct cgroup_subsys_state *blkcg_css; 747 struct btrfs_work work; 748 struct async_cow *async_cow; 749 }; 750 751 struct async_cow { 752 atomic_t num_chunks; 753 struct async_chunk chunks[]; 754 }; 755 756 static noinline int add_async_extent(struct async_chunk *cow, 757 u64 start, u64 ram_size, 758 u64 compressed_size, 759 struct folio **folios, 760 unsigned long nr_folios, 761 int compress_type) 762 { 763 struct async_extent *async_extent; 764 765 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); 766 if (!async_extent) 767 return -ENOMEM; 768 async_extent->start = start; 769 async_extent->ram_size = ram_size; 770 async_extent->compressed_size = compressed_size; 771 async_extent->folios = folios; 772 async_extent->nr_folios = nr_folios; 773 async_extent->compress_type = compress_type; 774 list_add_tail(&async_extent->list, &cow->extents); 775 return 0; 776 } 777 778 /* 779 * Check if the inode needs to be submitted to compression, based on mount 780 * options, defragmentation, properties or heuristics. 781 */ 782 static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, 783 u64 end) 784 { 785 struct btrfs_fs_info *fs_info = inode->root->fs_info; 786 787 if (!btrfs_inode_can_compress(inode)) { 788 DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode)); 789 return 0; 790 } 791 792 /* Defrag ioctl takes precedence over mount options and properties. 
*/ 793 if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS) 794 return 0; 795 if (BTRFS_COMPRESS_NONE < inode->defrag_compress && 796 inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) 797 return 1; 798 /* force compress */ 799 if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) 800 return 1; 801 /* bad compression ratios */ 802 if (inode->flags & BTRFS_INODE_NOCOMPRESS) 803 return 0; 804 if (btrfs_test_opt(fs_info, COMPRESS) || 805 inode->flags & BTRFS_INODE_COMPRESS || 806 inode->prop_compress) 807 return btrfs_compress_heuristic(inode, start, end); 808 return 0; 809 } 810 811 static inline void inode_should_defrag(struct btrfs_inode *inode, 812 u64 start, u64 end, u64 num_bytes, u32 small_write) 813 { 814 /* If this is a small write inside eof, kick off a defrag */ 815 if (num_bytes < small_write && 816 (start > 0 || end + 1 < inode->disk_i_size)) 817 btrfs_add_inode_defrag(inode, small_write); 818 } 819 820 static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end) 821 { 822 const pgoff_t end_index = end >> PAGE_SHIFT; 823 struct folio *folio; 824 int ret = 0; 825 826 for (pgoff_t index = start >> PAGE_SHIFT; index <= end_index; index++) { 827 folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); 828 if (IS_ERR(folio)) { 829 if (!ret) 830 ret = PTR_ERR(folio); 831 continue; 832 } 833 btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start, 834 end + 1 - start); 835 folio_put(folio); 836 } 837 return ret; 838 } 839 840 /* 841 * Work queue call back to started compression on a file and pages. 842 * 843 * This is done inside an ordered work queue, and the compression is spread 844 * across many cpus. The actual IO submission is step two, and the ordered work 845 * queue takes care of making sure that happens in the same order things were 846 * put onto the queue by writepages and friends. 847 * 848 * If this code finds it can't get good compression, it puts an entry onto the 849 * work queue to write the uncompressed bytes. This makes sure that both 850 * compressed inodes and uncompressed inodes are written in the same order that 851 * the flusher thread sent them down. 852 */ 853 static void compress_file_range(struct btrfs_work *work) 854 { 855 struct async_chunk *async_chunk = 856 container_of(work, struct async_chunk, work); 857 struct btrfs_inode *inode = async_chunk->inode; 858 struct btrfs_fs_info *fs_info = inode->root->fs_info; 859 struct address_space *mapping = inode->vfs_inode.i_mapping; 860 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 861 const u32 min_folio_size = btrfs_min_folio_size(fs_info); 862 u64 blocksize = fs_info->sectorsize; 863 u64 start = async_chunk->start; 864 u64 end = async_chunk->end; 865 u64 actual_end; 866 u64 i_size; 867 int ret = 0; 868 struct folio **folios; 869 unsigned long nr_folios; 870 unsigned long total_compressed = 0; 871 unsigned long total_in = 0; 872 unsigned int loff; 873 int i; 874 int compress_type = fs_info->compress_type; 875 int compress_level = fs_info->compress_level; 876 877 inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); 878 879 /* 880 * We need to call clear_page_dirty_for_io on each page in the range. 881 * Otherwise applications with the file mmap'd can wander in and change 882 * the page contents while we are compressing them. 883 */ 884 ret = extent_range_clear_dirty_for_io(inode, start, end); 885 886 /* 887 * All the folios should have been locked thus no failure. 
888 * 889 * And even if some folios are missing, btrfs_compress_folios() 890 * would handle them correctly, so here just do an ASSERT() check for 891 * early logic errors. 892 */ 893 ASSERT(ret == 0); 894 895 /* 896 * We need to save i_size before now because it could change in between 897 * us evaluating the size and assigning it. This is because we lock and 898 * unlock the page in truncate and fallocate, and then modify the i_size 899 * later on. 900 * 901 * The barriers are to emulate READ_ONCE, remove that once i_size_read 902 * does that for us. 903 */ 904 barrier(); 905 i_size = i_size_read(&inode->vfs_inode); 906 barrier(); 907 actual_end = min_t(u64, i_size, end + 1); 908 again: 909 folios = NULL; 910 nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1; 911 nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift); 912 913 /* 914 * we don't want to send crud past the end of i_size through 915 * compression, that's just a waste of CPU time. So, if the 916 * end of the file is before the start of our current 917 * requested range of bytes, we bail out to the uncompressed 918 * cleanup code that can deal with all of this. 919 * 920 * It isn't really the fastest way to fix things, but this is a 921 * very uncommon corner. 922 */ 923 if (actual_end <= start) 924 goto cleanup_and_bail_uncompressed; 925 926 total_compressed = actual_end - start; 927 928 /* 929 * Skip compression for a small file range(<=blocksize) that 930 * isn't an inline extent, since it doesn't save disk space at all. 931 */ 932 if (total_compressed <= blocksize && 933 (start > 0 || end + 1 < inode->disk_i_size)) 934 goto cleanup_and_bail_uncompressed; 935 936 total_compressed = min_t(unsigned long, total_compressed, 937 BTRFS_MAX_UNCOMPRESSED); 938 total_in = 0; 939 ret = 0; 940 941 /* 942 * We do compression for mount -o compress and when the inode has not 943 * been flagged as NOCOMPRESS. This flag can change at any time if we 944 * discover bad compression ratios. 945 */ 946 if (!inode_need_compress(inode, start, end)) 947 goto cleanup_and_bail_uncompressed; 948 949 folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); 950 if (!folios) { 951 /* 952 * Memory allocation failure is not a fatal error, we can fall 953 * back to uncompressed code. 954 */ 955 goto cleanup_and_bail_uncompressed; 956 } 957 958 if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) { 959 compress_type = inode->defrag_compress; 960 compress_level = inode->defrag_compress_level; 961 } else if (inode->prop_compress) { 962 compress_type = inode->prop_compress; 963 } 964 965 /* Compression level is applied here. */ 966 ret = btrfs_compress_folios(compress_type, compress_level, 967 inode, start, folios, &nr_folios, &total_in, 968 &total_compressed); 969 if (ret) 970 goto mark_incompressible; 971 972 /* 973 * Zero the tail end of the last folio, as we might be sending it down 974 * to disk. 975 */ 976 loff = (total_compressed & (min_folio_size - 1)); 977 if (loff) 978 folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff); 979 980 /* 981 * Try to create an inline extent. 982 * 983 * If we didn't compress the entire range, try to create an uncompressed 984 * inline extent, else a compressed one. 985 * 986 * Check cow_file_range() for why we don't even try to create inline 987 * extent for the subpage case. 
988 */ 989 if (total_in < actual_end) 990 ret = cow_file_range_inline(inode, NULL, start, end, 0, 991 BTRFS_COMPRESS_NONE, NULL, false); 992 else 993 ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, 994 compress_type, folios[0], false); 995 if (ret <= 0) { 996 if (ret < 0) 997 mapping_set_error(mapping, -EIO); 998 goto free_pages; 999 } 1000 1001 /* 1002 * We aren't doing an inline extent. Round the compressed size up to a 1003 * block size boundary so the allocator does sane things. 1004 */ 1005 total_compressed = ALIGN(total_compressed, blocksize); 1006 1007 /* 1008 * One last check to make sure the compression is really a win, compare 1009 * the page count read with the blocks on disk, compression must free at 1010 * least one sector. 1011 */ 1012 total_in = round_up(total_in, fs_info->sectorsize); 1013 if (total_compressed + blocksize > total_in) 1014 goto mark_incompressible; 1015 1016 /* 1017 * The async work queues will take care of doing actual allocation on 1018 * disk for these compressed pages, and will submit the bios. 1019 */ 1020 ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios, 1021 nr_folios, compress_type); 1022 BUG_ON(ret); 1023 if (start + total_in < end) { 1024 start += total_in; 1025 cond_resched(); 1026 goto again; 1027 } 1028 return; 1029 1030 mark_incompressible: 1031 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) 1032 inode->flags |= BTRFS_INODE_NOCOMPRESS; 1033 cleanup_and_bail_uncompressed: 1034 ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, 1035 BTRFS_COMPRESS_NONE); 1036 BUG_ON(ret); 1037 free_pages: 1038 if (folios) { 1039 for (i = 0; i < nr_folios; i++) { 1040 WARN_ON(folios[i]->mapping); 1041 btrfs_free_compr_folio(folios[i]); 1042 } 1043 kfree(folios); 1044 } 1045 } 1046 1047 static void free_async_extent_pages(struct async_extent *async_extent) 1048 { 1049 int i; 1050 1051 if (!async_extent->folios) 1052 return; 1053 1054 for (i = 0; i < async_extent->nr_folios; i++) { 1055 WARN_ON(async_extent->folios[i]->mapping); 1056 btrfs_free_compr_folio(async_extent->folios[i]); 1057 } 1058 kfree(async_extent->folios); 1059 async_extent->nr_folios = 0; 1060 async_extent->folios = NULL; 1061 } 1062 1063 static void submit_uncompressed_range(struct btrfs_inode *inode, 1064 struct async_extent *async_extent, 1065 struct folio *locked_folio) 1066 { 1067 u64 start = async_extent->start; 1068 u64 end = async_extent->start + async_extent->ram_size - 1; 1069 int ret; 1070 struct writeback_control wbc = { 1071 .sync_mode = WB_SYNC_ALL, 1072 .range_start = start, 1073 .range_end = end, 1074 .no_cgroup_owner = 1, 1075 }; 1076 1077 wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); 1078 ret = run_delalloc_cow(inode, locked_folio, start, end, 1079 &wbc, false); 1080 wbc_detach_inode(&wbc); 1081 if (ret < 0) { 1082 if (locked_folio) 1083 btrfs_folio_end_lock(inode->root->fs_info, locked_folio, 1084 start, async_extent->ram_size); 1085 btrfs_err_rl(inode->root->fs_info, 1086 "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", 1087 __func__, btrfs_root_id(inode->root), 1088 btrfs_ino(inode), start, async_extent->ram_size, ret); 1089 } 1090 } 1091 1092 static void submit_one_async_extent(struct async_chunk *async_chunk, 1093 struct async_extent *async_extent, 1094 u64 *alloc_hint) 1095 { 1096 struct btrfs_inode *inode = async_chunk->inode; 1097 struct extent_io_tree *io_tree = &inode->io_tree; 1098 struct btrfs_root *root = inode->root; 1099 struct btrfs_fs_info *fs_info = 
root->fs_info; 1100 struct btrfs_ordered_extent *ordered; 1101 struct btrfs_file_extent file_extent; 1102 struct btrfs_key ins; 1103 struct folio *locked_folio = NULL; 1104 struct extent_state *cached = NULL; 1105 struct extent_map *em; 1106 int ret = 0; 1107 bool free_pages = false; 1108 u64 start = async_extent->start; 1109 u64 end = async_extent->start + async_extent->ram_size - 1; 1110 1111 if (async_chunk->blkcg_css) 1112 kthread_associate_blkcg(async_chunk->blkcg_css); 1113 1114 /* 1115 * If async_chunk->locked_folio is in the async_extent range, we need to 1116 * handle it. 1117 */ 1118 if (async_chunk->locked_folio) { 1119 u64 locked_folio_start = folio_pos(async_chunk->locked_folio); 1120 u64 locked_folio_end = locked_folio_start + 1121 folio_size(async_chunk->locked_folio) - 1; 1122 1123 if (!(start >= locked_folio_end || end <= locked_folio_start)) 1124 locked_folio = async_chunk->locked_folio; 1125 } 1126 1127 if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { 1128 ASSERT(!async_extent->folios); 1129 ASSERT(async_extent->nr_folios == 0); 1130 submit_uncompressed_range(inode, async_extent, locked_folio); 1131 free_pages = true; 1132 goto done; 1133 } 1134 1135 ret = btrfs_reserve_extent(root, async_extent->ram_size, 1136 async_extent->compressed_size, 1137 async_extent->compressed_size, 1138 0, *alloc_hint, &ins, 1, 1); 1139 if (ret) { 1140 /* 1141 * We can't reserve contiguous space for the compressed size. 1142 * Unlikely, but it's possible that we could have enough 1143 * non-contiguous space for the uncompressed size instead. So 1144 * fall back to uncompressed. 1145 */ 1146 submit_uncompressed_range(inode, async_extent, locked_folio); 1147 free_pages = true; 1148 goto done; 1149 } 1150 1151 btrfs_lock_extent(io_tree, start, end, &cached); 1152 1153 /* Here we're doing allocation and writeback of the compressed pages */ 1154 file_extent.disk_bytenr = ins.objectid; 1155 file_extent.disk_num_bytes = ins.offset; 1156 file_extent.ram_bytes = async_extent->ram_size; 1157 file_extent.num_bytes = async_extent->ram_size; 1158 file_extent.offset = 0; 1159 file_extent.compression = async_extent->compress_type; 1160 1161 em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); 1162 if (IS_ERR(em)) { 1163 ret = PTR_ERR(em); 1164 goto out_free_reserve; 1165 } 1166 btrfs_free_extent_map(em); 1167 1168 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, 1169 1U << BTRFS_ORDERED_COMPRESSED); 1170 if (IS_ERR(ordered)) { 1171 btrfs_drop_extent_map_range(inode, start, end, false); 1172 ret = PTR_ERR(ordered); 1173 goto out_free_reserve; 1174 } 1175 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1176 1177 /* Clear dirty, set writeback and unlock the pages. 
*/ 1178 extent_clear_unlock_delalloc(inode, start, end, 1179 NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, 1180 PAGE_UNLOCK | PAGE_START_WRITEBACK); 1181 btrfs_submit_compressed_write(ordered, 1182 async_extent->folios, /* compressed_folios */ 1183 async_extent->nr_folios, 1184 async_chunk->write_flags, true); 1185 *alloc_hint = ins.objectid + ins.offset; 1186 done: 1187 if (async_chunk->blkcg_css) 1188 kthread_associate_blkcg(NULL); 1189 if (free_pages) 1190 free_async_extent_pages(async_extent); 1191 kfree(async_extent); 1192 return; 1193 1194 out_free_reserve: 1195 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1196 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); 1197 mapping_set_error(inode->vfs_inode.i_mapping, -EIO); 1198 extent_clear_unlock_delalloc(inode, start, end, 1199 NULL, &cached, 1200 EXTENT_LOCKED | EXTENT_DELALLOC | 1201 EXTENT_DELALLOC_NEW | 1202 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, 1203 PAGE_UNLOCK | PAGE_START_WRITEBACK | 1204 PAGE_END_WRITEBACK); 1205 free_async_extent_pages(async_extent); 1206 if (async_chunk->blkcg_css) 1207 kthread_associate_blkcg(NULL); 1208 btrfs_debug(fs_info, 1209 "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", 1210 btrfs_root_id(root), btrfs_ino(inode), start, 1211 async_extent->ram_size, ret); 1212 kfree(async_extent); 1213 } 1214 1215 u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, 1216 u64 num_bytes) 1217 { 1218 struct extent_map_tree *em_tree = &inode->extent_tree; 1219 struct extent_map *em; 1220 u64 alloc_hint = 0; 1221 1222 read_lock(&em_tree->lock); 1223 em = btrfs_search_extent_mapping(em_tree, start, num_bytes); 1224 if (em) { 1225 /* 1226 * if block start isn't an actual block number then find the 1227 * first block in this inode and use that as a hint. If that 1228 * block is also bogus then just don't worry about it. 1229 */ 1230 if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { 1231 btrfs_free_extent_map(em); 1232 em = btrfs_search_extent_mapping(em_tree, 0, 0); 1233 if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) 1234 alloc_hint = btrfs_extent_map_block_start(em); 1235 if (em) 1236 btrfs_free_extent_map(em); 1237 } else { 1238 alloc_hint = btrfs_extent_map_block_start(em); 1239 btrfs_free_extent_map(em); 1240 } 1241 } 1242 read_unlock(&em_tree->lock); 1243 1244 return alloc_hint; 1245 } 1246 1247 /* 1248 * when extent_io.c finds a delayed allocation range in the file, 1249 * the call backs end up in this code. The basic idea is to 1250 * allocate extents on disk for the range, and create ordered data structs 1251 * in ram to track those extents. 1252 * 1253 * locked_folio is the folio that writepage had locked already. We use 1254 * it to make sure we don't do extra locks or unlocks. 1255 * 1256 * When this function fails, it unlocks all folios except @locked_folio. 1257 * 1258 * When this function successfully creates an inline extent, it returns 1 and 1259 * unlocks all folios including locked_folio and starts I/O on them. 1260 * (In reality inline extents are limited to a single block, so locked_folio is 1261 * the only folio handled anyway). 1262 * 1263 * When this function succeed and creates a normal extent, the folio locking 1264 * status depends on the passed in flags: 1265 * 1266 * - If COW_FILE_RANGE_KEEP_LOCKED flag is set, all folios are kept locked. 1267 * - Else all folios except for @locked_folio are unlocked. 
1268 * 1269 * When a failure happens in the second or later iteration of the 1270 * while-loop, the ordered extents created in previous iterations are cleaned up. 1271 */ 1272 static noinline int cow_file_range(struct btrfs_inode *inode, 1273 struct folio *locked_folio, u64 start, 1274 u64 end, u64 *done_offset, 1275 unsigned long flags) 1276 { 1277 struct btrfs_root *root = inode->root; 1278 struct btrfs_fs_info *fs_info = root->fs_info; 1279 struct extent_state *cached = NULL; 1280 u64 alloc_hint = 0; 1281 u64 orig_start = start; 1282 u64 num_bytes; 1283 u64 cur_alloc_size = 0; 1284 u64 min_alloc_size; 1285 u64 blocksize = fs_info->sectorsize; 1286 struct btrfs_key ins; 1287 struct extent_map *em; 1288 unsigned clear_bits; 1289 unsigned long page_ops; 1290 int ret = 0; 1291 1292 if (btrfs_is_free_space_inode(inode)) { 1293 ret = -EINVAL; 1294 goto out_unlock; 1295 } 1296 1297 num_bytes = ALIGN(end - start + 1, blocksize); 1298 num_bytes = max(blocksize, num_bytes); 1299 ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); 1300 1301 inode_should_defrag(inode, start, end, num_bytes, SZ_64K); 1302 1303 if (!(flags & COW_FILE_RANGE_NO_INLINE)) { 1304 /* lets try to make an inline extent */ 1305 ret = cow_file_range_inline(inode, locked_folio, start, end, 0, 1306 BTRFS_COMPRESS_NONE, NULL, false); 1307 if (ret <= 0) { 1308 /* 1309 * We succeeded, return 1 so the caller knows we're done 1310 * with this page and already handled the IO. 1311 * 1312 * If there was an error then cow_file_range_inline() has 1313 * already done the cleanup. 1314 */ 1315 if (ret == 0) 1316 ret = 1; 1317 goto done; 1318 } 1319 } 1320 1321 alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); 1322 1323 /* 1324 * We're not doing compressed IO, don't unlock the first page (which 1325 * the caller expects to stay locked), don't clear any dirty bits and 1326 * don't set any writeback bits. 1327 * 1328 * Do set the Ordered (Private2) bit so we know this page was properly 1329 * setup for writepage. 1330 */ 1331 page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK); 1332 page_ops |= PAGE_SET_ORDERED; 1333 1334 /* 1335 * Relocation relies on the relocated extents to have exactly the same 1336 * size as the original extents. Normally writeback for relocation data 1337 * extents follows a NOCOW path because relocation preallocates the 1338 * extents. However, due to an operation such as scrub turning a block 1339 * group to RO mode, it may fallback to COW mode, so we must make sure 1340 * an extent allocated during COW has exactly the requested size and can 1341 * not be split into smaller extents, otherwise relocation breaks and 1342 * fails during the stage where it updates the bytenr of file extent 1343 * items. 1344 */ 1345 if (btrfs_is_data_reloc_root(root)) 1346 min_alloc_size = num_bytes; 1347 else 1348 min_alloc_size = fs_info->sectorsize; 1349 1350 while (num_bytes > 0) { 1351 struct btrfs_ordered_extent *ordered; 1352 struct btrfs_file_extent file_extent; 1353 1354 ret = btrfs_reserve_extent(root, num_bytes, num_bytes, 1355 min_alloc_size, 0, alloc_hint, 1356 &ins, 1, 1); 1357 if (ret == -EAGAIN) { 1358 /* 1359 * btrfs_reserve_extent only returns -EAGAIN for zoned 1360 * file systems, which is an indication that there are 1361 * no active zones to allocate from at the moment. 1362 * 1363 * If this is the first loop iteration, wait for at 1364 * least one zone to finish before retrying the 1365 * allocation. 
Otherwise ask the caller to write out 1366 * the already allocated blocks before coming back to 1367 * us, or return -ENOSPC if it can't handle retries. 1368 */ 1369 ASSERT(btrfs_is_zoned(fs_info)); 1370 if (start == orig_start) { 1371 wait_on_bit_io(&inode->root->fs_info->flags, 1372 BTRFS_FS_NEED_ZONE_FINISH, 1373 TASK_UNINTERRUPTIBLE); 1374 continue; 1375 } 1376 if (done_offset) { 1377 /* 1378 * Move @end to the end of the processed range, 1379 * and exit the loop to unlock the processed extents. 1380 */ 1381 end = start - 1; 1382 ret = 0; 1383 break; 1384 } 1385 ret = -ENOSPC; 1386 } 1387 if (ret < 0) 1388 goto out_unlock; 1389 cur_alloc_size = ins.offset; 1390 1391 file_extent.disk_bytenr = ins.objectid; 1392 file_extent.disk_num_bytes = ins.offset; 1393 file_extent.num_bytes = ins.offset; 1394 file_extent.ram_bytes = ins.offset; 1395 file_extent.offset = 0; 1396 file_extent.compression = BTRFS_COMPRESS_NONE; 1397 1398 /* 1399 * Locked range will be released either during error clean up or 1400 * after the whole range is finished. 1401 */ 1402 btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, 1403 &cached); 1404 1405 em = btrfs_create_io_em(inode, start, &file_extent, 1406 BTRFS_ORDERED_REGULAR); 1407 if (IS_ERR(em)) { 1408 btrfs_unlock_extent(&inode->io_tree, start, 1409 start + cur_alloc_size - 1, &cached); 1410 ret = PTR_ERR(em); 1411 goto out_reserve; 1412 } 1413 btrfs_free_extent_map(em); 1414 1415 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, 1416 1U << BTRFS_ORDERED_REGULAR); 1417 if (IS_ERR(ordered)) { 1418 btrfs_unlock_extent(&inode->io_tree, start, 1419 start + cur_alloc_size - 1, &cached); 1420 ret = PTR_ERR(ordered); 1421 goto out_drop_extent_cache; 1422 } 1423 1424 if (btrfs_is_data_reloc_root(root)) { 1425 ret = btrfs_reloc_clone_csums(ordered); 1426 1427 /* 1428 * Only drop cache here, and process as normal. 1429 * 1430 * We must not allow extent_clear_unlock_delalloc() 1431 * at out_unlock label to free meta of this ordered 1432 * extent, as its meta should be freed by 1433 * btrfs_finish_ordered_io(). 1434 * 1435 * So we must continue until @start is increased to 1436 * skip current ordered extent. 1437 */ 1438 if (ret) 1439 btrfs_drop_extent_map_range(inode, start, 1440 start + cur_alloc_size - 1, 1441 false); 1442 } 1443 btrfs_put_ordered_extent(ordered); 1444 1445 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1446 1447 if (num_bytes < cur_alloc_size) 1448 num_bytes = 0; 1449 else 1450 num_bytes -= cur_alloc_size; 1451 alloc_hint = ins.objectid + ins.offset; 1452 start += cur_alloc_size; 1453 cur_alloc_size = 0; 1454 1455 /* 1456 * btrfs_reloc_clone_csums() error, since start is increased 1457 * extent_clear_unlock_delalloc() at out_unlock label won't 1458 * free metadata of current ordered extent, we're OK to exit. 
1459 */ 1460 if (ret) 1461 goto out_unlock; 1462 } 1463 extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, 1464 EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); 1465 done: 1466 if (done_offset) 1467 *done_offset = end; 1468 return ret; 1469 1470 out_drop_extent_cache: 1471 btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); 1472 out_reserve: 1473 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1474 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); 1475 out_unlock: 1476 /* 1477 * Now, we have three regions to clean up: 1478 * 1479 * |-------(1)----|---(2)---|-------------(3)----------| 1480 * `- orig_start `- start `- start + cur_alloc_size `- end 1481 * 1482 * We process each region below. 1483 */ 1484 1485 /* 1486 * For the range (1). We have already instantiated the ordered extents 1487 * for this region, thus we need to cleanup those ordered extents. 1488 * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV 1489 * are also handled by the ordered extents cleanup. 1490 * 1491 * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and 1492 * finish the writeback of the involved folios, which will be never submitted. 1493 */ 1494 if (orig_start < start) { 1495 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; 1496 page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; 1497 1498 if (!locked_folio) 1499 mapping_set_error(inode->vfs_inode.i_mapping, ret); 1500 1501 btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start); 1502 extent_clear_unlock_delalloc(inode, orig_start, start - 1, 1503 locked_folio, NULL, clear_bits, page_ops); 1504 } 1505 1506 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | 1507 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; 1508 page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; 1509 1510 /* 1511 * For the range (2). If we reserved an extent for our delalloc range 1512 * (or a subrange) and failed to create the respective ordered extent, 1513 * then it means that when we reserved the extent we decremented the 1514 * extent's size from the data space_info's bytes_may_use counter and 1515 * incremented the space_info's bytes_reserved counter by the same 1516 * amount. We must make sure extent_clear_unlock_delalloc() does not try 1517 * to decrement again the data space_info's bytes_may_use counter, 1518 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. 1519 */ 1520 if (cur_alloc_size) { 1521 extent_clear_unlock_delalloc(inode, start, 1522 start + cur_alloc_size - 1, 1523 locked_folio, &cached, clear_bits, 1524 page_ops); 1525 btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); 1526 } 1527 1528 /* 1529 * For the range (3). We never touched the region. In addition to the 1530 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data 1531 * space_info's bytes_may_use counter, reserved in 1532 * btrfs_check_data_free_space(). 
1533 */ 1534 if (start + cur_alloc_size < end) { 1535 clear_bits |= EXTENT_CLEAR_DATA_RESV; 1536 extent_clear_unlock_delalloc(inode, start + cur_alloc_size, 1537 end, locked_folio, 1538 &cached, clear_bits, page_ops); 1539 btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, 1540 end - start - cur_alloc_size + 1, NULL); 1541 } 1542 btrfs_err(fs_info, 1543 "%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d", 1544 __func__, btrfs_root_id(inode->root), 1545 btrfs_ino(inode), orig_start, end + 1 - orig_start, 1546 start, cur_alloc_size, ret); 1547 return ret; 1548 } 1549 1550 /* 1551 * Phase two of compressed writeback. This is the ordered portion of the code, 1552 * which only gets called in the order the work was queued. We walk all the 1553 * async extents created by compress_file_range and send them down to the disk. 1554 * 1555 * If called with @do_free == true then it'll try to finish the work and free 1556 * the work struct eventually. 1557 */ 1558 static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free) 1559 { 1560 struct async_chunk *async_chunk = container_of(work, struct async_chunk, 1561 work); 1562 struct btrfs_fs_info *fs_info = btrfs_work_owner(work); 1563 struct async_extent *async_extent; 1564 unsigned long nr_pages; 1565 u64 alloc_hint = 0; 1566 1567 if (do_free) { 1568 struct async_cow *async_cow; 1569 1570 btrfs_add_delayed_iput(async_chunk->inode); 1571 if (async_chunk->blkcg_css) 1572 css_put(async_chunk->blkcg_css); 1573 1574 async_cow = async_chunk->async_cow; 1575 if (atomic_dec_and_test(&async_cow->num_chunks)) 1576 kvfree(async_cow); 1577 return; 1578 } 1579 1580 nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> 1581 PAGE_SHIFT; 1582 1583 while (!list_empty(&async_chunk->extents)) { 1584 async_extent = list_first_entry(&async_chunk->extents, 1585 struct async_extent, list); 1586 list_del(&async_extent->list); 1587 submit_one_async_extent(async_chunk, async_extent, &alloc_hint); 1588 } 1589 1590 /* atomic_sub_return implies a barrier */ 1591 if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < 1592 5 * SZ_1M) 1593 cond_wake_up_nomb(&fs_info->async_submit_wait); 1594 } 1595 1596 static bool run_delalloc_compressed(struct btrfs_inode *inode, 1597 struct folio *locked_folio, u64 start, 1598 u64 end, struct writeback_control *wbc) 1599 { 1600 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1601 struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); 1602 struct async_cow *ctx; 1603 struct async_chunk *async_chunk; 1604 unsigned long nr_pages; 1605 u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); 1606 int i; 1607 unsigned nofs_flag; 1608 const blk_opf_t write_flags = wbc_to_write_flags(wbc); 1609 1610 nofs_flag = memalloc_nofs_save(); 1611 ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL); 1612 memalloc_nofs_restore(nofs_flag); 1613 if (!ctx) 1614 return false; 1615 1616 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); 1617 1618 async_chunk = ctx->chunks; 1619 atomic_set(&ctx->num_chunks, num_chunks); 1620 1621 for (i = 0; i < num_chunks; i++) { 1622 u64 cur_end = min(end, start + SZ_512K - 1); 1623 1624 /* 1625 * igrab is called higher up in the call chain, take only the 1626 * lightweight reference for the callback lifetime 1627 */ 1628 ihold(&inode->vfs_inode); 1629 async_chunk[i].async_cow = ctx; 1630 async_chunk[i].inode = inode; 1631 async_chunk[i].start = start; 1632 async_chunk[i].end = cur_end; 1633 
async_chunk[i].write_flags = write_flags; 1634 INIT_LIST_HEAD(&async_chunk[i].extents); 1635 1636 /* 1637 * The locked_folio comes all the way from writepage and its 1638 * the original folio we were actually given. As we spread 1639 * this large delalloc region across multiple async_chunk 1640 * structs, only the first struct needs a pointer to 1641 * locked_folio. 1642 * 1643 * This way we don't need racey decisions about who is supposed 1644 * to unlock it. 1645 */ 1646 if (locked_folio) { 1647 /* 1648 * Depending on the compressibility, the pages might or 1649 * might not go through async. We want all of them to 1650 * be accounted against wbc once. Let's do it here 1651 * before the paths diverge. wbc accounting is used 1652 * only for foreign writeback detection and doesn't 1653 * need full accuracy. Just account the whole thing 1654 * against the first page. 1655 */ 1656 wbc_account_cgroup_owner(wbc, locked_folio, 1657 cur_end - start); 1658 async_chunk[i].locked_folio = locked_folio; 1659 locked_folio = NULL; 1660 } else { 1661 async_chunk[i].locked_folio = NULL; 1662 } 1663 1664 if (blkcg_css != blkcg_root_css) { 1665 css_get(blkcg_css); 1666 async_chunk[i].blkcg_css = blkcg_css; 1667 async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT; 1668 } else { 1669 async_chunk[i].blkcg_css = NULL; 1670 } 1671 1672 btrfs_init_work(&async_chunk[i].work, compress_file_range, 1673 submit_compressed_extents); 1674 1675 nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); 1676 atomic_add(nr_pages, &fs_info->async_delalloc_pages); 1677 1678 btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); 1679 1680 start = cur_end + 1; 1681 } 1682 return true; 1683 } 1684 1685 /* 1686 * Run the delalloc range from start to end, and write back any dirty pages 1687 * covered by the range. 1688 */ 1689 static noinline int run_delalloc_cow(struct btrfs_inode *inode, 1690 struct folio *locked_folio, u64 start, 1691 u64 end, struct writeback_control *wbc, 1692 bool pages_dirty) 1693 { 1694 u64 done_offset = end; 1695 int ret; 1696 1697 while (start <= end) { 1698 ret = cow_file_range(inode, locked_folio, start, end, 1699 &done_offset, COW_FILE_RANGE_KEEP_LOCKED); 1700 if (ret) 1701 return ret; 1702 extent_write_locked_range(&inode->vfs_inode, locked_folio, 1703 start, done_offset, wbc, pages_dirty); 1704 start = done_offset + 1; 1705 } 1706 1707 return 1; 1708 } 1709 1710 static int fallback_to_cow(struct btrfs_inode *inode, 1711 struct folio *locked_folio, const u64 start, 1712 const u64 end) 1713 { 1714 const bool is_space_ino = btrfs_is_free_space_inode(inode); 1715 const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); 1716 const u64 range_bytes = end + 1 - start; 1717 struct extent_io_tree *io_tree = &inode->io_tree; 1718 struct extent_state *cached_state = NULL; 1719 u64 range_start = start; 1720 u64 count; 1721 int ret; 1722 1723 /* 1724 * If EXTENT_NORESERVE is set it means that when the buffered write was 1725 * made we had not enough available data space and therefore we did not 1726 * reserve data space for it, since we though we could do NOCOW for the 1727 * respective file range (either there is prealloc extent or the inode 1728 * has the NOCOW bit set). 1729 * 1730 * However when we need to fallback to COW mode (because for example the 1731 * block group for the corresponding extent was turned to RO mode by a 1732 * scrub or relocation) we need to do the following: 1733 * 1734 * 1) We increment the bytes_may_use counter of the data space info. 
1735 * If COW succeeds, it allocates a new data extent and after doing 1736 * that it decrements the space info's bytes_may_use counter and 1737 * increments its bytes_reserved counter by the same amount (we do 1738 * this at btrfs_add_reserved_bytes()). So we need to increment the 1739 * bytes_may_use counter to compensate (when space is reserved at 1740 * buffered write time, the bytes_may_use counter is incremented); 1741 * 1742 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so 1743 * that if the COW path fails for any reason, it decrements (through 1744 * extent_clear_unlock_delalloc()) the bytes_may_use counter of the 1745 * data space info, which we incremented in the step above. 1746 * 1747 * If we need to fallback to cow and the inode corresponds to a free 1748 * space cache inode or an inode of the data relocation tree, we must 1749 * also increment bytes_may_use of the data space_info for the same 1750 * reason. Space caches and relocated data extents always get a prealloc 1751 * extent for them, however scrub or balance may have set the block 1752 * group that contains that extent to RO mode and therefore force COW 1753 * when starting writeback. 1754 */ 1755 btrfs_lock_extent(io_tree, start, end, &cached_state); 1756 count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes, 1757 EXTENT_NORESERVE, 0, NULL); 1758 if (count > 0 || is_space_ino || is_reloc_ino) { 1759 u64 bytes = count; 1760 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1761 struct btrfs_space_info *sinfo = fs_info->data_sinfo; 1762 1763 if (is_space_ino || is_reloc_ino) 1764 bytes = range_bytes; 1765 1766 spin_lock(&sinfo->lock); 1767 btrfs_space_info_update_bytes_may_use(sinfo, bytes); 1768 spin_unlock(&sinfo->lock); 1769 1770 if (count > 0) 1771 btrfs_clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, 1772 &cached_state); 1773 } 1774 btrfs_unlock_extent(io_tree, start, end, &cached_state); 1775 1776 /* 1777 * Don't try to create inline extents, as a mix of inline extent that 1778 * is written out and unlocked directly and a normal NOCOW extent 1779 * doesn't work. 1780 * 1781 * And here we do not unlock the folio after a successful run. 1782 * The folios will be unlocked after everything is finished, or by error handling. 1783 * 1784 * This is to ensure error handling won't need to clear dirty/ordered flags without 1785 * a locked folio, which can race with writeback. 1786 */ 1787 ret = cow_file_range(inode, locked_folio, start, end, NULL, 1788 COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED); 1789 ASSERT(ret != 1); 1790 return ret; 1791 } 1792 1793 struct can_nocow_file_extent_args { 1794 /* Input fields. */ 1795 1796 /* Start file offset of the range we want to NOCOW. */ 1797 u64 start; 1798 /* End file offset (inclusive) of the range we want to NOCOW. */ 1799 u64 end; 1800 bool writeback_path; 1801 /* 1802 * Free the path passed to can_nocow_file_extent() once it's not needed 1803 * anymore. 1804 */ 1805 bool free_path; 1806 1807 /* 1808 * Output fields. Only set when can_nocow_file_extent() returns 1. 1809 * The expected file extent for the NOCOW write. 1810 */ 1811 struct btrfs_file_extent file_extent; 1812 }; 1813 1814 /* 1815 * Check if we can NOCOW the file extent that the path points to. 1816 * This function may return with the path released, so the caller should check 1817 * if path->nodes[0] is NULL or not if it needs to use the path afterwards. 
1818 * 1819 * Returns: < 0 on error 1820 * 0 if we can not NOCOW 1821 * 1 if we can NOCOW 1822 */ 1823 static int can_nocow_file_extent(struct btrfs_path *path, 1824 struct btrfs_key *key, 1825 struct btrfs_inode *inode, 1826 struct can_nocow_file_extent_args *args) 1827 { 1828 const bool is_freespace_inode = btrfs_is_free_space_inode(inode); 1829 struct extent_buffer *leaf = path->nodes[0]; 1830 struct btrfs_root *root = inode->root; 1831 struct btrfs_file_extent_item *fi; 1832 struct btrfs_root *csum_root; 1833 u64 io_start; 1834 u64 extent_end; 1835 u8 extent_type; 1836 int can_nocow = 0; 1837 int ret = 0; 1838 bool nowait = path->nowait; 1839 1840 fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 1841 extent_type = btrfs_file_extent_type(leaf, fi); 1842 1843 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 1844 goto out; 1845 1846 if (!(inode->flags & BTRFS_INODE_NODATACOW) && 1847 extent_type == BTRFS_FILE_EXTENT_REG) 1848 goto out; 1849 1850 /* 1851 * If the extent was created before the generation where the last snapshot 1852 * for its subvolume was created, then this implies the extent is shared, 1853 * hence we must COW. 1854 */ 1855 if (btrfs_file_extent_generation(leaf, fi) <= 1856 btrfs_root_last_snapshot(&root->root_item)) 1857 goto out; 1858 1859 /* An explicit hole, must COW. */ 1860 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) 1861 goto out; 1862 1863 /* Compressed/encrypted/encoded extents must be COWed. */ 1864 if (btrfs_file_extent_compression(leaf, fi) || 1865 btrfs_file_extent_encryption(leaf, fi) || 1866 btrfs_file_extent_other_encoding(leaf, fi)) 1867 goto out; 1868 1869 extent_end = btrfs_file_extent_end(path); 1870 1871 args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1872 args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 1873 args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 1874 args->file_extent.offset = btrfs_file_extent_offset(leaf, fi); 1875 args->file_extent.compression = btrfs_file_extent_compression(leaf, fi); 1876 1877 /* 1878 * The following checks can be expensive, as they need to take other 1879 * locks and do btree or rbtree searches, so release the path to avoid 1880 * blocking other tasks for too long. 1881 */ 1882 btrfs_release_path(path); 1883 1884 ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset, 1885 args->file_extent.disk_bytenr, path); 1886 WARN_ON_ONCE(ret > 0 && is_freespace_inode); 1887 if (ret != 0) 1888 goto out; 1889 1890 if (args->free_path) { 1891 /* 1892 * We don't need the path anymore, plus through the 1893 * btrfs_lookup_csums_list() call below we will end up allocating 1894 * another path. So free the path to avoid unnecessary extra 1895 * memory usage. 1896 */ 1897 btrfs_free_path(path); 1898 path = NULL; 1899 } 1900 1901 /* If there are pending snapshots for this root, we must COW. */ 1902 if (args->writeback_path && !is_freespace_inode && 1903 atomic_read(&root->snapshot_force_cow)) 1904 goto out; 1905 1906 args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start; 1907 args->file_extent.offset += args->start - key->offset; 1908 io_start = args->file_extent.disk_bytenr + args->file_extent.offset; 1909 1910 /* 1911 * Force COW if csums exist in the range. This ensures that csums for a 1912 * given extent are either valid or do not exist. 
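 * A NOCOW write does not create new csum items, so writing over a range that
 * still has csums would leave stale checksums behind and make later reads of
 * that range fail csum verification.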
1913 */ 1914 1915 csum_root = btrfs_csum_root(root->fs_info, io_start); 1916 ret = btrfs_lookup_csums_list(csum_root, io_start, 1917 io_start + args->file_extent.num_bytes - 1, 1918 NULL, nowait); 1919 WARN_ON_ONCE(ret > 0 && is_freespace_inode); 1920 if (ret != 0) 1921 goto out; 1922 1923 can_nocow = 1; 1924 out: 1925 if (args->free_path && path) 1926 btrfs_free_path(path); 1927 1928 return ret < 0 ? ret : can_nocow; 1929 } 1930 1931 static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, 1932 struct extent_state **cached, 1933 struct can_nocow_file_extent_args *nocow_args, 1934 u64 file_pos, bool is_prealloc) 1935 { 1936 struct btrfs_ordered_extent *ordered; 1937 const u64 len = nocow_args->file_extent.num_bytes; 1938 const u64 end = file_pos + len - 1; 1939 int ret = 0; 1940 1941 btrfs_lock_extent(&inode->io_tree, file_pos, end, cached); 1942 1943 if (is_prealloc) { 1944 struct extent_map *em; 1945 1946 em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, 1947 BTRFS_ORDERED_PREALLOC); 1948 if (IS_ERR(em)) { 1949 ret = PTR_ERR(em); 1950 goto error; 1951 } 1952 btrfs_free_extent_map(em); 1953 } 1954 1955 ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent, 1956 is_prealloc 1957 ? (1U << BTRFS_ORDERED_PREALLOC) 1958 : (1U << BTRFS_ORDERED_NOCOW)); 1959 if (IS_ERR(ordered)) { 1960 if (is_prealloc) 1961 btrfs_drop_extent_map_range(inode, file_pos, end, false); 1962 ret = PTR_ERR(ordered); 1963 goto error; 1964 } 1965 1966 if (btrfs_is_data_reloc_root(inode->root)) 1967 /* 1968 * Errors are handled later, as we must prevent 1969 * extent_clear_unlock_delalloc() in error handler from freeing 1970 * metadata of the created ordered extent. 1971 */ 1972 ret = btrfs_reloc_clone_csums(ordered); 1973 btrfs_put_ordered_extent(ordered); 1974 1975 if (ret < 0) 1976 goto error; 1977 extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, 1978 EXTENT_LOCKED | EXTENT_DELALLOC | 1979 EXTENT_CLEAR_DATA_RESV, 1980 PAGE_SET_ORDERED); 1981 return ret; 1982 1983 error: 1984 btrfs_cleanup_ordered_extents(inode, file_pos, len); 1985 extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, 1986 EXTENT_LOCKED | EXTENT_DELALLOC | 1987 EXTENT_CLEAR_DATA_RESV, 1988 PAGE_UNLOCK | PAGE_START_WRITEBACK | 1989 PAGE_END_WRITEBACK); 1990 btrfs_err(inode->root->fs_info, 1991 "%s failed, root=%lld inode=%llu start=%llu len=%llu: %d", 1992 __func__, btrfs_root_id(inode->root), btrfs_ino(inode), 1993 file_pos, len, ret); 1994 return ret; 1995 } 1996 1997 /* 1998 * When nocow writeback calls back. This checks for snapshots or COW copies 1999 * of the extents that exist in the file, and COWs the file as required. 2000 * 2001 * If no cow copies or snapshots exist, we write directly to the existing 2002 * blocks on disk 2003 */ 2004 static noinline int run_delalloc_nocow(struct btrfs_inode *inode, 2005 struct folio *locked_folio, 2006 const u64 start, const u64 end) 2007 { 2008 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2009 struct btrfs_root *root = inode->root; 2010 struct btrfs_path *path; 2011 u64 cow_start = (u64)-1; 2012 /* 2013 * If not 0, represents the inclusive end of the last fallback_to_cow() 2014 * range. Only for error handling. 2015 * 2016 * The same for nocow_end, it's to avoid double cleaning up the range 2017 * already cleaned by nocow_one_range(). 
2018 */ 2019 u64 cow_end = 0; 2020 u64 nocow_end = 0; 2021 u64 cur_offset = start; 2022 int ret; 2023 bool check_prev = true; 2024 u64 ino = btrfs_ino(inode); 2025 struct can_nocow_file_extent_args nocow_args = { 0 }; 2026 /* The range that has ordered extent(s). */ 2027 u64 oe_cleanup_start; 2028 u64 oe_cleanup_len = 0; 2029 /* The range that is untouched. */ 2030 u64 untouched_start; 2031 u64 untouched_len = 0; 2032 2033 /* 2034 * Normally on a zoned device we're only doing COW writes, but in the 2035 * case of relocation a zoned filesystem serializes I/O so that we're only 2036 * writing sequentially, and so we can end up here as well. 2037 */ 2038 ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); 2039 2040 path = btrfs_alloc_path(); 2041 if (!path) { 2042 ret = -ENOMEM; 2043 goto error; 2044 } 2045 2046 nocow_args.end = end; 2047 nocow_args.writeback_path = true; 2048 2049 while (cur_offset <= end) { 2050 struct btrfs_block_group *nocow_bg = NULL; 2051 struct btrfs_key found_key; 2052 struct btrfs_file_extent_item *fi; 2053 struct extent_buffer *leaf; 2054 struct extent_state *cached_state = NULL; 2055 u64 extent_end; 2056 int extent_type; 2057 2058 ret = btrfs_lookup_file_extent(NULL, root, path, ino, 2059 cur_offset, 0); 2060 if (ret < 0) 2061 goto error; 2062 2063 /* 2064 * If there is no extent for our range when doing the initial 2065 * search, then go back to the previous slot as it will be the 2066 * one containing the search offset 2067 */ 2068 if (ret > 0 && path->slots[0] > 0 && check_prev) { 2069 leaf = path->nodes[0]; 2070 btrfs_item_key_to_cpu(leaf, &found_key, 2071 path->slots[0] - 1); 2072 if (found_key.objectid == ino && 2073 found_key.type == BTRFS_EXTENT_DATA_KEY) 2074 path->slots[0]--; 2075 } 2076 check_prev = false; 2077 next_slot: 2078 /* Go to next leaf if we have exhausted the current one */ 2079 leaf = path->nodes[0]; 2080 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2081 ret = btrfs_next_leaf(root, path); 2082 if (ret < 0) 2083 goto error; 2084 if (ret > 0) 2085 break; 2086 leaf = path->nodes[0]; 2087 } 2088 2089 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2090 2091 /* Didn't find anything for our INO */ 2092 if (found_key.objectid > ino) 2093 break; 2094 /* 2095 * Keep searching until we find an EXTENT_ITEM or there are no 2096 * more extents for this inode 2097 */ 2098 if (WARN_ON_ONCE(found_key.objectid < ino) || 2099 found_key.type < BTRFS_EXTENT_DATA_KEY) { 2100 path->slots[0]++; 2101 goto next_slot; 2102 } 2103 2104 /* Found key is not EXTENT_DATA_KEY or starts after req range */ 2105 if (found_key.type > BTRFS_EXTENT_DATA_KEY || 2106 found_key.offset > end) 2107 break; 2108 2109 /* 2110 * If the found extent starts after the requested offset, then 2111 * adjust cur_offset to the start of this extent and COW the gap. 2112 */ 2113 if (found_key.offset > cur_offset) { 2114 if (cow_start == (u64)-1) 2115 cow_start = cur_offset; 2116 cur_offset = found_key.offset; 2117 goto next_slot; 2118 } 2119 2120 /* 2121 * Found an extent which begins before our range and potentially 2122 * intersects it 2123 */ 2124 fi = btrfs_item_ptr(leaf, path->slots[0], 2125 struct btrfs_file_extent_item); 2126 extent_type = btrfs_file_extent_type(leaf, fi); 2127 /* If this is triggered then we have a memory corruption.
*/ 2128 ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES); 2129 if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) { 2130 ret = -EUCLEAN; 2131 goto error; 2132 } 2133 extent_end = btrfs_file_extent_end(path); 2134 2135 /* 2136 * If the extent we got ends before our current offset, skip to 2137 * the next extent. 2138 */ 2139 if (extent_end <= cur_offset) { 2140 path->slots[0]++; 2141 goto next_slot; 2142 } 2143 2144 nocow_args.start = cur_offset; 2145 ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); 2146 if (ret < 0) 2147 goto error; 2148 if (ret == 0) 2149 goto must_cow; 2150 2151 ret = 0; 2152 nocow_bg = btrfs_inc_nocow_writers(fs_info, 2153 nocow_args.file_extent.disk_bytenr + 2154 nocow_args.file_extent.offset); 2155 if (!nocow_bg) { 2156 must_cow: 2157 /* 2158 * If we can't perform NOCOW writeback for the range, 2159 * then record the beginning of the range that needs to 2160 * be COWed. It will be written out before the next 2161 * NOCOW range if we find one, or when exiting this 2162 * loop. 2163 */ 2164 if (cow_start == (u64)-1) 2165 cow_start = cur_offset; 2166 cur_offset = extent_end; 2167 if (cur_offset > end) 2168 break; 2169 if (!path->nodes[0]) 2170 continue; 2171 path->slots[0]++; 2172 goto next_slot; 2173 } 2174 2175 /* 2176 * COW range from cow_start to found_key.offset - 1. As the key 2177 * will contain the beginning of the first extent that can be 2178 * NOCOW, following one which needs to be COW'ed 2179 */ 2180 if (cow_start != (u64)-1) { 2181 ret = fallback_to_cow(inode, locked_folio, cow_start, 2182 found_key.offset - 1); 2183 if (ret) { 2184 cow_end = found_key.offset - 1; 2185 btrfs_dec_nocow_writers(nocow_bg); 2186 goto error; 2187 } 2188 cow_start = (u64)-1; 2189 } 2190 2191 ret = nocow_one_range(inode, locked_folio, &cached_state, 2192 &nocow_args, cur_offset, 2193 extent_type == BTRFS_FILE_EXTENT_PREALLOC); 2194 btrfs_dec_nocow_writers(nocow_bg); 2195 if (ret < 0) { 2196 nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; 2197 goto error; 2198 } 2199 cur_offset = extent_end; 2200 } 2201 btrfs_release_path(path); 2202 2203 if (cur_offset <= end && cow_start == (u64)-1) 2204 cow_start = cur_offset; 2205 2206 if (cow_start != (u64)-1) { 2207 ret = fallback_to_cow(inode, locked_folio, cow_start, end); 2208 if (ret) { 2209 cow_end = end; 2210 goto error; 2211 } 2212 cow_start = (u64)-1; 2213 } 2214 2215 /* 2216 * Everything is finished without an error, can unlock the folios now. 2217 * 2218 * No need to touch the io tree range nor set folio ordered flag, as 2219 * fallback_to_cow() and nocow_one_range() have already handled them. 2220 */ 2221 extent_clear_unlock_delalloc(inode, start, end, locked_folio, NULL, 0, PAGE_UNLOCK); 2222 2223 btrfs_free_path(path); 2224 return 0; 2225 2226 error: 2227 if (cow_start == (u64)-1) { 2228 /* 2229 * case a) 2230 * start cur_offset end 2231 * | OE cleanup | Untouched | 2232 * 2233 * We finished a fallback_to_cow() or nocow_one_range() call, 2234 * but failed to check the next range. 2235 * 2236 * or 2237 * start cur_offset nocow_end end 2238 * | OE cleanup | Skip | Untouched | 2239 * 2240 * nocow_one_range() failed, the range [cur_offset, nocow_end] is 2241 * already cleaned up. 
2242 */ 2243 oe_cleanup_start = start; 2244 oe_cleanup_len = cur_offset - start; 2245 if (nocow_end) 2246 untouched_start = nocow_end + 1; 2247 else 2248 untouched_start = cur_offset; 2249 untouched_len = end + 1 - untouched_start; 2250 } else if (cow_start != (u64)-1 && cow_end == 0) { 2251 /* 2252 * case b) 2253 * start cow_start cur_offset end 2254 * | OE cleanup | Untouched | 2255 * 2256 * We got a range that needs COW, but before we hit the next NOCOW range, 2257 * thus [cow_start, cur_offset) doesn't yet have any OE. 2258 */ 2259 oe_cleanup_start = start; 2260 oe_cleanup_len = cow_start - start; 2261 untouched_start = cow_start; 2262 untouched_len = end + 1 - untouched_start; 2263 } else { 2264 /* 2265 * case c) 2266 * start cow_start cow_end end 2267 * | OE cleanup | Skip | Untouched | 2268 * 2269 * fallback_to_cow() failed, and fallback_to_cow() will do the 2270 * cleanup for its range, we shouldn't touch the range 2271 * [cow_start, cow_end]. 2272 */ 2273 ASSERT(cow_start != (u64)-1 && cow_end != 0); 2274 oe_cleanup_start = start; 2275 oe_cleanup_len = cow_start - start; 2276 untouched_start = cow_end + 1; 2277 untouched_len = end + 1 - untouched_start; 2278 } 2279 2280 if (oe_cleanup_len) { 2281 const u64 oe_cleanup_end = oe_cleanup_start + oe_cleanup_len - 1; 2282 btrfs_cleanup_ordered_extents(inode, oe_cleanup_start, oe_cleanup_len); 2283 extent_clear_unlock_delalloc(inode, oe_cleanup_start, oe_cleanup_end, 2284 locked_folio, NULL, 2285 EXTENT_LOCKED | EXTENT_DELALLOC, 2286 PAGE_UNLOCK | PAGE_START_WRITEBACK | 2287 PAGE_END_WRITEBACK); 2288 } 2289 2290 if (untouched_len) { 2291 struct extent_state *cached = NULL; 2292 const u64 untouched_end = untouched_start + untouched_len - 1; 2293 2294 /* 2295 * We need to lock the extent here because we're clearing DELALLOC and 2296 * we're not locked at this point. 2297 */ 2298 btrfs_lock_extent(&inode->io_tree, untouched_start, untouched_end, &cached); 2299 extent_clear_unlock_delalloc(inode, untouched_start, untouched_end, 2300 locked_folio, &cached, 2301 EXTENT_LOCKED | EXTENT_DELALLOC | 2302 EXTENT_DEFRAG | 2303 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | 2304 PAGE_START_WRITEBACK | 2305 PAGE_END_WRITEBACK); 2306 btrfs_qgroup_free_data(inode, NULL, untouched_start, untouched_len, NULL); 2307 } 2308 btrfs_free_path(path); 2309 btrfs_err(fs_info, 2310 "%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu oe_cleanup=%llu oe_cleanup_len=%llu untouched_start=%llu untouched_len=%llu: %d", 2311 __func__, btrfs_root_id(inode->root), btrfs_ino(inode), 2312 start, end + 1 - start, cur_offset, oe_cleanup_start, oe_cleanup_len, 2313 untouched_start, untouched_len, ret); 2314 return ret; 2315 } 2316 2317 static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) 2318 { 2319 if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { 2320 if (inode->defrag_bytes && 2321 btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) 2322 return false; 2323 return true; 2324 } 2325 return false; 2326 } 2327 2328 /* 2329 * Function to process delayed allocation (create CoW) for ranges which are 2330 * being touched for the first time. 2331 */ 2332 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, 2333 u64 start, u64 end, struct writeback_control *wbc) 2334 { 2335 const bool zoned = btrfs_is_zoned(inode->root->fs_info); 2336 int ret; 2337 2338 /* 2339 * The range must cover part of the @locked_folio, or a return of 1 2340 * can confuse the caller. 
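 * (Returning 1 tells the caller that the delalloc code has already written
 * back and unlocked the folios in the range itself, as the compressed and
 * zoned COW paths below do.)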
2341 */ 2342 ASSERT(!(end <= folio_pos(locked_folio) || 2343 start >= folio_next_pos(locked_folio))); 2344 2345 if (should_nocow(inode, start, end)) { 2346 ret = run_delalloc_nocow(inode, locked_folio, start, end); 2347 return ret; 2348 } 2349 2350 if (btrfs_inode_can_compress(inode) && 2351 inode_need_compress(inode, start, end) && 2352 run_delalloc_compressed(inode, locked_folio, start, end, wbc)) 2353 return 1; 2354 2355 if (zoned) 2356 ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, 2357 true); 2358 else 2359 ret = cow_file_range(inode, locked_folio, start, end, NULL, 0); 2360 return ret; 2361 } 2362 2363 void btrfs_split_delalloc_extent(struct btrfs_inode *inode, 2364 struct extent_state *orig, u64 split) 2365 { 2366 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2367 u64 size; 2368 2369 lockdep_assert_held(&inode->io_tree.lock); 2370 2371 /* not delalloc, ignore it */ 2372 if (!(orig->state & EXTENT_DELALLOC)) 2373 return; 2374 2375 size = orig->end - orig->start + 1; 2376 if (size > fs_info->max_extent_size) { 2377 u32 num_extents; 2378 u64 new_size; 2379 2380 /* 2381 * See the explanation in btrfs_merge_delalloc_extent, the same 2382 * applies here, just in reverse. 2383 */ 2384 new_size = orig->end - split + 1; 2385 num_extents = count_max_extents(fs_info, new_size); 2386 new_size = split - orig->start; 2387 num_extents += count_max_extents(fs_info, new_size); 2388 if (count_max_extents(fs_info, size) >= num_extents) 2389 return; 2390 } 2391 2392 spin_lock(&inode->lock); 2393 btrfs_mod_outstanding_extents(inode, 1); 2394 spin_unlock(&inode->lock); 2395 } 2396 2397 /* 2398 * Handle merged delayed allocation extents so we can keep track of new extents 2399 * that are just merged onto old extents, such as when we are doing sequential 2400 * writes, so we can properly account for the metadata space we'll need. 2401 */ 2402 void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, 2403 struct extent_state *other) 2404 { 2405 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2406 u64 new_size, old_size; 2407 u32 num_extents; 2408 2409 lockdep_assert_held(&inode->io_tree.lock); 2410 2411 /* not delalloc, ignore it */ 2412 if (!(other->state & EXTENT_DELALLOC)) 2413 return; 2414 2415 if (new->start > other->start) 2416 new_size = new->end - other->start + 1; 2417 else 2418 new_size = other->end - new->start + 1; 2419 2420 /* we're not bigger than the max, unreserve the space and go */ 2421 if (new_size <= fs_info->max_extent_size) { 2422 spin_lock(&inode->lock); 2423 btrfs_mod_outstanding_extents(inode, -1); 2424 spin_unlock(&inode->lock); 2425 return; 2426 } 2427 2428 /* 2429 * We have to add up either side to figure out how many extents were 2430 * accounted for before we merged into one big extent. If the number of 2431 * extents we accounted for is <= the amount we need for the new range 2432 * then we can return, otherwise drop. Think of it like this 2433 * 2434 * [ 4k][MAX_SIZE] 2435 * 2436 * So we've grown the extent by a MAX_SIZE extent, this would mean we 2437 * need 2 outstanding extents, on one side we have 1 and the other side 2438 * we have 1 so they are == and we can return. But in this case 2439 * 2440 * [MAX_SIZE+4k][MAX_SIZE+4k] 2441 * 2442 * Each range on their own accounts for 2 extents, but merged together 2443 * they are only 3 extents worth of accounting, so we need to drop in 2444 * this case. 
2445 */ 2446 old_size = other->end - other->start + 1; 2447 num_extents = count_max_extents(fs_info, old_size); 2448 old_size = new->end - new->start + 1; 2449 num_extents += count_max_extents(fs_info, old_size); 2450 if (count_max_extents(fs_info, new_size) >= num_extents) 2451 return; 2452 2453 spin_lock(&inode->lock); 2454 btrfs_mod_outstanding_extents(inode, -1); 2455 spin_unlock(&inode->lock); 2456 } 2457 2458 static void btrfs_add_delalloc_inode(struct btrfs_inode *inode) 2459 { 2460 struct btrfs_root *root = inode->root; 2461 struct btrfs_fs_info *fs_info = root->fs_info; 2462 2463 spin_lock(&root->delalloc_lock); 2464 ASSERT(list_empty(&inode->delalloc_inodes)); 2465 list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); 2466 root->nr_delalloc_inodes++; 2467 if (root->nr_delalloc_inodes == 1) { 2468 spin_lock(&fs_info->delalloc_root_lock); 2469 ASSERT(list_empty(&root->delalloc_root)); 2470 list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots); 2471 spin_unlock(&fs_info->delalloc_root_lock); 2472 } 2473 spin_unlock(&root->delalloc_lock); 2474 } 2475 2476 void btrfs_del_delalloc_inode(struct btrfs_inode *inode) 2477 { 2478 struct btrfs_root *root = inode->root; 2479 struct btrfs_fs_info *fs_info = root->fs_info; 2480 2481 lockdep_assert_held(&root->delalloc_lock); 2482 2483 /* 2484 * We may be called after the inode was already deleted from the list, 2485 * namely in the transaction abort path btrfs_destroy_delalloc_inodes(), 2486 * and then later through btrfs_clear_delalloc_extent() while the inode 2487 * still has ->delalloc_bytes > 0. 2488 */ 2489 if (!list_empty(&inode->delalloc_inodes)) { 2490 list_del_init(&inode->delalloc_inodes); 2491 root->nr_delalloc_inodes--; 2492 if (!root->nr_delalloc_inodes) { 2493 ASSERT(list_empty(&root->delalloc_inodes)); 2494 spin_lock(&fs_info->delalloc_root_lock); 2495 ASSERT(!list_empty(&root->delalloc_root)); 2496 list_del_init(&root->delalloc_root); 2497 spin_unlock(&fs_info->delalloc_root_lock); 2498 } 2499 } 2500 } 2501 2502 /* 2503 * Properly track delayed allocation bytes in the inode and to maintain the 2504 * list of inodes that have pending delalloc work to be done. 
2505 */ 2506 void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, 2507 u32 bits) 2508 { 2509 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2510 2511 lockdep_assert_held(&inode->io_tree.lock); 2512 2513 if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) 2514 WARN_ON(1); 2515 /* 2516 * set_bit and clear bit hooks normally require _irqsave/restore 2517 * but in this case, we are only testing for the DELALLOC 2518 * bit, which is only set or cleared with irqs on 2519 */ 2520 if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 2521 u64 len = state->end + 1 - state->start; 2522 u64 prev_delalloc_bytes; 2523 u32 num_extents = count_max_extents(fs_info, len); 2524 2525 spin_lock(&inode->lock); 2526 btrfs_mod_outstanding_extents(inode, num_extents); 2527 spin_unlock(&inode->lock); 2528 2529 /* For sanity tests */ 2530 if (btrfs_is_testing(fs_info)) 2531 return; 2532 2533 percpu_counter_add_batch(&fs_info->delalloc_bytes, len, 2534 fs_info->delalloc_batch); 2535 spin_lock(&inode->lock); 2536 prev_delalloc_bytes = inode->delalloc_bytes; 2537 inode->delalloc_bytes += len; 2538 if (bits & EXTENT_DEFRAG) 2539 inode->defrag_bytes += len; 2540 spin_unlock(&inode->lock); 2541 2542 /* 2543 * We don't need to be under the protection of the inode's lock, 2544 * because we are called while holding the inode's io_tree lock 2545 * and are therefore protected against concurrent calls of this 2546 * function and btrfs_clear_delalloc_extent(). 2547 */ 2548 if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0) 2549 btrfs_add_delalloc_inode(inode); 2550 } 2551 2552 if (!(state->state & EXTENT_DELALLOC_NEW) && 2553 (bits & EXTENT_DELALLOC_NEW)) { 2554 spin_lock(&inode->lock); 2555 inode->new_delalloc_bytes += state->end + 1 - state->start; 2556 spin_unlock(&inode->lock); 2557 } 2558 } 2559 2560 /* 2561 * Once a range is no longer delalloc this function ensures that proper 2562 * accounting happens. 2563 */ 2564 void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, 2565 struct extent_state *state, u32 bits) 2566 { 2567 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2568 u64 len = state->end + 1 - state->start; 2569 u32 num_extents = count_max_extents(fs_info, len); 2570 2571 lockdep_assert_held(&inode->io_tree.lock); 2572 2573 if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { 2574 spin_lock(&inode->lock); 2575 inode->defrag_bytes -= len; 2576 spin_unlock(&inode->lock); 2577 } 2578 2579 /* 2580 * set_bit and clear bit hooks normally require _irqsave/restore 2581 * but in this case, we are only testing for the DELALLOC 2582 * bit, which is only set or cleared with irqs on 2583 */ 2584 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 2585 struct btrfs_root *root = inode->root; 2586 u64 new_delalloc_bytes; 2587 2588 spin_lock(&inode->lock); 2589 btrfs_mod_outstanding_extents(inode, -num_extents); 2590 spin_unlock(&inode->lock); 2591 2592 /* 2593 * We don't reserve metadata space for space cache inodes so we 2594 * don't need to call delalloc_release_metadata if there is an 2595 * error. 2596 */ 2597 if (bits & EXTENT_CLEAR_META_RESV && 2598 root != fs_info->tree_root) 2599 btrfs_delalloc_release_metadata(inode, len, true); 2600 2601 /* For sanity tests. 
*/ 2602 if (btrfs_is_testing(fs_info)) 2603 return; 2604 2605 if (!btrfs_is_data_reloc_root(root) && 2606 !btrfs_is_free_space_inode(inode) && 2607 !(state->state & EXTENT_NORESERVE) && 2608 (bits & EXTENT_CLEAR_DATA_RESV)) 2609 btrfs_free_reserved_data_space_noquota(inode, len); 2610 2611 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, 2612 fs_info->delalloc_batch); 2613 spin_lock(&inode->lock); 2614 inode->delalloc_bytes -= len; 2615 new_delalloc_bytes = inode->delalloc_bytes; 2616 spin_unlock(&inode->lock); 2617 2618 /* 2619 * We don't need to be under the protection of the inode's lock, 2620 * because we are called while holding the inode's io_tree lock 2621 * and are therefore protected against concurrent calls of this 2622 * function and btrfs_set_delalloc_extent(). 2623 */ 2624 if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) { 2625 spin_lock(&root->delalloc_lock); 2626 btrfs_del_delalloc_inode(inode); 2627 spin_unlock(&root->delalloc_lock); 2628 } 2629 } 2630 2631 if ((state->state & EXTENT_DELALLOC_NEW) && 2632 (bits & EXTENT_DELALLOC_NEW)) { 2633 spin_lock(&inode->lock); 2634 ASSERT(inode->new_delalloc_bytes >= len); 2635 inode->new_delalloc_bytes -= len; 2636 if (bits & EXTENT_ADD_INODE_BYTES) 2637 inode_add_bytes(&inode->vfs_inode, len); 2638 spin_unlock(&inode->lock); 2639 } 2640 } 2641 2642 /* 2643 * given a list of ordered sums record them in the inode. This happens 2644 * at IO completion time based on sums calculated at bio submission time. 2645 */ 2646 static int add_pending_csums(struct btrfs_trans_handle *trans, 2647 struct list_head *list) 2648 { 2649 struct btrfs_ordered_sum *sum; 2650 struct btrfs_root *csum_root = NULL; 2651 int ret; 2652 2653 list_for_each_entry(sum, list, list) { 2654 trans->adding_csums = true; 2655 if (!csum_root) 2656 csum_root = btrfs_csum_root(trans->fs_info, 2657 sum->logical); 2658 ret = btrfs_csum_file_blocks(trans, csum_root, sum); 2659 trans->adding_csums = false; 2660 if (ret) 2661 return ret; 2662 } 2663 return 0; 2664 } 2665 2666 static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, 2667 const u64 start, 2668 const u64 len, 2669 struct extent_state **cached_state) 2670 { 2671 u64 search_start = start; 2672 const u64 end = start + len - 1; 2673 2674 while (search_start < end) { 2675 const u64 search_len = end - search_start + 1; 2676 struct extent_map *em; 2677 u64 em_len; 2678 int ret = 0; 2679 2680 em = btrfs_get_extent(inode, NULL, search_start, search_len); 2681 if (IS_ERR(em)) 2682 return PTR_ERR(em); 2683 2684 if (em->disk_bytenr != EXTENT_MAP_HOLE) 2685 goto next; 2686 2687 em_len = em->len; 2688 if (em->start < search_start) 2689 em_len -= search_start - em->start; 2690 if (em_len > search_len) 2691 em_len = search_len; 2692 2693 ret = btrfs_set_extent_bit(&inode->io_tree, search_start, 2694 search_start + em_len - 1, 2695 EXTENT_DELALLOC_NEW, cached_state); 2696 next: 2697 search_start = btrfs_extent_map_end(em); 2698 btrfs_free_extent_map(em); 2699 if (ret) 2700 return ret; 2701 } 2702 return 0; 2703 } 2704 2705 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, 2706 unsigned int extra_bits, 2707 struct extent_state **cached_state) 2708 { 2709 WARN_ON(PAGE_ALIGNED(end)); 2710 2711 if (start >= i_size_read(&inode->vfs_inode) && 2712 !(inode->flags & BTRFS_INODE_PREALLOC)) { 2713 /* 2714 * There can't be any extents following eof in this case so just 2715 * set the delalloc new bit for the range directly. 
2716 */ 2717 extra_bits |= EXTENT_DELALLOC_NEW; 2718 } else { 2719 int ret; 2720 2721 ret = btrfs_find_new_delalloc_bytes(inode, start, 2722 end + 1 - start, 2723 cached_state); 2724 if (ret) 2725 return ret; 2726 } 2727 2728 return btrfs_set_extent_bit(&inode->io_tree, start, end, 2729 EXTENT_DELALLOC | extra_bits, cached_state); 2730 } 2731 2732 /* see btrfs_writepage_start_hook for details on why this is required */ 2733 struct btrfs_writepage_fixup { 2734 struct folio *folio; 2735 struct btrfs_inode *inode; 2736 struct btrfs_work work; 2737 }; 2738 2739 static void btrfs_writepage_fixup_worker(struct btrfs_work *work) 2740 { 2741 struct btrfs_writepage_fixup *fixup = 2742 container_of(work, struct btrfs_writepage_fixup, work); 2743 struct btrfs_ordered_extent *ordered; 2744 struct extent_state *cached_state = NULL; 2745 struct extent_changeset *data_reserved = NULL; 2746 struct folio *folio = fixup->folio; 2747 struct btrfs_inode *inode = fixup->inode; 2748 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2749 u64 page_start = folio_pos(folio); 2750 u64 page_end = folio_next_pos(folio) - 1; 2751 int ret = 0; 2752 bool free_delalloc_space = true; 2753 2754 /* 2755 * This is similar to page_mkwrite, we need to reserve the space before 2756 * we take the folio lock. 2757 */ 2758 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, 2759 folio_size(folio)); 2760 again: 2761 folio_lock(folio); 2762 2763 /* 2764 * Before we queued this fixup, we took a reference on the folio. 2765 * folio->mapping may go NULL, but it shouldn't be moved to a different 2766 * address space. 2767 */ 2768 if (!folio->mapping || !folio_test_dirty(folio) || 2769 !folio_test_checked(folio)) { 2770 /* 2771 * Unfortunately this is a little tricky, either 2772 * 2773 * 1) We got here and our folio had already been dealt with and 2774 * we reserved our space, thus ret == 0, so we need to just 2775 * drop our space reservation and bail. This can happen the 2776 * first time we come into the fixup worker, or could happen 2777 * while waiting for the ordered extent. 2778 * 2) Our folio was already dealt with, but we happened to get an 2779 * ENOSPC above from the btrfs_delalloc_reserve_space. In 2780 * this case we obviously don't have anything to release, but 2781 * because the folio was already dealt with we don't want to 2782 * mark the folio with an error, so make sure we're resetting 2783 * ret to 0. This is why we have this check _before_ the ret 2784 * check, because we do not want to have a surprise ENOSPC 2785 * when the folio was already properly dealt with. 2786 */ 2787 if (!ret) { 2788 btrfs_delalloc_release_extents(inode, folio_size(folio)); 2789 btrfs_delalloc_release_space(inode, data_reserved, 2790 page_start, folio_size(folio), 2791 true); 2792 } 2793 ret = 0; 2794 goto out_page; 2795 } 2796 2797 /* 2798 * We can't mess with the folio state unless it is locked, so now that 2799 * it is locked bail if we failed to make our space reservation. 2800 */ 2801 if (ret) 2802 goto out_page; 2803 2804 btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state); 2805 2806 /* already ordered? 
We're done */ 2807 if (folio_test_ordered(folio)) 2808 goto out_reserved; 2809 2810 ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); 2811 if (ordered) { 2812 btrfs_unlock_extent(&inode->io_tree, page_start, page_end, 2813 &cached_state); 2814 folio_unlock(folio); 2815 btrfs_start_ordered_extent(ordered); 2816 btrfs_put_ordered_extent(ordered); 2817 goto again; 2818 } 2819 2820 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, 2821 &cached_state); 2822 if (ret) 2823 goto out_reserved; 2824 2825 /* 2826 * Everything went as planned, we're now the owner of a dirty page with 2827 * delayed allocation bits set and space reserved for our COW 2828 * destination. 2829 * 2830 * The page was dirty when we started, nothing should have cleaned it. 2831 */ 2832 BUG_ON(!folio_test_dirty(folio)); 2833 free_delalloc_space = false; 2834 out_reserved: 2835 btrfs_delalloc_release_extents(inode, PAGE_SIZE); 2836 if (free_delalloc_space) 2837 btrfs_delalloc_release_space(inode, data_reserved, page_start, 2838 PAGE_SIZE, true); 2839 btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); 2840 out_page: 2841 if (ret) { 2842 /* 2843 * We hit ENOSPC or other errors. Update the mapping and page 2844 * to reflect the errors and clean the page. 2845 */ 2846 mapping_set_error(folio->mapping, ret); 2847 btrfs_mark_ordered_io_finished(inode, folio, page_start, 2848 folio_size(folio), !ret); 2849 folio_clear_dirty_for_io(folio); 2850 } 2851 btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); 2852 folio_unlock(folio); 2853 folio_put(folio); 2854 kfree(fixup); 2855 extent_changeset_free(data_reserved); 2856 /* 2857 * As a precaution, do a delayed iput in case it would be the last iput 2858 * that could need flushing space. Recursing back to fixup worker would 2859 * deadlock. 2860 */ 2861 btrfs_add_delayed_iput(inode); 2862 } 2863 2864 /* 2865 * There are a few paths in the higher layers of the kernel that directly 2866 * set the folio dirty bit without asking the filesystem if it is a 2867 * good idea. This causes problems because we want to make sure COW 2868 * properly happens and the data=ordered rules are followed. 2869 * 2870 * In our case any range that doesn't have the ORDERED bit set 2871 * hasn't been properly set up for IO. We kick off an async process 2872 * to fix it up. The async helper will wait for ordered extents, set 2873 * the delalloc bit and make it safe to write the folio. 2874 */ 2875 int btrfs_writepage_cow_fixup(struct folio *folio) 2876 { 2877 struct inode *inode = folio->mapping->host; 2878 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 2879 struct btrfs_writepage_fixup *fixup; 2880 2881 /* This folio has an ordered extent covering it already */ 2882 if (folio_test_ordered(folio)) 2883 return 0; 2884 2885 /* 2886 * For experimental builds, we error out instead of returning EAGAIN. 2887 * 2888 * We should not hit such out-of-band dirty folios anymore. 2889 */ 2890 if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { 2891 DEBUG_WARN(); 2892 btrfs_err_rl(fs_info, 2893 "root %lld ino %llu folio %llu is marked dirty without notifying the fs", 2894 btrfs_root_id(BTRFS_I(inode)->root), 2895 btrfs_ino(BTRFS_I(inode)), 2896 folio_pos(folio)); 2897 return -EUCLEAN; 2898 } 2899 2900 /* 2901 * folio_checked is set below when we create a fixup worker for this 2902 * folio, don't try to create another one if we're already 2903 * folio_test_checked. 2904 * 2905 * The extent_io writepage code will redirty the folio if we send back 2906 * EAGAIN.
2907 */ 2908 if (folio_test_checked(folio)) 2909 return -EAGAIN; 2910 2911 fixup = kzalloc(sizeof(*fixup), GFP_NOFS); 2912 if (!fixup) 2913 return -EAGAIN; 2914 2915 /* 2916 * We are already holding a reference to this inode from 2917 * write_cache_pages. We need to hold it because the space reservation 2918 * takes place outside of the folio lock, and we can't trust 2919 * folio->mapping outside of the folio lock. 2920 */ 2921 ihold(inode); 2922 btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); 2923 folio_get(folio); 2924 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); 2925 fixup->folio = folio; 2926 fixup->inode = BTRFS_I(inode); 2927 btrfs_queue_work(fs_info->fixup_workers, &fixup->work); 2928 2929 return -EAGAIN; 2930 } 2931 2932 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, 2933 struct btrfs_inode *inode, u64 file_pos, 2934 struct btrfs_file_extent_item *stack_fi, 2935 const bool update_inode_bytes, 2936 u64 qgroup_reserved) 2937 { 2938 struct btrfs_root *root = inode->root; 2939 const u64 sectorsize = root->fs_info->sectorsize; 2940 BTRFS_PATH_AUTO_FREE(path); 2941 struct extent_buffer *leaf; 2942 struct btrfs_key ins; 2943 u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); 2944 u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); 2945 u64 offset = btrfs_stack_file_extent_offset(stack_fi); 2946 u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); 2947 u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); 2948 struct btrfs_drop_extents_args drop_args = { 0 }; 2949 int ret; 2950 2951 path = btrfs_alloc_path(); 2952 if (!path) 2953 return -ENOMEM; 2954 2955 /* 2956 * we may be replacing one extent in the tree with another. 2957 * The new extent is pinned in the extent map, and we don't want 2958 * to drop it from the cache until it is completely in the btree. 2959 * 2960 * So, tell btrfs_drop_extents to leave this extent in the cache. 2961 * the caller is expected to unpin it and allow it to be merged 2962 * with the others. 2963 */ 2964 drop_args.path = path; 2965 drop_args.start = file_pos; 2966 drop_args.end = file_pos + num_bytes; 2967 drop_args.replace_extent = true; 2968 drop_args.extent_item_size = sizeof(*stack_fi); 2969 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 2970 if (ret) 2971 goto out; 2972 2973 if (!drop_args.extent_inserted) { 2974 ins.objectid = btrfs_ino(inode); 2975 ins.type = BTRFS_EXTENT_DATA_KEY; 2976 ins.offset = file_pos; 2977 2978 ret = btrfs_insert_empty_item(trans, root, path, &ins, 2979 sizeof(*stack_fi)); 2980 if (ret) 2981 goto out; 2982 } 2983 leaf = path->nodes[0]; 2984 btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); 2985 write_extent_buffer(leaf, stack_fi, 2986 btrfs_item_ptr_offset(leaf, path->slots[0]), 2987 sizeof(struct btrfs_file_extent_item)); 2988 2989 btrfs_release_path(path); 2990 2991 /* 2992 * If we dropped an inline extent here, we know the range where it is 2993 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the 2994 * number of bytes only for that range containing the inline extent. 2995 * The remaining of the range will be processed when clearing the 2996 * EXTENT_DELALLOC_BIT bit through the ordered extent completion. 
2997 */ 2998 if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { 2999 u64 inline_size = round_down(drop_args.bytes_found, sectorsize); 3000 3001 inline_size = drop_args.bytes_found - inline_size; 3002 btrfs_update_inode_bytes(inode, sectorsize, inline_size); 3003 drop_args.bytes_found -= inline_size; 3004 num_bytes -= sectorsize; 3005 } 3006 3007 if (update_inode_bytes) 3008 btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); 3009 3010 ins.objectid = disk_bytenr; 3011 ins.type = BTRFS_EXTENT_ITEM_KEY; 3012 ins.offset = disk_num_bytes; 3013 3014 ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); 3015 if (ret) 3016 goto out; 3017 3018 ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), 3019 file_pos - offset, 3020 qgroup_reserved, &ins); 3021 out: 3022 return ret; 3023 } 3024 3025 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, 3026 u64 start, u64 len) 3027 { 3028 struct btrfs_block_group *cache; 3029 3030 cache = btrfs_lookup_block_group(fs_info, start); 3031 ASSERT(cache); 3032 3033 spin_lock(&cache->lock); 3034 cache->delalloc_bytes -= len; 3035 spin_unlock(&cache->lock); 3036 3037 btrfs_put_block_group(cache); 3038 } 3039 3040 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, 3041 struct btrfs_ordered_extent *oe) 3042 { 3043 struct btrfs_file_extent_item stack_fi; 3044 bool update_inode_bytes; 3045 u64 num_bytes = oe->num_bytes; 3046 u64 ram_bytes = oe->ram_bytes; 3047 3048 memset(&stack_fi, 0, sizeof(stack_fi)); 3049 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); 3050 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); 3051 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, 3052 oe->disk_num_bytes); 3053 btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); 3054 if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) 3055 num_bytes = oe->truncated_len; 3056 btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); 3057 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); 3058 btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); 3059 /* Encryption and other encoding is reserved and all 0 */ 3060 3061 /* 3062 * For delalloc, when completing an ordered extent we update the inode's 3063 * bytes when clearing the range in the inode's io tree, so pass false 3064 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(), 3065 * except if the ordered extent was truncated. 3066 */ 3067 update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || 3068 test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || 3069 test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); 3070 3071 return insert_reserved_file_extent(trans, oe->inode, 3072 oe->file_offset, &stack_fi, 3073 update_inode_bytes, oe->qgroup_rsv); 3074 } 3075 3076 /* 3077 * As ordered data IO finishes, this gets called so we can finish 3078 * an ordered extent if the range of bytes in the file it covers are 3079 * fully written. 
3080 */ 3081 int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) 3082 { 3083 struct btrfs_inode *inode = ordered_extent->inode; 3084 struct btrfs_root *root = inode->root; 3085 struct btrfs_fs_info *fs_info = root->fs_info; 3086 struct btrfs_trans_handle *trans = NULL; 3087 struct extent_io_tree *io_tree = &inode->io_tree; 3088 struct extent_state *cached_state = NULL; 3089 u64 start, end; 3090 int compress_type = 0; 3091 int ret = 0; 3092 u64 logical_len = ordered_extent->num_bytes; 3093 bool freespace_inode; 3094 bool truncated = false; 3095 bool clear_reserved_extent = true; 3096 unsigned int clear_bits = EXTENT_DEFRAG; 3097 3098 start = ordered_extent->file_offset; 3099 end = start + ordered_extent->num_bytes - 1; 3100 3101 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 3102 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && 3103 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) && 3104 !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) 3105 clear_bits |= EXTENT_DELALLOC_NEW; 3106 3107 freespace_inode = btrfs_is_free_space_inode(inode); 3108 if (!freespace_inode) 3109 btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent); 3110 3111 if (unlikely(test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags))) { 3112 ret = -EIO; 3113 goto out; 3114 } 3115 3116 ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, 3117 ordered_extent->disk_num_bytes); 3118 if (ret) 3119 goto out; 3120 3121 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { 3122 truncated = true; 3123 logical_len = ordered_extent->truncated_len; 3124 /* Truncated the entire extent, don't bother adding */ 3125 if (!logical_len) 3126 goto out; 3127 } 3128 3129 /* 3130 * If it's a COW write we need to lock the extent range as we will be 3131 * inserting/replacing file extent items and unpinning an extent map. 3132 * This must be taken before joining a transaction, as it's a higher 3133 * level lock (like the inode's VFS lock), otherwise we can run into an 3134 * ABBA deadlock with other tasks (transactions work like a lock, 3135 * depending on their current state). 
3136 */ 3137 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 3138 clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED; 3139 btrfs_lock_extent_bits(io_tree, start, end, 3140 EXTENT_LOCKED | EXTENT_FINISHING_ORDERED, 3141 &cached_state); 3142 } 3143 3144 if (freespace_inode) 3145 trans = btrfs_join_transaction_spacecache(root); 3146 else 3147 trans = btrfs_join_transaction(root); 3148 if (IS_ERR(trans)) { 3149 ret = PTR_ERR(trans); 3150 trans = NULL; 3151 goto out; 3152 } 3153 3154 trans->block_rsv = &inode->block_rsv; 3155 3156 ret = btrfs_insert_raid_extent(trans, ordered_extent); 3157 if (unlikely(ret)) { 3158 btrfs_abort_transaction(trans, ret); 3159 goto out; 3160 } 3161 3162 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 3163 /* Logic error */ 3164 ASSERT(list_empty(&ordered_extent->list)); 3165 if (unlikely(!list_empty(&ordered_extent->list))) { 3166 ret = -EINVAL; 3167 btrfs_abort_transaction(trans, ret); 3168 goto out; 3169 } 3170 3171 btrfs_inode_safe_disk_i_size_write(inode, 0); 3172 ret = btrfs_update_inode_fallback(trans, inode); 3173 if (unlikely(ret)) { 3174 /* -ENOMEM or corruption */ 3175 btrfs_abort_transaction(trans, ret); 3176 } 3177 goto out; 3178 } 3179 3180 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 3181 compress_type = ordered_extent->compress_type; 3182 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 3183 BUG_ON(compress_type); 3184 ret = btrfs_mark_extent_written(trans, inode, 3185 ordered_extent->file_offset, 3186 ordered_extent->file_offset + 3187 logical_len); 3188 btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, 3189 ordered_extent->disk_num_bytes); 3190 } else { 3191 BUG_ON(root == fs_info->tree_root); 3192 ret = insert_ordered_extent_file_extent(trans, ordered_extent); 3193 if (!ret) { 3194 clear_reserved_extent = false; 3195 btrfs_release_delalloc_bytes(fs_info, 3196 ordered_extent->disk_bytenr, 3197 ordered_extent->disk_num_bytes); 3198 } 3199 } 3200 if (unlikely(ret < 0)) { 3201 btrfs_abort_transaction(trans, ret); 3202 goto out; 3203 } 3204 3205 ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, 3206 ordered_extent->num_bytes, trans->transid); 3207 if (unlikely(ret < 0)) { 3208 btrfs_abort_transaction(trans, ret); 3209 goto out; 3210 } 3211 3212 ret = add_pending_csums(trans, &ordered_extent->list); 3213 if (unlikely(ret)) { 3214 btrfs_abort_transaction(trans, ret); 3215 goto out; 3216 } 3217 3218 /* 3219 * If this is a new delalloc range, clear its new delalloc flag to 3220 * update the inode's number of bytes. This needs to be done first 3221 * before updating the inode item. 
3222 */ 3223 if ((clear_bits & EXTENT_DELALLOC_NEW) && 3224 !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) 3225 btrfs_clear_extent_bit(&inode->io_tree, start, end, 3226 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, 3227 &cached_state); 3228 3229 btrfs_inode_safe_disk_i_size_write(inode, 0); 3230 ret = btrfs_update_inode_fallback(trans, inode); 3231 if (unlikely(ret)) { /* -ENOMEM or corruption */ 3232 btrfs_abort_transaction(trans, ret); 3233 goto out; 3234 } 3235 out: 3236 btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, 3237 &cached_state); 3238 3239 if (trans) 3240 btrfs_end_transaction(trans); 3241 3242 if (ret || truncated) { 3243 /* 3244 * If we failed to finish this ordered extent for any reason we 3245 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered 3246 * extent, and mark the inode with the error if it wasn't 3247 * already set. Any error during writeback would have already 3248 * set the mapping error, so we need to set it if we're the ones 3249 * marking this ordered extent as failed. 3250 */ 3251 if (ret) 3252 btrfs_mark_ordered_extent_error(ordered_extent); 3253 3254 /* 3255 * Drop extent maps for the part of the extent we didn't write. 3256 * 3257 * We have an exception here for the free_space_inode, this is 3258 * because when we do btrfs_get_extent() on the free space inode 3259 * we will search the commit root. If this is a new block group 3260 * we won't find anything, and we will trip over the assert in 3261 * writepage where we do ASSERT(em->block_start != 3262 * EXTENT_MAP_HOLE). 3263 * 3264 * Theoretically we could also skip this for any NOCOW extent as 3265 * we don't mess with the extent map tree in the NOCOW case, but 3266 * for now simply skip this if we are the free space inode. 3267 */ 3268 if (!btrfs_is_free_space_inode(inode)) { 3269 u64 unwritten_start = start; 3270 3271 if (truncated) 3272 unwritten_start += logical_len; 3273 3274 btrfs_drop_extent_map_range(inode, unwritten_start, 3275 end, false); 3276 } 3277 3278 /* 3279 * If the ordered extent had an IOERR or something else went 3280 * wrong we need to return the space for this ordered extent 3281 * back to the allocator. We only free the extent in the 3282 * truncated case if we didn't write out the extent at all. 3283 * 3284 * If we made it past insert_reserved_file_extent before we 3285 * errored out then we don't need to do this as the accounting 3286 * has already been done. 3287 */ 3288 if ((ret || !logical_len) && 3289 clear_reserved_extent && 3290 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 3291 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 3292 /* 3293 * Discard the range before returning it back to the 3294 * free space pool 3295 */ 3296 if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC)) 3297 btrfs_discard_extent(fs_info, 3298 ordered_extent->disk_bytenr, 3299 ordered_extent->disk_num_bytes, 3300 NULL); 3301 btrfs_free_reserved_extent(fs_info, 3302 ordered_extent->disk_bytenr, 3303 ordered_extent->disk_num_bytes, true); 3304 /* 3305 * Actually free the qgroup rsv which was released when 3306 * the ordered extent was created. 3307 */ 3308 btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root), 3309 ordered_extent->qgroup_rsv, 3310 BTRFS_QGROUP_RSV_DATA); 3311 } 3312 } 3313 3314 /* 3315 * This needs to be done to make sure anybody waiting knows we are done 3316 * updating everything for this ordered extent. 
3317 */ 3318 btrfs_remove_ordered_extent(inode, ordered_extent); 3319 3320 /* once for us */ 3321 btrfs_put_ordered_extent(ordered_extent); 3322 /* once for the tree */ 3323 btrfs_put_ordered_extent(ordered_extent); 3324 3325 return ret; 3326 } 3327 3328 int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) 3329 { 3330 if (btrfs_is_zoned(ordered->inode->root->fs_info) && 3331 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && 3332 list_empty(&ordered->bioc_list)) 3333 btrfs_finish_ordered_zoned(ordered); 3334 return btrfs_finish_one_ordered(ordered); 3335 } 3336 3337 void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, 3338 u8 *dest) 3339 { 3340 struct folio *folio = page_folio(phys_to_page(paddr)); 3341 const u32 blocksize = fs_info->sectorsize; 3342 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 3343 3344 shash->tfm = fs_info->csum_shash; 3345 /* The full block must be inside the folio. */ 3346 ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); 3347 3348 if (folio_test_partial_kmap(folio)) { 3349 size_t cur = paddr; 3350 3351 crypto_shash_init(shash); 3352 while (cur < paddr + blocksize) { 3353 void *kaddr; 3354 size_t len = min(paddr + blocksize - cur, 3355 PAGE_SIZE - offset_in_page(cur)); 3356 3357 kaddr = kmap_local_folio(folio, offset_in_folio(folio, cur)); 3358 crypto_shash_update(shash, kaddr, len); 3359 kunmap_local(kaddr); 3360 cur += len; 3361 } 3362 crypto_shash_final(shash, dest); 3363 } else { 3364 crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, dest); 3365 } 3366 } 3367 /* 3368 * Verify the checksum for a single sector without any extra action that depends 3369 * on the type of I/O. 3370 * 3371 * @paddr must point to a block that is fully contained within its folio. 3372 */ 3373 int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, 3374 const u8 * const csum_expected) 3375 { 3376 btrfs_calculate_block_csum(fs_info, paddr, csum); 3377 if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0)) 3378 return -EIO; 3379 return 0; 3380 } 3381 3382 /* 3383 * Verify the checksum of a single data sector. 3384 * 3385 * @bbio: btrfs_bio which contains the csum 3386 * @dev: device the sector is on 3387 * @bio_offset: offset to the beginning of the bio (in bytes) 3388 * @paddr: physical address of the block to check 3389 * 3390 * Check if the checksum on a data block is valid. When a checksum mismatch is 3391 * detected, report the error and fill the corrupted range with zero. 3392 * 3393 * Return %true if the sector is ok or had no checksum to start with, else %false.
3394 */ 3395 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, 3396 u32 bio_offset, phys_addr_t paddr) 3397 { 3398 struct btrfs_inode *inode = bbio->inode; 3399 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3400 const u32 blocksize = fs_info->sectorsize; 3401 struct folio *folio; 3402 u64 file_offset = bbio->file_offset + bio_offset; 3403 u64 end = file_offset + blocksize - 1; 3404 u8 *csum_expected; 3405 u8 csum[BTRFS_CSUM_SIZE]; 3406 3407 if (!bbio->csum) 3408 return true; 3409 3410 if (btrfs_is_data_reloc_root(inode->root) && 3411 btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, 3412 NULL)) { 3413 /* Skip the range without csum for data reloc inode */ 3414 btrfs_clear_extent_bit(&inode->io_tree, file_offset, end, 3415 EXTENT_NODATASUM, NULL); 3416 return true; 3417 } 3418 3419 csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * 3420 fs_info->csum_size; 3421 if (btrfs_check_block_csum(fs_info, paddr, csum, csum_expected)) 3422 goto zeroit; 3423 return true; 3424 3425 zeroit: 3426 btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, 3427 bbio->mirror_num); 3428 if (dev) 3429 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); 3430 folio = page_folio(phys_to_page(paddr)); 3431 ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); 3432 folio_zero_range(folio, offset_in_folio(folio, paddr), blocksize); 3433 return false; 3434 } 3435 3436 /* 3437 * Perform a delayed iput on @inode. 3438 * 3439 * @inode: The inode we want to perform iput on 3440 * 3441 * This function uses the generic vfs_inode::i_count to track whether we should 3442 * just decrement it (in case it's > 1) or if this is the last iput then link 3443 * the inode to the delayed iput machinery. Delayed iputs are processed at 3444 * transaction commit time/superblock commit/cleaner kthread. 3445 */ 3446 void btrfs_add_delayed_iput(struct btrfs_inode *inode) 3447 { 3448 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3449 unsigned long flags; 3450 3451 if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) 3452 return; 3453 3454 WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state)); 3455 atomic_inc(&fs_info->nr_delayed_iputs); 3456 /* 3457 * Need to be irq safe here because we can be called from either an irq 3458 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq 3459 * context. 
3460 */ 3461 spin_lock_irqsave(&fs_info->delayed_iput_lock, flags); 3462 ASSERT(list_empty(&inode->delayed_iput)); 3463 list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); 3464 spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags); 3465 if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) 3466 wake_up_process(fs_info->cleaner_kthread); 3467 } 3468 3469 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, 3470 struct btrfs_inode *inode) 3471 { 3472 list_del_init(&inode->delayed_iput); 3473 spin_unlock_irq(&fs_info->delayed_iput_lock); 3474 iput(&inode->vfs_inode); 3475 if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) 3476 wake_up(&fs_info->delayed_iputs_wait); 3477 spin_lock_irq(&fs_info->delayed_iput_lock); 3478 } 3479 3480 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, 3481 struct btrfs_inode *inode) 3482 { 3483 if (!list_empty(&inode->delayed_iput)) { 3484 spin_lock_irq(&fs_info->delayed_iput_lock); 3485 if (!list_empty(&inode->delayed_iput)) 3486 run_delayed_iput_locked(fs_info, inode); 3487 spin_unlock_irq(&fs_info->delayed_iput_lock); 3488 } 3489 } 3490 3491 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) 3492 { 3493 /* 3494 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which 3495 * calls btrfs_add_delayed_iput() and that needs to lock 3496 * fs_info->delayed_iput_lock. So we need to disable irqs here to 3497 * prevent a deadlock. 3498 */ 3499 spin_lock_irq(&fs_info->delayed_iput_lock); 3500 while (!list_empty(&fs_info->delayed_iputs)) { 3501 struct btrfs_inode *inode; 3502 3503 inode = list_first_entry(&fs_info->delayed_iputs, 3504 struct btrfs_inode, delayed_iput); 3505 run_delayed_iput_locked(fs_info, inode); 3506 if (need_resched()) { 3507 spin_unlock_irq(&fs_info->delayed_iput_lock); 3508 cond_resched(); 3509 spin_lock_irq(&fs_info->delayed_iput_lock); 3510 } 3511 } 3512 spin_unlock_irq(&fs_info->delayed_iput_lock); 3513 } 3514 3515 /* 3516 * Wait for flushing all delayed iputs 3517 * 3518 * @fs_info: the filesystem 3519 * 3520 * This will wait on any delayed iputs that are currently running with KILLABLE 3521 * set. Once they are all done running we will return, unless we are killed in 3522 * which case we return EINTR. This helps in user operations like fallocate etc 3523 * that might get blocked on the iputs. 3524 * 3525 * Return EINTR if we were killed, 0 if nothing's pending 3526 */ 3527 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) 3528 { 3529 int ret = wait_event_killable(fs_info->delayed_iputs_wait, 3530 atomic_read(&fs_info->nr_delayed_iputs) == 0); 3531 if (ret) 3532 return -EINTR; 3533 return 0; 3534 } 3535 3536 /* 3537 * This creates an orphan entry for the given inode in case something goes wrong 3538 * in the middle of an unlink. 3539 */ 3540 int btrfs_orphan_add(struct btrfs_trans_handle *trans, 3541 struct btrfs_inode *inode) 3542 { 3543 int ret; 3544 3545 ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); 3546 if (unlikely(ret && ret != -EEXIST)) { 3547 btrfs_abort_transaction(trans, ret); 3548 return ret; 3549 } 3550 3551 return 0; 3552 } 3553 3554 /* 3555 * We have done the delete so we can go ahead and remove the orphan item for 3556 * this particular inode. 
3557 */ 3558 static int btrfs_orphan_del(struct btrfs_trans_handle *trans, 3559 struct btrfs_inode *inode) 3560 { 3561 return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode)); 3562 } 3563 3564 /* 3565 * this cleans up any orphans that may be left on the list from the last use 3566 * of this root. 3567 */ 3568 int btrfs_orphan_cleanup(struct btrfs_root *root) 3569 { 3570 struct btrfs_fs_info *fs_info = root->fs_info; 3571 BTRFS_PATH_AUTO_FREE(path); 3572 struct extent_buffer *leaf; 3573 struct btrfs_key key, found_key; 3574 struct btrfs_trans_handle *trans; 3575 u64 last_objectid = 0; 3576 int ret = 0, nr_unlink = 0; 3577 3578 if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) 3579 return 0; 3580 3581 path = btrfs_alloc_path(); 3582 if (!path) { 3583 ret = -ENOMEM; 3584 goto out; 3585 } 3586 path->reada = READA_BACK; 3587 3588 key.objectid = BTRFS_ORPHAN_OBJECTID; 3589 key.type = BTRFS_ORPHAN_ITEM_KEY; 3590 key.offset = (u64)-1; 3591 3592 while (1) { 3593 struct btrfs_inode *inode; 3594 3595 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3596 if (ret < 0) 3597 goto out; 3598 3599 /* 3600 * if ret == 0 means we found what we were searching for, which 3601 * is weird, but possible, so only screw with path if we didn't 3602 * find the key and see if we have stuff that matches 3603 */ 3604 if (ret > 0) { 3605 ret = 0; 3606 if (path->slots[0] == 0) 3607 break; 3608 path->slots[0]--; 3609 } 3610 3611 /* pull out the item */ 3612 leaf = path->nodes[0]; 3613 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3614 3615 /* make sure the item matches what we want */ 3616 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 3617 break; 3618 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) 3619 break; 3620 3621 /* release the path since we're done with it */ 3622 btrfs_release_path(path); 3623 3624 /* 3625 * this is where we are basically btrfs_lookup, without the 3626 * crossing root thing. we store the inode number in the 3627 * offset of the orphan item. 3628 */ 3629 3630 if (found_key.offset == last_objectid) { 3631 /* 3632 * We found the same inode as before. This means we were 3633 * not able to remove its items via eviction triggered 3634 * by an iput(). A transaction abort may have happened, 3635 * due to -ENOSPC for example, so try to grab the error 3636 * that lead to a transaction abort, if any. 3637 */ 3638 btrfs_err(fs_info, 3639 "Error removing orphan entry, stopping orphan cleanup"); 3640 ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL; 3641 goto out; 3642 } 3643 3644 last_objectid = found_key.offset; 3645 3646 found_key.objectid = found_key.offset; 3647 found_key.type = BTRFS_INODE_ITEM_KEY; 3648 found_key.offset = 0; 3649 inode = btrfs_iget(last_objectid, root); 3650 if (IS_ERR(inode)) { 3651 ret = PTR_ERR(inode); 3652 inode = NULL; 3653 if (ret != -ENOENT) 3654 goto out; 3655 } 3656 3657 if (!inode && root == fs_info->tree_root) { 3658 struct btrfs_root *dead_root; 3659 int is_dead_root = 0; 3660 3661 /* 3662 * This is an orphan in the tree root. Currently these 3663 * could come from 2 sources: 3664 * a) a root (snapshot/subvolume) deletion in progress 3665 * b) a free space cache inode 3666 * We need to distinguish those two, as the orphan item 3667 * for a root must not get deleted before the deletion 3668 * of the snapshot/subvolume's tree completes. 3669 * 3670 * btrfs_find_orphan_roots() ran before us, which has 3671 * found all deleted roots and loaded them into 3672 * fs_info->fs_roots_radix. 
So here we can find if an 3673 * orphan item corresponds to a deleted root by looking 3674 * up the root from that radix tree. 3675 */ 3676 3677 spin_lock(&fs_info->fs_roots_radix_lock); 3678 dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, 3679 (unsigned long)found_key.objectid); 3680 if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) 3681 is_dead_root = 1; 3682 spin_unlock(&fs_info->fs_roots_radix_lock); 3683 3684 if (is_dead_root) { 3685 /* prevent this orphan from being found again */ 3686 key.offset = found_key.objectid - 1; 3687 continue; 3688 } 3689 3690 } 3691 3692 /* 3693 * If we have an inode with links, there are a couple of 3694 * possibilities: 3695 * 3696 * 1. We were halfway through creating fsverity metadata for the 3697 * file. In that case, the orphan item represents incomplete 3698 * fsverity metadata which must be cleaned up with 3699 * btrfs_drop_verity_items and deleting the orphan item. 3700 3701 * 2. Old kernels (before v3.12) used to create an 3702 * orphan item for truncate indicating that there were possibly 3703 * extent items past i_size that needed to be deleted. In v3.12, 3704 * truncate was changed to update i_size in sync with the extent 3705 * items, but the (useless) orphan item was still created. Since 3706 * v4.18, we don't create the orphan item for truncate at all. 3707 * 3708 * So, this item could mean that we need to do a truncate, but 3709 * only if this filesystem was last used on a pre-v3.12 kernel 3710 * and was not cleanly unmounted. The odds of that are quite 3711 * slim, and it's a pain to do the truncate now, so just delete 3712 * the orphan item. 3713 * 3714 * It's also possible that this orphan item was supposed to be 3715 * deleted but wasn't. The inode number may have been reused, 3716 * but either way, we can delete the orphan item. 3717 */ 3718 if (!inode || inode->vfs_inode.i_nlink) { 3719 if (inode) { 3720 ret = btrfs_drop_verity_items(inode); 3721 iput(&inode->vfs_inode); 3722 inode = NULL; 3723 if (ret) 3724 goto out; 3725 } 3726 trans = btrfs_start_transaction(root, 1); 3727 if (IS_ERR(trans)) { 3728 ret = PTR_ERR(trans); 3729 goto out; 3730 } 3731 btrfs_debug(fs_info, "auto deleting %Lu", 3732 found_key.objectid); 3733 ret = btrfs_del_orphan_item(trans, root, 3734 found_key.objectid); 3735 btrfs_end_transaction(trans); 3736 if (ret) 3737 goto out; 3738 continue; 3739 } 3740 3741 nr_unlink++; 3742 3743 /* this will do delete_inode and everything for us */ 3744 iput(&inode->vfs_inode); 3745 } 3746 /* release the path since we're done with it */ 3747 btrfs_release_path(path); 3748 3749 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { 3750 trans = btrfs_join_transaction(root); 3751 if (!IS_ERR(trans)) 3752 btrfs_end_transaction(trans); 3753 } 3754 3755 if (nr_unlink) 3756 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink); 3757 3758 out: 3759 if (ret) 3760 btrfs_err(fs_info, "could not do orphan cleanup %d", ret); 3761 return ret; 3762 } 3763 3764 /* 3765 * Look ahead in the leaf for xattrs. If we don't find any then we know there 3766 * can't be any ACLs. 3767 * 3768 * @leaf: the eb leaf where to search 3769 * @slot: the slot the inode is in 3770 * @objectid: the objectid of the inode 3771 * 3772 * Return true if there is xattr/ACL, false otherwise. 
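 * Note that the check is conservative: if the scan gives up early or reaches the end of the leaf without a definitive answer, it returns true and assumes ACLs may exist.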
3773 */ 3774 static noinline bool acls_after_inode_item(struct extent_buffer *leaf, 3775 int slot, u64 objectid, 3776 int *first_xattr_slot) 3777 { 3778 u32 nritems = btrfs_header_nritems(leaf); 3779 struct btrfs_key found_key; 3780 static u64 xattr_access = 0; 3781 static u64 xattr_default = 0; 3782 int scanned = 0; 3783 3784 if (!xattr_access) { 3785 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, 3786 strlen(XATTR_NAME_POSIX_ACL_ACCESS)); 3787 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, 3788 strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); 3789 } 3790 3791 slot++; 3792 *first_xattr_slot = -1; 3793 while (slot < nritems) { 3794 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3795 3796 /* We found a different objectid, there must be no ACLs. */ 3797 if (found_key.objectid != objectid) 3798 return false; 3799 3800 /* We found an xattr, assume we've got an ACL. */ 3801 if (found_key.type == BTRFS_XATTR_ITEM_KEY) { 3802 if (*first_xattr_slot == -1) 3803 *first_xattr_slot = slot; 3804 if (found_key.offset == xattr_access || 3805 found_key.offset == xattr_default) 3806 return true; 3807 } 3808 3809 /* 3810 * We found a key greater than an xattr key, there can't be any 3811 * ACLs later on. 3812 */ 3813 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 3814 return false; 3815 3816 slot++; 3817 scanned++; 3818 3819 /* 3820 * The item order goes like: 3821 * - inode 3822 * - inode backrefs 3823 * - xattrs 3824 * - extents, 3825 * 3826 * so if there are lots of hard links to an inode there can be 3827 * a lot of backrefs. Don't waste time searching too hard, 3828 * this is just an optimization. 3829 */ 3830 if (scanned >= 8) 3831 break; 3832 } 3833 /* 3834 * We hit the end of the leaf before we found an xattr or something 3835 * larger than an xattr. We have to assume the inode has ACLs. 3836 */ 3837 if (*first_xattr_slot == -1) 3838 *first_xattr_slot = slot; 3839 return true; 3840 } 3841 3842 static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) 3843 { 3844 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3845 3846 if (WARN_ON_ONCE(inode->file_extent_tree)) 3847 return 0; 3848 if (btrfs_fs_incompat(fs_info, NO_HOLES)) 3849 return 0; 3850 if (!S_ISREG(inode->vfs_inode.i_mode)) 3851 return 0; 3852 if (btrfs_is_free_space_inode(inode)) 3853 return 0; 3854 3855 inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); 3856 if (!inode->file_extent_tree) 3857 return -ENOMEM; 3858 3859 btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree, 3860 IO_TREE_INODE_FILE_EXTENT); 3861 /* Lockdep class is set only for the file extent tree. 
*/ 3862 lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class); 3863 3864 return 0; 3865 } 3866 3867 static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) 3868 { 3869 struct btrfs_root *root = inode->root; 3870 struct btrfs_inode *existing; 3871 const u64 ino = btrfs_ino(inode); 3872 int ret; 3873 3874 if (inode_unhashed(&inode->vfs_inode)) 3875 return 0; 3876 3877 if (prealloc) { 3878 ret = xa_reserve(&root->inodes, ino, GFP_NOFS); 3879 if (ret) 3880 return ret; 3881 } 3882 3883 existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); 3884 3885 if (xa_is_err(existing)) { 3886 ret = xa_err(existing); 3887 ASSERT(ret != -EINVAL); 3888 ASSERT(ret != -ENOMEM); 3889 return ret; 3890 } else if (existing) { 3891 WARN_ON(!(inode_state_read_once(&existing->vfs_inode) & (I_WILL_FREE | I_FREEING))); 3892 } 3893 3894 return 0; 3895 } 3896 3897 /* 3898 * Read a locked inode from the btree into the in-memory inode and add it to 3899 * its root list/tree. 3900 * 3901 * On failure clean up the inode. 3902 */ 3903 static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path) 3904 { 3905 struct btrfs_root *root = inode->root; 3906 struct btrfs_fs_info *fs_info = root->fs_info; 3907 struct extent_buffer *leaf; 3908 struct btrfs_inode_item *inode_item; 3909 struct inode *vfs_inode = &inode->vfs_inode; 3910 struct btrfs_key location; 3911 unsigned long ptr; 3912 int maybe_acls; 3913 u32 rdev; 3914 int ret; 3915 bool filled = false; 3916 int first_xattr_slot; 3917 3918 ret = btrfs_fill_inode(inode, &rdev); 3919 if (!ret) 3920 filled = true; 3921 3922 ASSERT(path); 3923 3924 btrfs_get_inode_key(inode, &location); 3925 3926 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 3927 if (ret) { 3928 /* 3929 * ret > 0 can come from btrfs_search_slot called by 3930 * btrfs_lookup_inode(), this means the inode was not found. 
3931 */ 3932 if (ret > 0) 3933 ret = -ENOENT; 3934 goto out; 3935 } 3936 3937 leaf = path->nodes[0]; 3938 3939 if (filled) 3940 goto cache_index; 3941 3942 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3943 struct btrfs_inode_item); 3944 vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item); 3945 set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item)); 3946 i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); 3947 i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); 3948 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 3949 3950 inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), 3951 btrfs_timespec_nsec(leaf, &inode_item->atime)); 3952 3953 inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime), 3954 btrfs_timespec_nsec(leaf, &inode_item->mtime)); 3955 3956 inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime), 3957 btrfs_timespec_nsec(leaf, &inode_item->ctime)); 3958 3959 inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); 3960 inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); 3961 3962 inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item)); 3963 inode->generation = btrfs_inode_generation(leaf, inode_item); 3964 inode->last_trans = btrfs_inode_transid(leaf, inode_item); 3965 3966 inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item)); 3967 vfs_inode->i_generation = inode->generation; 3968 vfs_inode->i_rdev = 0; 3969 rdev = btrfs_inode_rdev(leaf, inode_item); 3970 3971 if (S_ISDIR(vfs_inode->i_mode)) 3972 inode->index_cnt = (u64)-1; 3973 3974 btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), 3975 &inode->flags, &inode->ro_flags); 3976 btrfs_update_inode_mapping_flags(inode); 3977 btrfs_set_inode_mapping_order(inode); 3978 3979 cache_index: 3980 ret = btrfs_init_file_extent_tree(inode); 3981 if (ret) 3982 goto out; 3983 btrfs_inode_set_file_extent_range(inode, 0, 3984 round_up(i_size_read(vfs_inode), fs_info->sectorsize)); 3985 /* 3986 * If we were modified in the current generation and evicted from memory 3987 * and then re-read we need to do a full sync since we don't have any 3988 * idea about which extents were modified before we were evicted from 3989 * cache. 3990 * 3991 * This is required for both inode re-read from disk and delayed inode 3992 * in the delayed_nodes xarray. 3993 */ 3994 if (inode->last_trans == btrfs_get_fs_generation(fs_info)) 3995 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); 3996 3997 /* 3998 * We don't persist the id of the transaction where an unlink operation 3999 * against the inode was last made. So here we assume the inode might 4000 * have been evicted, and therefore the exact value of last_unlink_trans 4001 * lost, and set it to last_trans to avoid metadata inconsistencies 4002 * between the inode and its parent if the inode is fsync'ed and the log 4003 * replayed. For example, in the scenario: 4004 * 4005 * touch mydir/foo 4006 * ln mydir/foo mydir/bar 4007 * sync 4008 * unlink mydir/bar 4009 * echo 2 > /proc/sys/vm/drop_caches # evicts inode 4010 * xfs_io -c fsync mydir/foo 4011 * <power failure> 4012 * mount fs, triggers fsync log replay 4013 * 4014 * We must make sure that when we fsync our inode foo we also log its 4015 * parent inode, otherwise after log replay the parent still has the 4016 * dentry with the "bar" name but our inode foo has a link count of 1 4017 * and doesn't have an inode ref with the name "bar" anymore. 
4018 * 4019 * Setting last_unlink_trans to last_trans is a pessimistic approach, 4020 * but it guarantees correctness at the expense of occasional full 4021 * transaction commits on fsync if our inode is a directory, or if our 4022 * inode is not a directory, logging its parent unnecessarily. 4023 */ 4024 inode->last_unlink_trans = inode->last_trans; 4025 4026 /* 4027 * Same logic as for last_unlink_trans. We don't persist the generation 4028 * of the last transaction where this inode was used for a reflink 4029 * operation, so after eviction and reloading the inode we must be 4030 * pessimistic and assume the last transaction that modified the inode. 4031 */ 4032 inode->last_reflink_trans = inode->last_trans; 4033 4034 path->slots[0]++; 4035 if (vfs_inode->i_nlink != 1 || 4036 path->slots[0] >= btrfs_header_nritems(leaf)) 4037 goto cache_acl; 4038 4039 btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); 4040 if (location.objectid != btrfs_ino(inode)) 4041 goto cache_acl; 4042 4043 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 4044 if (location.type == BTRFS_INODE_REF_KEY) { 4045 struct btrfs_inode_ref *ref; 4046 4047 ref = (struct btrfs_inode_ref *)ptr; 4048 inode->dir_index = btrfs_inode_ref_index(leaf, ref); 4049 } else if (location.type == BTRFS_INODE_EXTREF_KEY) { 4050 struct btrfs_inode_extref *extref; 4051 4052 extref = (struct btrfs_inode_extref *)ptr; 4053 inode->dir_index = btrfs_inode_extref_index(leaf, extref); 4054 } 4055 cache_acl: 4056 /* 4057 * try to precache a NULL acl entry for files that don't have 4058 * any xattrs or acls 4059 */ 4060 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 4061 btrfs_ino(inode), &first_xattr_slot); 4062 if (first_xattr_slot != -1) { 4063 path->slots[0] = first_xattr_slot; 4064 ret = btrfs_load_inode_props(inode, path); 4065 if (ret) 4066 btrfs_err(fs_info, 4067 "error loading props for ino %llu (root %llu): %d", 4068 btrfs_ino(inode), btrfs_root_id(root), ret); 4069 } 4070 4071 if (!maybe_acls) 4072 cache_no_acl(vfs_inode); 4073 4074 switch (vfs_inode->i_mode & S_IFMT) { 4075 case S_IFREG: 4076 vfs_inode->i_mapping->a_ops = &btrfs_aops; 4077 vfs_inode->i_fop = &btrfs_file_operations; 4078 vfs_inode->i_op = &btrfs_file_inode_operations; 4079 break; 4080 case S_IFDIR: 4081 vfs_inode->i_fop = &btrfs_dir_file_operations; 4082 vfs_inode->i_op = &btrfs_dir_inode_operations; 4083 break; 4084 case S_IFLNK: 4085 vfs_inode->i_op = &btrfs_symlink_inode_operations; 4086 inode_nohighmem(vfs_inode); 4087 vfs_inode->i_mapping->a_ops = &btrfs_aops; 4088 break; 4089 default: 4090 vfs_inode->i_op = &btrfs_special_inode_operations; 4091 init_special_inode(vfs_inode, vfs_inode->i_mode, rdev); 4092 break; 4093 } 4094 4095 btrfs_sync_inode_flags_to_i_flags(inode); 4096 4097 ret = btrfs_add_inode_to_root(inode, true); 4098 if (ret) 4099 goto out; 4100 4101 return 0; 4102 out: 4103 iget_failed(vfs_inode); 4104 return ret; 4105 } 4106 4107 /* 4108 * given a leaf and an inode, copy the inode fields into the leaf 4109 */ 4110 static void fill_inode_item(struct btrfs_trans_handle *trans, 4111 struct extent_buffer *leaf, 4112 struct btrfs_inode_item *item, 4113 struct inode *inode) 4114 { 4115 u64 flags; 4116 4117 btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 4118 btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); 4119 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 4120 btrfs_set_inode_mode(leaf, item, inode->i_mode); 4121 btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 4122 4123 btrfs_set_timespec_sec(leaf, &item->atime, 
inode_get_atime_sec(inode)); 4124 btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode)); 4125 4126 btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode)); 4127 btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode)); 4128 4129 btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode)); 4130 btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode)); 4131 4132 btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec); 4133 btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec); 4134 4135 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 4136 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 4137 btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode)); 4138 btrfs_set_inode_transid(leaf, item, trans->transid); 4139 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 4140 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 4141 BTRFS_I(inode)->ro_flags); 4142 btrfs_set_inode_flags(leaf, item, flags); 4143 btrfs_set_inode_block_group(leaf, item, 0); 4144 } 4145 4146 /* 4147 * copy everything in the in-memory inode into the btree. 4148 */ 4149 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, 4150 struct btrfs_inode *inode) 4151 { 4152 struct btrfs_inode_item *inode_item; 4153 BTRFS_PATH_AUTO_FREE(path); 4154 struct extent_buffer *leaf; 4155 struct btrfs_key key; 4156 int ret; 4157 4158 path = btrfs_alloc_path(); 4159 if (!path) 4160 return -ENOMEM; 4161 4162 btrfs_get_inode_key(inode, &key); 4163 ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1); 4164 if (ret) { 4165 if (ret > 0) 4166 ret = -ENOENT; 4167 return ret; 4168 } 4169 4170 leaf = path->nodes[0]; 4171 inode_item = btrfs_item_ptr(leaf, path->slots[0], 4172 struct btrfs_inode_item); 4173 4174 fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); 4175 btrfs_set_inode_last_trans(trans, inode); 4176 return 0; 4177 } 4178 4179 /* 4180 * copy everything in the in-memory inode into the btree. 4181 */ 4182 int btrfs_update_inode(struct btrfs_trans_handle *trans, 4183 struct btrfs_inode *inode) 4184 { 4185 struct btrfs_root *root = inode->root; 4186 struct btrfs_fs_info *fs_info = root->fs_info; 4187 int ret; 4188 4189 /* 4190 * If the inode is a free space inode, we can deadlock during commit 4191 * if we put it into the delayed code. 
4192 * 4193 * The data relocation inode should also be directly updated 4194 * without delay. 4195 */ 4196 if (!btrfs_is_free_space_inode(inode) 4197 && !btrfs_is_data_reloc_root(root) 4198 && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { 4199 btrfs_update_root_times(trans, root); 4200 4201 ret = btrfs_delayed_update_inode(trans, inode); 4202 if (!ret) 4203 btrfs_set_inode_last_trans(trans, inode); 4204 return ret; 4205 } 4206 4207 return btrfs_update_inode_item(trans, inode); 4208 } 4209 4210 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 4211 struct btrfs_inode *inode) 4212 { 4213 int ret; 4214 4215 ret = btrfs_update_inode(trans, inode); 4216 if (ret == -ENOSPC) 4217 return btrfs_update_inode_item(trans, inode); 4218 return ret; 4219 } 4220 4221 static void update_time_after_link_or_unlink(struct btrfs_inode *dir) 4222 { 4223 struct timespec64 now; 4224 4225 /* 4226 * If we are replaying a log tree, we do not want to update the mtime 4227 * and ctime of the parent directory with the current time, since the 4228 * log replay procedure is responsible for setting them to their correct 4229 * values (the ones it had when the fsync was done). 4230 */ 4231 if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags)) 4232 return; 4233 4234 now = inode_set_ctime_current(&dir->vfs_inode); 4235 inode_set_mtime_to_ts(&dir->vfs_inode, now); 4236 } 4237 4238 /* 4239 * unlink helper that gets used here in inode.c and in the tree logging 4240 * recovery code. It removes a link in a directory with a given name, and 4241 * also drops the back refs in the inode to the directory. 4242 */ 4243 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 4244 struct btrfs_inode *dir, 4245 struct btrfs_inode *inode, 4246 const struct fscrypt_str *name, 4247 struct btrfs_rename_ctx *rename_ctx) 4248 { 4249 struct btrfs_root *root = dir->root; 4250 struct btrfs_fs_info *fs_info = root->fs_info; 4251 struct btrfs_path *path; 4252 int ret = 0; 4253 struct btrfs_dir_item *di; 4254 u64 index; 4255 u64 ino = btrfs_ino(inode); 4256 u64 dir_ino = btrfs_ino(dir); 4257 4258 path = btrfs_alloc_path(); 4259 if (!path) 4260 return -ENOMEM; 4261 4262 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); 4263 if (IS_ERR_OR_NULL(di)) { 4264 btrfs_free_path(path); 4265 return di ? PTR_ERR(di) : -ENOENT; 4266 } 4267 ret = btrfs_delete_one_dir_name(trans, root, path, di); 4268 /* 4269 * Down the call chains below we'll also need to allocate a path, so no 4270 * need to hold on to this one for longer than necessary. 4271 */ 4272 btrfs_free_path(path); 4273 if (ret) 4274 return ret; 4275 4276 /* 4277 * If we don't have the dir index, we have to get it by looking up 4278 * the inode ref; and since we look up the inode ref anyway, we remove it 4279 * directly instead of scheduling a delayed deletion. 4280 * 4281 * But if we do have the dir index, there is no need to search for the 4282 * inode ref. Since the inode ref is close to the inode item, it is 4283 * better to delay its deletion and do it when 4284 * we update the inode item.
4285 */ 4286 if (inode->dir_index) { 4287 ret = btrfs_delayed_delete_inode_ref(inode); 4288 if (!ret) { 4289 index = inode->dir_index; 4290 goto skip_backref; 4291 } 4292 } 4293 4294 ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); 4295 if (unlikely(ret)) { 4296 btrfs_crit(fs_info, 4297 "failed to delete reference to %.*s, root %llu inode %llu parent %llu", 4298 name->len, name->name, btrfs_root_id(root), ino, dir_ino); 4299 btrfs_abort_transaction(trans, ret); 4300 return ret; 4301 } 4302 skip_backref: 4303 if (rename_ctx) 4304 rename_ctx->index = index; 4305 4306 ret = btrfs_delete_delayed_dir_index(trans, dir, index); 4307 if (unlikely(ret)) { 4308 btrfs_abort_transaction(trans, ret); 4309 return ret; 4310 } 4311 4312 /* 4313 * If we are in a rename context, we don't need to update anything in the 4314 * log. That will be done later during the rename by btrfs_log_new_name(). 4315 * Besides that, doing it here would only cause extra unnecessary btree 4316 * operations on the log tree, increasing latency for applications. 4317 */ 4318 if (!rename_ctx) { 4319 btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino); 4320 btrfs_del_dir_entries_in_log(trans, root, name, dir, index); 4321 } 4322 4323 /* 4324 * If we have a pending delayed iput we could end up with the final iput 4325 * being run in btrfs-cleaner context. If we have enough of these built 4326 * up we can end up burning a lot of time in btrfs-cleaner without any 4327 * way to throttle the unlinks. Since we're currently holding a ref on 4328 * the inode we can run the delayed iput here without any issues as the 4329 * final iput won't be done until after we drop the ref we're currently 4330 * holding. 4331 */ 4332 btrfs_run_delayed_iput(fs_info, inode); 4333 4334 btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); 4335 inode_inc_iversion(&inode->vfs_inode); 4336 inode_set_ctime_current(&inode->vfs_inode); 4337 inode_inc_iversion(&dir->vfs_inode); 4338 update_time_after_link_or_unlink(dir); 4339 4340 return btrfs_update_inode(trans, dir); 4341 } 4342 4343 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 4344 struct btrfs_inode *dir, struct btrfs_inode *inode, 4345 const struct fscrypt_str *name) 4346 { 4347 int ret; 4348 4349 ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); 4350 if (!ret) { 4351 drop_nlink(&inode->vfs_inode); 4352 ret = btrfs_update_inode(trans, inode); 4353 } 4354 return ret; 4355 } 4356 4357 /* 4358 * helper to start transaction for unlink and rmdir. 4359 * 4360 * unlink and rmdir are special in btrfs, they do not always free space, so 4361 * if we cannot make our reservations the normal way try and see if there is 4362 * plenty of slack room in the global reserve to migrate, otherwise we cannot 4363 * allow the unlink to occur. 
4364 */ 4365 static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir) 4366 { 4367 struct btrfs_root *root = dir->root; 4368 4369 return btrfs_start_transaction_fallback_global_rsv(root, 4370 BTRFS_UNLINK_METADATA_UNITS); 4371 } 4372 4373 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 4374 { 4375 struct btrfs_trans_handle *trans; 4376 struct inode *inode = d_inode(dentry); 4377 int ret; 4378 struct fscrypt_name fname; 4379 4380 ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); 4381 if (ret) 4382 return ret; 4383 4384 /* This needs to handle no-key deletions later on */ 4385 4386 trans = __unlink_start_trans(BTRFS_I(dir)); 4387 if (IS_ERR(trans)) { 4388 ret = PTR_ERR(trans); 4389 goto fscrypt_free; 4390 } 4391 4392 btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 4393 false); 4394 4395 ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 4396 &fname.disk_name); 4397 if (ret) 4398 goto end_trans; 4399 4400 if (inode->i_nlink == 0) { 4401 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 4402 if (ret) 4403 goto end_trans; 4404 } 4405 4406 end_trans: 4407 btrfs_end_transaction(trans); 4408 btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); 4409 fscrypt_free: 4410 fscrypt_free_filename(&fname); 4411 return ret; 4412 } 4413 4414 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 4415 struct btrfs_inode *dir, struct dentry *dentry) 4416 { 4417 struct btrfs_root *root = dir->root; 4418 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); 4419 struct btrfs_path *path; 4420 struct extent_buffer *leaf; 4421 struct btrfs_dir_item *di; 4422 struct btrfs_key key; 4423 u64 index; 4424 int ret; 4425 u64 objectid; 4426 u64 dir_ino = btrfs_ino(dir); 4427 struct fscrypt_name fname; 4428 4429 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); 4430 if (ret) 4431 return ret; 4432 4433 /* This needs to handle no-key deletions later on */ 4434 4435 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { 4436 objectid = btrfs_root_id(inode->root); 4437 } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { 4438 objectid = inode->ref_root_id; 4439 } else { 4440 WARN_ON(1); 4441 fscrypt_free_filename(&fname); 4442 return -EINVAL; 4443 } 4444 4445 path = btrfs_alloc_path(); 4446 if (!path) { 4447 ret = -ENOMEM; 4448 goto out; 4449 } 4450 4451 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 4452 &fname.disk_name, -1); 4453 if (IS_ERR_OR_NULL(di)) { 4454 ret = di ? PTR_ERR(di) : -ENOENT; 4455 goto out; 4456 } 4457 4458 leaf = path->nodes[0]; 4459 btrfs_dir_item_key_to_cpu(leaf, di, &key); 4460 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 4461 ret = btrfs_delete_one_dir_name(trans, root, path, di); 4462 if (unlikely(ret)) { 4463 btrfs_abort_transaction(trans, ret); 4464 goto out; 4465 } 4466 btrfs_release_path(path); 4467 4468 /* 4469 * This is a placeholder inode for a subvolume we didn't have a 4470 * reference to at the time of the snapshot creation. In the meantime 4471 * we could have renamed the real subvol link into our snapshot, so 4472 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. 4473 * Instead simply lookup the dir_index_item for this entry so we can 4474 * remove it. Otherwise we know we have a ref to the root and we can 4475 * call btrfs_del_root_ref, and it _shouldn't_ fail. 
4476 */ 4477 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { 4478 di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); 4479 if (IS_ERR(di)) { 4480 ret = PTR_ERR(di); 4481 btrfs_abort_transaction(trans, ret); 4482 goto out; 4483 } 4484 4485 leaf = path->nodes[0]; 4486 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4487 index = key.offset; 4488 btrfs_release_path(path); 4489 } else { 4490 ret = btrfs_del_root_ref(trans, objectid, 4491 btrfs_root_id(root), dir_ino, 4492 &index, &fname.disk_name); 4493 if (unlikely(ret)) { 4494 btrfs_abort_transaction(trans, ret); 4495 goto out; 4496 } 4497 } 4498 4499 ret = btrfs_delete_delayed_dir_index(trans, dir, index); 4500 if (unlikely(ret)) { 4501 btrfs_abort_transaction(trans, ret); 4502 goto out; 4503 } 4504 4505 btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); 4506 inode_inc_iversion(&dir->vfs_inode); 4507 inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); 4508 ret = btrfs_update_inode_fallback(trans, dir); 4509 if (ret) 4510 btrfs_abort_transaction(trans, ret); 4511 out: 4512 btrfs_free_path(path); 4513 fscrypt_free_filename(&fname); 4514 return ret; 4515 } 4516 4517 /* 4518 * Helper to check if the subvolume references other subvolumes or if it's 4519 * default. 4520 */ 4521 static noinline int may_destroy_subvol(struct btrfs_root *root) 4522 { 4523 struct btrfs_fs_info *fs_info = root->fs_info; 4524 BTRFS_PATH_AUTO_FREE(path); 4525 struct btrfs_dir_item *di; 4526 struct btrfs_key key; 4527 struct fscrypt_str name = FSTR_INIT("default", 7); 4528 u64 dir_id; 4529 int ret; 4530 4531 path = btrfs_alloc_path(); 4532 if (!path) 4533 return -ENOMEM; 4534 4535 /* Make sure this root isn't set as the default subvol */ 4536 dir_id = btrfs_super_root_dir(fs_info->super_copy); 4537 di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, 4538 dir_id, &name, 0); 4539 if (di && !IS_ERR(di)) { 4540 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 4541 if (key.objectid == btrfs_root_id(root)) { 4542 ret = -EPERM; 4543 btrfs_err(fs_info, 4544 "deleting default subvolume %llu is not allowed", 4545 key.objectid); 4546 return ret; 4547 } 4548 btrfs_release_path(path); 4549 } 4550 4551 key.objectid = btrfs_root_id(root); 4552 key.type = BTRFS_ROOT_REF_KEY; 4553 key.offset = (u64)-1; 4554 4555 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4556 if (ret < 0) 4557 return ret; 4558 if (unlikely(ret == 0)) { 4559 /* 4560 * Key with offset -1 found, there would have to exist a root 4561 * with such id, but this is out of valid range. 
4562 */ 4563 return -EUCLEAN; 4564 } 4565 4566 ret = 0; 4567 if (path->slots[0] > 0) { 4568 path->slots[0]--; 4569 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 4570 if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY) 4571 ret = -ENOTEMPTY; 4572 } 4573 4574 return ret; 4575 } 4576 4577 /* Delete all dentries for inodes belonging to the root */ 4578 static void btrfs_prune_dentries(struct btrfs_root *root) 4579 { 4580 struct btrfs_fs_info *fs_info = root->fs_info; 4581 struct btrfs_inode *inode; 4582 u64 min_ino = 0; 4583 4584 if (!BTRFS_FS_ERROR(fs_info)) 4585 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4586 4587 inode = btrfs_find_first_inode(root, min_ino); 4588 while (inode) { 4589 if (icount_read(&inode->vfs_inode) > 1) 4590 d_prune_aliases(&inode->vfs_inode); 4591 4592 min_ino = btrfs_ino(inode) + 1; 4593 /* 4594 * btrfs_drop_inode() will have it removed from the inode 4595 * cache when its usage count hits zero. 4596 */ 4597 iput(&inode->vfs_inode); 4598 cond_resched(); 4599 inode = btrfs_find_first_inode(root, min_ino); 4600 } 4601 } 4602 4603 int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) 4604 { 4605 struct btrfs_root *root = dir->root; 4606 struct btrfs_fs_info *fs_info = root->fs_info; 4607 struct inode *inode = d_inode(dentry); 4608 struct btrfs_root *dest = BTRFS_I(inode)->root; 4609 struct btrfs_trans_handle *trans; 4610 struct btrfs_block_rsv block_rsv; 4611 u64 root_flags; 4612 u64 qgroup_reserved = 0; 4613 int ret; 4614 4615 down_write(&fs_info->subvol_sem); 4616 4617 /* 4618 * Don't allow to delete a subvolume with send in progress. This is 4619 * inside the inode lock so the error handling that has to drop the bit 4620 * again is not run concurrently. 4621 */ 4622 spin_lock(&dest->root_item_lock); 4623 if (dest->send_in_progress) { 4624 spin_unlock(&dest->root_item_lock); 4625 btrfs_warn(fs_info, 4626 "attempt to delete subvolume %llu during send", 4627 btrfs_root_id(dest)); 4628 ret = -EPERM; 4629 goto out_up_write; 4630 } 4631 if (atomic_read(&dest->nr_swapfiles)) { 4632 spin_unlock(&dest->root_item_lock); 4633 btrfs_warn(fs_info, 4634 "attempt to delete subvolume %llu with active swapfile", 4635 btrfs_root_id(root)); 4636 ret = -EPERM; 4637 goto out_up_write; 4638 } 4639 root_flags = btrfs_root_flags(&dest->root_item); 4640 btrfs_set_root_flags(&dest->root_item, 4641 root_flags | BTRFS_ROOT_SUBVOL_DEAD); 4642 spin_unlock(&dest->root_item_lock); 4643 4644 ret = may_destroy_subvol(dest); 4645 if (ret) 4646 goto out_undead; 4647 4648 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 4649 /* 4650 * One for dir inode, 4651 * two for dir entries, 4652 * two for root ref/backref. 
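 * That adds up to the 5 metadata units reserved just below.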
4653 */ 4654 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); 4655 if (ret) 4656 goto out_undead; 4657 qgroup_reserved = block_rsv.qgroup_rsv_reserved; 4658 4659 trans = btrfs_start_transaction(root, 0); 4660 if (IS_ERR(trans)) { 4661 ret = PTR_ERR(trans); 4662 goto out_release; 4663 } 4664 btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); 4665 qgroup_reserved = 0; 4666 trans->block_rsv = &block_rsv; 4667 trans->bytes_reserved = block_rsv.size; 4668 4669 btrfs_record_snapshot_destroy(trans, dir); 4670 4671 ret = btrfs_unlink_subvol(trans, dir, dentry); 4672 if (unlikely(ret)) { 4673 btrfs_abort_transaction(trans, ret); 4674 goto out_end_trans; 4675 } 4676 4677 ret = btrfs_record_root_in_trans(trans, dest); 4678 if (unlikely(ret)) { 4679 btrfs_abort_transaction(trans, ret); 4680 goto out_end_trans; 4681 } 4682 4683 memset(&dest->root_item.drop_progress, 0, 4684 sizeof(dest->root_item.drop_progress)); 4685 btrfs_set_root_drop_level(&dest->root_item, 0); 4686 btrfs_set_root_refs(&dest->root_item, 0); 4687 4688 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { 4689 ret = btrfs_insert_orphan_item(trans, 4690 fs_info->tree_root, 4691 btrfs_root_id(dest)); 4692 if (unlikely(ret)) { 4693 btrfs_abort_transaction(trans, ret); 4694 goto out_end_trans; 4695 } 4696 } 4697 4698 ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, 4699 BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest)); 4700 if (unlikely(ret && ret != -ENOENT)) { 4701 btrfs_abort_transaction(trans, ret); 4702 goto out_end_trans; 4703 } 4704 if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { 4705 ret = btrfs_uuid_tree_remove(trans, 4706 dest->root_item.received_uuid, 4707 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4708 btrfs_root_id(dest)); 4709 if (unlikely(ret && ret != -ENOENT)) { 4710 btrfs_abort_transaction(trans, ret); 4711 goto out_end_trans; 4712 } 4713 } 4714 4715 free_anon_bdev(dest->anon_dev); 4716 dest->anon_dev = 0; 4717 out_end_trans: 4718 trans->block_rsv = NULL; 4719 trans->bytes_reserved = 0; 4720 ret = btrfs_end_transaction(trans); 4721 inode->i_flags |= S_DEAD; 4722 out_release: 4723 btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); 4724 if (qgroup_reserved) 4725 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); 4726 out_undead: 4727 if (ret) { 4728 spin_lock(&dest->root_item_lock); 4729 root_flags = btrfs_root_flags(&dest->root_item); 4730 btrfs_set_root_flags(&dest->root_item, 4731 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); 4732 spin_unlock(&dest->root_item_lock); 4733 } 4734 out_up_write: 4735 up_write(&fs_info->subvol_sem); 4736 if (!ret) { 4737 d_invalidate(dentry); 4738 btrfs_prune_dentries(dest); 4739 ASSERT(dest->send_in_progress == 0); 4740 } 4741 4742 return ret; 4743 } 4744 4745 static int btrfs_rmdir(struct inode *vfs_dir, struct dentry *dentry) 4746 { 4747 struct btrfs_inode *dir = BTRFS_I(vfs_dir); 4748 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); 4749 struct btrfs_fs_info *fs_info = inode->root->fs_info; 4750 int ret = 0; 4751 struct btrfs_trans_handle *trans; 4752 struct fscrypt_name fname; 4753 4754 if (inode->vfs_inode.i_size > BTRFS_EMPTY_DIR_SIZE) 4755 return -ENOTEMPTY; 4756 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { 4757 if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { 4758 btrfs_err(fs_info, 4759 "extent tree v2 doesn't support snapshot deletion yet"); 4760 return -EOPNOTSUPP; 4761 } 4762 return btrfs_delete_subvolume(dir, dentry); 4763 } 4764 4765 ret = fscrypt_setup_filename(vfs_dir, &dentry->d_name, 1, 
&fname); 4766 if (ret) 4767 return ret; 4768 4769 /* This needs to handle no-key deletions later on */ 4770 4771 trans = __unlink_start_trans(dir); 4772 if (IS_ERR(trans)) { 4773 ret = PTR_ERR(trans); 4774 goto out_notrans; 4775 } 4776 4777 /* 4778 * Propagate the last_unlink_trans value of the deleted dir to its 4779 * parent directory. This is to prevent an unrecoverable log tree in the 4780 * case we do something like this: 4781 * 1) create dir foo 4782 * 2) create snapshot under dir foo 4783 * 3) delete the snapshot 4784 * 4) rmdir foo 4785 * 5) mkdir foo 4786 * 6) fsync foo or some file inside foo 4787 * 4788 * This is because we can't unlink other roots when replaying the dir 4789 * deletes for directory foo. 4790 */ 4791 if (inode->last_unlink_trans >= trans->transid) 4792 btrfs_record_snapshot_destroy(trans, dir); 4793 4794 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 4795 ret = btrfs_unlink_subvol(trans, dir, dentry); 4796 goto out; 4797 } 4798 4799 ret = btrfs_orphan_add(trans, inode); 4800 if (ret) 4801 goto out; 4802 4803 /* now the directory is empty */ 4804 ret = btrfs_unlink_inode(trans, dir, inode, &fname.disk_name); 4805 if (!ret) 4806 btrfs_i_size_write(inode, 0); 4807 out: 4808 btrfs_end_transaction(trans); 4809 out_notrans: 4810 btrfs_btree_balance_dirty(fs_info); 4811 fscrypt_free_filename(&fname); 4812 4813 return ret; 4814 } 4815 4816 static bool is_inside_block(u64 bytenr, u64 blockstart, u32 blocksize) 4817 { 4818 ASSERT(IS_ALIGNED(blockstart, blocksize), "blockstart=%llu blocksize=%u", 4819 blockstart, blocksize); 4820 4821 if (blockstart <= bytenr && bytenr <= blockstart + blocksize - 1) 4822 return true; 4823 return false; 4824 } 4825 4826 static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start) 4827 { 4828 const pgoff_t index = (start >> PAGE_SHIFT); 4829 struct address_space *mapping = inode->vfs_inode.i_mapping; 4830 struct folio *folio; 4831 u64 zero_start; 4832 u64 zero_end; 4833 int ret = 0; 4834 4835 again: 4836 folio = filemap_lock_folio(mapping, index); 4837 /* No folio present. */ 4838 if (IS_ERR(folio)) 4839 return 0; 4840 4841 if (!folio_test_uptodate(folio)) { 4842 ret = btrfs_read_folio(NULL, folio); 4843 folio_lock(folio); 4844 if (folio->mapping != mapping) { 4845 folio_unlock(folio); 4846 folio_put(folio); 4847 goto again; 4848 } 4849 if (unlikely(!folio_test_uptodate(folio))) { 4850 ret = -EIO; 4851 goto out_unlock; 4852 } 4853 } 4854 folio_wait_writeback(folio); 4855 4856 /* 4857 * We do not need to lock extents nor wait for OE, as it's already 4858 * beyond EOF. 4859 */ 4860 4861 zero_start = max_t(u64, folio_pos(folio), start); 4862 zero_end = folio_next_pos(folio); 4863 folio_zero_range(folio, zero_start - folio_pos(folio), 4864 zero_end - zero_start); 4865 4866 out_unlock: 4867 folio_unlock(folio); 4868 folio_put(folio); 4869 return ret; 4870 } 4871 4872 /* 4873 * Handle the truncation of a fs block. 4874 * 4875 * @inode - inode that we're zeroing 4876 * @offset - the file offset of the block to truncate 4877 * The value must be inside [@start, @end], and the function will do 4878 * extra checks if the block that covers @offset needs to be zeroed. 4879 * @start - the start file offset of the range we want to zero 4880 * @end - the end (inclusive) file offset of the range we want to zero. 4881 * 4882 * If the range is not block aligned, read out the folio that covers @offset, 4883 * and if needed zero blocks that are inside the folio and covered by [@start, @end). 
4884 * If @start or @end + 1 lands inside a block, that block will be marked dirty 4885 * for writeback. 4886 * 4887 * This is utilized by hole punch, zero range, file expansion. 4888 */ 4889 int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end) 4890 { 4891 struct btrfs_fs_info *fs_info = inode->root->fs_info; 4892 struct address_space *mapping = inode->vfs_inode.i_mapping; 4893 struct extent_io_tree *io_tree = &inode->io_tree; 4894 struct btrfs_ordered_extent *ordered; 4895 struct extent_state *cached_state = NULL; 4896 struct extent_changeset *data_reserved = NULL; 4897 bool only_release_metadata = false; 4898 u32 blocksize = fs_info->sectorsize; 4899 pgoff_t index = (offset >> PAGE_SHIFT); 4900 struct folio *folio; 4901 gfp_t mask = btrfs_alloc_write_mask(mapping); 4902 int ret = 0; 4903 const bool in_head_block = is_inside_block(offset, round_down(start, blocksize), 4904 blocksize); 4905 const bool in_tail_block = is_inside_block(offset, round_down(end, blocksize), 4906 blocksize); 4907 bool need_truncate_head = false; 4908 bool need_truncate_tail = false; 4909 u64 zero_start; 4910 u64 zero_end; 4911 u64 block_start; 4912 u64 block_end; 4913 4914 /* @offset should be inside the range. */ 4915 ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu", 4916 offset, start, end); 4917 4918 /* The range is aligned at both ends. */ 4919 if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) { 4920 /* 4921 * For block size < page size case, we may have polluted blocks 4922 * beyond EOF. So we also need to zero them out. 4923 */ 4924 if (end == (u64)-1 && blocksize < PAGE_SIZE) 4925 ret = truncate_block_zero_beyond_eof(inode, start); 4926 goto out; 4927 } 4928 4929 /* 4930 * @offset may not be inside the head nor tail block. In that case we 4931 * don't need to do anything. 4932 */ 4933 if (!in_head_block && !in_tail_block) 4934 goto out; 4935 4936 /* 4937 * Skip the truncation if the range in the target block is already aligned. 4938 * The seemingly complex check will also handle the same block case. 4939 */ 4940 if (in_head_block && !IS_ALIGNED(start, blocksize)) 4941 need_truncate_head = true; 4942 if (in_tail_block && !IS_ALIGNED(end + 1, blocksize)) 4943 need_truncate_tail = true; 4944 if (!need_truncate_head && !need_truncate_tail) 4945 goto out; 4946 4947 block_start = round_down(offset, blocksize); 4948 block_end = block_start + blocksize - 1; 4949 4950 ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, 4951 blocksize, false); 4952 if (ret < 0) { 4953 size_t write_bytes = blocksize; 4954 4955 if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) { 4956 /* For nocow case, no need to reserve data space. 
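 * The nocow lock taken here is dropped on the way out via btrfs_check_nocow_unlock().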
*/ 4957 ASSERT(write_bytes == blocksize, "write_bytes=%zu blocksize=%u", 4958 write_bytes, blocksize); 4959 only_release_metadata = true; 4960 } else { 4961 goto out; 4962 } 4963 } 4964 ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false); 4965 if (ret < 0) { 4966 if (!only_release_metadata) 4967 btrfs_free_reserved_data_space(inode, data_reserved, 4968 block_start, blocksize); 4969 goto out; 4970 } 4971 again: 4972 folio = __filemap_get_folio(mapping, index, 4973 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); 4974 if (IS_ERR(folio)) { 4975 if (only_release_metadata) 4976 btrfs_delalloc_release_metadata(inode, blocksize, true); 4977 else 4978 btrfs_delalloc_release_space(inode, data_reserved, 4979 block_start, blocksize, true); 4980 btrfs_delalloc_release_extents(inode, blocksize); 4981 ret = PTR_ERR(folio); 4982 goto out; 4983 } 4984 4985 if (!folio_test_uptodate(folio)) { 4986 ret = btrfs_read_folio(NULL, folio); 4987 folio_lock(folio); 4988 if (folio->mapping != mapping) { 4989 folio_unlock(folio); 4990 folio_put(folio); 4991 goto again; 4992 } 4993 if (unlikely(!folio_test_uptodate(folio))) { 4994 ret = -EIO; 4995 goto out_unlock; 4996 } 4997 } 4998 4999 /* 5000 * We unlock the page after the io is completed and then re-lock it 5001 * above. release_folio() could have come in between that and cleared 5002 * folio private, but left the page in the mapping. Set the page mapped 5003 * here to make sure it's properly set for the subpage stuff. 5004 */ 5005 ret = set_folio_extent_mapped(folio); 5006 if (ret < 0) 5007 goto out_unlock; 5008 5009 folio_wait_writeback(folio); 5010 5011 btrfs_lock_extent(io_tree, block_start, block_end, &cached_state); 5012 5013 ordered = btrfs_lookup_ordered_extent(inode, block_start); 5014 if (ordered) { 5015 btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); 5016 folio_unlock(folio); 5017 folio_put(folio); 5018 btrfs_start_ordered_extent(ordered); 5019 btrfs_put_ordered_extent(ordered); 5020 goto again; 5021 } 5022 5023 btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end, 5024 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 5025 &cached_state); 5026 5027 ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, 5028 &cached_state); 5029 if (ret) { 5030 btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); 5031 goto out_unlock; 5032 } 5033 5034 if (end == (u64)-1) { 5035 /* 5036 * We're truncating beyond EOF, the remaining blocks normally are 5037 * already holes thus no need to zero again, but it's possible for 5038 * fs block size < page size cases to have memory mapped writes 5039 * to pollute ranges beyond EOF. 5040 * 5041 * In that case although such polluted blocks beyond EOF will 5042 * not reach disk, it still affects our page caches. 
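 * So zero everything from @start to the end of the folio here, ensuring reads through the page cache never see that stale data.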
5043 */ 5044 zero_start = max_t(u64, folio_pos(folio), start); 5045 zero_end = min_t(u64, folio_next_pos(folio) - 1, end); 5046 } else { 5047 zero_start = max_t(u64, block_start, start); 5048 zero_end = min_t(u64, block_end, end); 5049 } 5050 folio_zero_range(folio, zero_start - folio_pos(folio), 5051 zero_end - zero_start + 1); 5052 5053 btrfs_folio_clear_checked(fs_info, folio, block_start, 5054 block_end + 1 - block_start); 5055 btrfs_folio_set_dirty(fs_info, folio, block_start, 5056 block_end + 1 - block_start); 5057 5058 if (only_release_metadata) 5059 btrfs_set_extent_bit(&inode->io_tree, block_start, block_end, 5060 EXTENT_NORESERVE, &cached_state); 5061 5062 btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); 5063 5064 out_unlock: 5065 if (ret) { 5066 if (only_release_metadata) 5067 btrfs_delalloc_release_metadata(inode, blocksize, true); 5068 else 5069 btrfs_delalloc_release_space(inode, data_reserved, 5070 block_start, blocksize, true); 5071 } 5072 btrfs_delalloc_release_extents(inode, blocksize); 5073 folio_unlock(folio); 5074 folio_put(folio); 5075 out: 5076 if (only_release_metadata) 5077 btrfs_check_nocow_unlock(inode); 5078 extent_changeset_free(data_reserved); 5079 return ret; 5080 } 5081 5082 static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len) 5083 { 5084 struct btrfs_root *root = inode->root; 5085 struct btrfs_fs_info *fs_info = root->fs_info; 5086 struct btrfs_trans_handle *trans; 5087 struct btrfs_drop_extents_args drop_args = { 0 }; 5088 int ret; 5089 5090 /* 5091 * If NO_HOLES is enabled, we don't need to do anything. 5092 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans() 5093 * or btrfs_update_inode() will be called, which guarantee that the next 5094 * fsync will know this inode was changed and needs to be logged. 5095 */ 5096 if (btrfs_fs_incompat(fs_info, NO_HOLES)) 5097 return 0; 5098 5099 /* 5100 * 1 - for the one we're dropping 5101 * 1 - for the one we're adding 5102 * 1 - for updating the inode. 5103 */ 5104 trans = btrfs_start_transaction(root, 3); 5105 if (IS_ERR(trans)) 5106 return PTR_ERR(trans); 5107 5108 drop_args.start = offset; 5109 drop_args.end = offset + len; 5110 drop_args.drop_cache = true; 5111 5112 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 5113 if (unlikely(ret)) { 5114 btrfs_abort_transaction(trans, ret); 5115 btrfs_end_transaction(trans); 5116 return ret; 5117 } 5118 5119 ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len); 5120 if (ret) { 5121 btrfs_abort_transaction(trans, ret); 5122 } else { 5123 btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); 5124 btrfs_update_inode(trans, inode); 5125 } 5126 btrfs_end_transaction(trans); 5127 return ret; 5128 } 5129 5130 /* 5131 * This function puts in dummy file extents for the area we're creating a hole 5132 * for. 
So if we are truncating this file to a larger size we need to insert 5133 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for 5134 * the range between oldsize and size 5135 */ 5136 int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) 5137 { 5138 struct btrfs_root *root = inode->root; 5139 struct btrfs_fs_info *fs_info = root->fs_info; 5140 struct extent_io_tree *io_tree = &inode->io_tree; 5141 struct extent_map *em = NULL; 5142 struct extent_state *cached_state = NULL; 5143 u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); 5144 u64 block_end = ALIGN(size, fs_info->sectorsize); 5145 u64 last_byte; 5146 u64 cur_offset; 5147 u64 hole_size; 5148 int ret = 0; 5149 5150 /* 5151 * If our size started in the middle of a block we need to zero out the 5152 * rest of the block before we expand the i_size, otherwise we could 5153 * expose stale data. 5154 */ 5155 ret = btrfs_truncate_block(inode, oldsize, oldsize, -1); 5156 if (ret) 5157 return ret; 5158 5159 if (size <= hole_start) 5160 return 0; 5161 5162 btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1, 5163 &cached_state); 5164 cur_offset = hole_start; 5165 while (1) { 5166 em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset); 5167 if (IS_ERR(em)) { 5168 ret = PTR_ERR(em); 5169 em = NULL; 5170 break; 5171 } 5172 last_byte = min(btrfs_extent_map_end(em), block_end); 5173 last_byte = ALIGN(last_byte, fs_info->sectorsize); 5174 hole_size = last_byte - cur_offset; 5175 5176 if (!(em->flags & EXTENT_FLAG_PREALLOC)) { 5177 struct extent_map *hole_em; 5178 5179 ret = maybe_insert_hole(inode, cur_offset, hole_size); 5180 if (ret) 5181 break; 5182 5183 ret = btrfs_inode_set_file_extent_range(inode, 5184 cur_offset, hole_size); 5185 if (ret) 5186 break; 5187 5188 hole_em = btrfs_alloc_extent_map(); 5189 if (!hole_em) { 5190 btrfs_drop_extent_map_range(inode, cur_offset, 5191 cur_offset + hole_size - 1, 5192 false); 5193 btrfs_set_inode_full_sync(inode); 5194 goto next; 5195 } 5196 hole_em->start = cur_offset; 5197 hole_em->len = hole_size; 5198 5199 hole_em->disk_bytenr = EXTENT_MAP_HOLE; 5200 hole_em->disk_num_bytes = 0; 5201 hole_em->ram_bytes = hole_size; 5202 hole_em->generation = btrfs_get_fs_generation(fs_info); 5203 5204 ret = btrfs_replace_extent_map_range(inode, hole_em, true); 5205 btrfs_free_extent_map(hole_em); 5206 } else { 5207 ret = btrfs_inode_set_file_extent_range(inode, 5208 cur_offset, hole_size); 5209 if (ret) 5210 break; 5211 } 5212 next: 5213 btrfs_free_extent_map(em); 5214 em = NULL; 5215 cur_offset = last_byte; 5216 if (cur_offset >= block_end) 5217 break; 5218 } 5219 btrfs_free_extent_map(em); 5220 btrfs_unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); 5221 return ret; 5222 } 5223 5224 static int btrfs_setsize(struct inode *inode, struct iattr *attr) 5225 { 5226 struct btrfs_root *root = BTRFS_I(inode)->root; 5227 struct btrfs_trans_handle *trans; 5228 loff_t oldsize = i_size_read(inode); 5229 loff_t newsize = attr->ia_size; 5230 int mask = attr->ia_valid; 5231 int ret; 5232 5233 /* 5234 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a 5235 * special case where we need to update the times despite not having 5236 * these flags set. For all other operations the VFS set these flags 5237 * explicitly if it wants a timestamp update. 
5238 */ 5239 if (newsize != oldsize) { 5240 inode_inc_iversion(inode); 5241 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { 5242 inode_set_mtime_to_ts(inode, 5243 inode_set_ctime_current(inode)); 5244 } 5245 } 5246 5247 if (newsize > oldsize) { 5248 /* 5249 * Don't do an expanding truncate while snapshotting is ongoing. 5250 * This is to ensure the snapshot captures a fully consistent 5251 * state of this file - if the snapshot captures this expanding 5252 * truncation, it must capture all writes that happened before 5253 * this truncation. 5254 */ 5255 btrfs_drew_write_lock(&root->snapshot_lock); 5256 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize); 5257 if (ret) { 5258 btrfs_drew_write_unlock(&root->snapshot_lock); 5259 return ret; 5260 } 5261 5262 trans = btrfs_start_transaction(root, 1); 5263 if (IS_ERR(trans)) { 5264 btrfs_drew_write_unlock(&root->snapshot_lock); 5265 return PTR_ERR(trans); 5266 } 5267 5268 i_size_write(inode, newsize); 5269 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 5270 pagecache_isize_extended(inode, oldsize, newsize); 5271 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 5272 btrfs_drew_write_unlock(&root->snapshot_lock); 5273 btrfs_end_transaction(trans); 5274 } else { 5275 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 5276 5277 if (btrfs_is_zoned(fs_info)) { 5278 ret = btrfs_wait_ordered_range(BTRFS_I(inode), 5279 ALIGN(newsize, fs_info->sectorsize), 5280 (u64)-1); 5281 if (ret) 5282 return ret; 5283 } 5284 5285 /* 5286 * We're truncating a file that used to have good data down to 5287 * zero. Make sure any new writes to the file get on disk 5288 * on close. 5289 */ 5290 if (newsize == 0) 5291 set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, 5292 &BTRFS_I(inode)->runtime_flags); 5293 5294 truncate_setsize(inode, newsize); 5295 5296 inode_dio_wait(inode); 5297 5298 ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); 5299 if (ret && inode->i_nlink) { 5300 int ret2; 5301 5302 /* 5303 * Truncate failed, so fix up the in-memory size. We 5304 * adjusted disk_i_size down as we removed extents, so 5305 * wait for disk_i_size to be stable and then update the 5306 * in-memory size to match. 5307 */ 5308 ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); 5309 if (ret2) 5310 return ret2; 5311 i_size_write(inode, BTRFS_I(inode)->disk_i_size); 5312 } 5313 } 5314 5315 return ret; 5316 } 5317 5318 static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 5319 struct iattr *attr) 5320 { 5321 struct inode *inode = d_inode(dentry); 5322 struct btrfs_root *root = BTRFS_I(inode)->root; 5323 int ret; 5324 5325 if (btrfs_root_readonly(root)) 5326 return -EROFS; 5327 5328 ret = setattr_prepare(idmap, dentry, attr); 5329 if (ret) 5330 return ret; 5331 5332 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 5333 ret = btrfs_setsize(inode, attr); 5334 if (ret) 5335 return ret; 5336 } 5337 5338 if (attr->ia_valid) { 5339 setattr_copy(idmap, inode, attr); 5340 inode_inc_iversion(inode); 5341 ret = btrfs_dirty_inode(BTRFS_I(inode)); 5342 5343 if (!ret && attr->ia_valid & ATTR_MODE) 5344 ret = posix_acl_chmod(idmap, dentry, inode->i_mode); 5345 } 5346 5347 return ret; 5348 } 5349 5350 /* 5351 * While truncating the inode pages during eviction, we get the VFS 5352 * calling btrfs_invalidate_folio() against each folio of the inode. 
This 5353 * is slow because the calls to btrfs_invalidate_folio() result in a 5354 * huge amount of calls to lock_extent() and clear_extent_bit(), 5355 * which keep merging and splitting extent_state structures over and over, 5356 * wasting lots of time. 5357 * 5358 * Therefore if the inode is being evicted, let btrfs_invalidate_folio() 5359 * skip all those expensive operations on a per folio basis and do only 5360 * the ordered io finishing, while we release here the extent_map and 5361 * extent_state structures, without the excessive merging and splitting. 5362 */ 5363 static void evict_inode_truncate_pages(struct inode *inode) 5364 { 5365 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5366 struct rb_node *node; 5367 5368 ASSERT(inode_state_read_once(inode) & I_FREEING); 5369 truncate_inode_pages_final(&inode->i_data); 5370 5371 btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); 5372 5373 /* 5374 * Keep looping until we have no more ranges in the io tree. 5375 * We can have ongoing bios started by readahead that have 5376 * their endio callback (extent_io.c:end_bio_extent_readpage) 5377 * still in progress (unlocked the pages in the bio but did not yet 5378 * unlocked the ranges in the io tree). Therefore this means some 5379 * ranges can still be locked and eviction started because before 5380 * submitting those bios, which are executed by a separate task (work 5381 * queue kthread), inode references (inode->i_count) were not taken 5382 * (which would be dropped in the end io callback of each bio). 5383 * Therefore here we effectively end up waiting for those bios and 5384 * anyone else holding locked ranges without having bumped the inode's 5385 * reference count - if we don't do it, when they access the inode's 5386 * io_tree to unlock a range it may be too late, leading to an 5387 * use-after-free issue. 5388 */ 5389 spin_lock(&io_tree->lock); 5390 while (!RB_EMPTY_ROOT(&io_tree->state)) { 5391 struct extent_state *state; 5392 struct extent_state *cached_state = NULL; 5393 u64 start; 5394 u64 end; 5395 unsigned state_flags; 5396 5397 node = rb_first(&io_tree->state); 5398 state = rb_entry(node, struct extent_state, rb_node); 5399 start = state->start; 5400 end = state->end; 5401 state_flags = state->state; 5402 spin_unlock(&io_tree->lock); 5403 5404 btrfs_lock_extent(io_tree, start, end, &cached_state); 5405 5406 /* 5407 * If still has DELALLOC flag, the extent didn't reach disk, 5408 * and its reserved space won't be freed by delayed_ref. 5409 * So we need to free its reserved space here. 5410 * (Refer to comment in btrfs_invalidate_folio, case 2) 5411 * 5412 * Note, end is the bytenr of last byte, so we need + 1 here. 5413 */ 5414 if (state_flags & EXTENT_DELALLOC) 5415 btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, 5416 end - start + 1, NULL); 5417 5418 btrfs_clear_extent_bit(io_tree, start, end, 5419 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, 5420 &cached_state); 5421 5422 cond_resched(); 5423 spin_lock(&io_tree->lock); 5424 } 5425 spin_unlock(&io_tree->lock); 5426 } 5427 5428 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, 5429 struct btrfs_block_rsv *rsv) 5430 { 5431 struct btrfs_fs_info *fs_info = root->fs_info; 5432 struct btrfs_trans_handle *trans; 5433 u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1); 5434 int ret; 5435 5436 /* 5437 * Eviction should be taking place at some place safe because of our 5438 * delayed iputs. 
However the normal flushing code will run delayed 5439 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock. 5440 * 5441 * We reserve the delayed_refs_extra here again because we can't use 5442 * btrfs_start_transaction(root, 0) for the same deadlocky reason as 5443 * above. We reserve our extra bit here because we generate a ton of 5444 * delayed refs activity by truncating. 5445 * 5446 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can, 5447 * if we fail to make this reservation we can re-try without the 5448 * delayed_refs_extra so we can make some forward progress. 5449 */ 5450 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra, 5451 BTRFS_RESERVE_FLUSH_EVICT); 5452 if (ret) { 5453 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size, 5454 BTRFS_RESERVE_FLUSH_EVICT); 5455 if (ret) { 5456 btrfs_warn(fs_info, 5457 "could not allocate space for delete; will truncate on mount"); 5458 return ERR_PTR(-ENOSPC); 5459 } 5460 delayed_refs_extra = 0; 5461 } 5462 5463 trans = btrfs_join_transaction(root); 5464 if (IS_ERR(trans)) 5465 return trans; 5466 5467 if (delayed_refs_extra) { 5468 trans->block_rsv = &fs_info->trans_block_rsv; 5469 trans->bytes_reserved = delayed_refs_extra; 5470 btrfs_block_rsv_migrate(rsv, trans->block_rsv, 5471 delayed_refs_extra, true); 5472 } 5473 return trans; 5474 } 5475 5476 void btrfs_evict_inode(struct inode *inode) 5477 { 5478 struct btrfs_fs_info *fs_info; 5479 struct btrfs_trans_handle *trans; 5480 struct btrfs_root *root = BTRFS_I(inode)->root; 5481 struct btrfs_block_rsv rsv; 5482 int ret; 5483 5484 trace_btrfs_inode_evict(inode); 5485 5486 if (!root) { 5487 fsverity_cleanup_inode(inode); 5488 clear_inode(inode); 5489 return; 5490 } 5491 5492 fs_info = inode_to_fs_info(inode); 5493 evict_inode_truncate_pages(inode); 5494 5495 if (inode->i_nlink && 5496 ((btrfs_root_refs(&root->root_item) != 0 && 5497 btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) || 5498 btrfs_is_free_space_inode(BTRFS_I(inode)))) 5499 goto out; 5500 5501 if (is_bad_inode(inode)) 5502 goto out; 5503 5504 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) 5505 goto out; 5506 5507 if (inode->i_nlink > 0) { 5508 BUG_ON(btrfs_root_refs(&root->root_item) != 0 && 5509 btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID); 5510 goto out; 5511 } 5512 5513 /* 5514 * This makes sure the inode item in tree is uptodate and the space for 5515 * the inode update is released. 5516 */ 5517 ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); 5518 if (ret) 5519 goto out; 5520 5521 /* 5522 * This drops any pending insert or delete operations we have for this 5523 * inode. We could have a delayed dir index deletion queued up, but 5524 * we're removing the inode completely so that'll be taken care of in 5525 * the truncate. 
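 *
 * From here the eviction path is roughly: kill the delayed items, set
 * i_size to zero, then loop refilling a temporary block reserve and
 * calling btrfs_truncate_inode_items() until everything is dropped, and
 * finally try to delete the orphan item. (Summary of the code below,
 * not additional behaviour.)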
5526 */ 5527 btrfs_kill_delayed_inode_items(BTRFS_I(inode)); 5528 5529 btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); 5530 rsv.size = btrfs_calc_metadata_size(fs_info, 1); 5531 rsv.failfast = true; 5532 5533 btrfs_i_size_write(BTRFS_I(inode), 0); 5534 5535 while (1) { 5536 struct btrfs_truncate_control control = { 5537 .inode = BTRFS_I(inode), 5538 .ino = btrfs_ino(BTRFS_I(inode)), 5539 .new_size = 0, 5540 .min_type = 0, 5541 }; 5542 5543 trans = evict_refill_and_join(root, &rsv); 5544 if (IS_ERR(trans)) 5545 goto out_release; 5546 5547 trans->block_rsv = &rsv; 5548 5549 ret = btrfs_truncate_inode_items(trans, root, &control); 5550 trans->block_rsv = &fs_info->trans_block_rsv; 5551 btrfs_end_transaction(trans); 5552 /* 5553 * We have not added new delayed items for our inode after we 5554 * have flushed its delayed items, so no need to throttle on 5555 * delayed items. However we have modified extent buffers. 5556 */ 5557 btrfs_btree_balance_dirty_nodelay(fs_info); 5558 if (ret && ret != -ENOSPC && ret != -EAGAIN) 5559 goto out_release; 5560 else if (!ret) 5561 break; 5562 } 5563 5564 /* 5565 * Errors here aren't a big deal, it just means we leave orphan items in 5566 * the tree. They will be cleaned up on the next mount. If the inode 5567 * number gets reused, cleanup deletes the orphan item without doing 5568 * anything, and unlink reuses the existing orphan item. 5569 * 5570 * If it turns out that we are dropping too many of these, we might want 5571 * to add a mechanism for retrying these after a commit. 5572 */ 5573 trans = evict_refill_and_join(root, &rsv); 5574 if (!IS_ERR(trans)) { 5575 trans->block_rsv = &rsv; 5576 btrfs_orphan_del(trans, BTRFS_I(inode)); 5577 trans->block_rsv = &fs_info->trans_block_rsv; 5578 btrfs_end_transaction(trans); 5579 } 5580 5581 out_release: 5582 btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); 5583 out: 5584 /* 5585 * If we didn't successfully delete, the orphan item will still be in 5586 * the tree and we'll retry on the next mount. Again, we might also want 5587 * to retry these periodically in the future. 5588 */ 5589 btrfs_remove_delayed_node(BTRFS_I(inode)); 5590 fsverity_cleanup_inode(inode); 5591 clear_inode(inode); 5592 } 5593 5594 /* 5595 * Return the key found in the dir entry in the location pointer, fill @type 5596 * with BTRFS_FT_*, and return 0. 5597 * 5598 * If no dir entries were found, returns -ENOENT. 5599 * If found a corrupted location in dir entry, returns -EUCLEAN. 5600 */ 5601 static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, 5602 struct btrfs_key *location, u8 *type) 5603 { 5604 struct btrfs_dir_item *di; 5605 BTRFS_PATH_AUTO_FREE(path); 5606 struct btrfs_root *root = dir->root; 5607 int ret = 0; 5608 struct fscrypt_name fname; 5609 5610 path = btrfs_alloc_path(); 5611 if (!path) 5612 return -ENOMEM; 5613 5614 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); 5615 if (ret < 0) 5616 return ret; 5617 /* 5618 * fscrypt_setup_filename() should never return a positive value, but 5619 * gcc on sparc/parisc thinks it can, so assert that doesn't happen. 5620 */ 5621 ASSERT(ret == 0); 5622 5623 /* This needs to handle no-key deletions later on */ 5624 5625 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), 5626 &fname.disk_name, 0); 5627 if (IS_ERR_OR_NULL(di)) { 5628 ret = di ? 
PTR_ERR(di) : -ENOENT; 5629 goto out; 5630 } 5631 5632 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 5633 if (unlikely(location->type != BTRFS_INODE_ITEM_KEY && 5634 location->type != BTRFS_ROOT_ITEM_KEY)) { 5635 ret = -EUCLEAN; 5636 btrfs_warn(root->fs_info, 5637 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", 5638 __func__, fname.disk_name.name, btrfs_ino(dir), 5639 location->objectid, location->type, location->offset); 5640 } 5641 if (!ret) 5642 *type = btrfs_dir_ftype(path->nodes[0], di); 5643 out: 5644 fscrypt_free_filename(&fname); 5645 return ret; 5646 } 5647 5648 /* 5649 * when we hit a tree root in a directory, the btrfs part of the inode 5650 * needs to be changed to reflect the root directory of the tree root. This 5651 * is kind of like crossing a mount point. 5652 */ 5653 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, 5654 struct btrfs_inode *dir, 5655 struct dentry *dentry, 5656 struct btrfs_key *location, 5657 struct btrfs_root **sub_root) 5658 { 5659 BTRFS_PATH_AUTO_FREE(path); 5660 struct btrfs_root *new_root; 5661 struct btrfs_root_ref *ref; 5662 struct extent_buffer *leaf; 5663 struct btrfs_key key; 5664 int ret; 5665 int err = 0; 5666 struct fscrypt_name fname; 5667 5668 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname); 5669 if (ret) 5670 return ret; 5671 5672 path = btrfs_alloc_path(); 5673 if (!path) { 5674 err = -ENOMEM; 5675 goto out; 5676 } 5677 5678 err = -ENOENT; 5679 key.objectid = btrfs_root_id(dir->root); 5680 key.type = BTRFS_ROOT_REF_KEY; 5681 key.offset = location->objectid; 5682 5683 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 5684 if (ret) { 5685 if (ret < 0) 5686 err = ret; 5687 goto out; 5688 } 5689 5690 leaf = path->nodes[0]; 5691 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 5692 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || 5693 btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len) 5694 goto out; 5695 5696 ret = memcmp_extent_buffer(leaf, fname.disk_name.name, 5697 (unsigned long)(ref + 1), fname.disk_name.len); 5698 if (ret) 5699 goto out; 5700 5701 btrfs_release_path(path); 5702 5703 new_root = btrfs_get_fs_root(fs_info, location->objectid, true); 5704 if (IS_ERR(new_root)) { 5705 err = PTR_ERR(new_root); 5706 goto out; 5707 } 5708 5709 *sub_root = new_root; 5710 location->objectid = btrfs_root_dirid(&new_root->root_item); 5711 location->type = BTRFS_INODE_ITEM_KEY; 5712 location->offset = 0; 5713 err = 0; 5714 out: 5715 fscrypt_free_filename(&fname); 5716 return err; 5717 } 5718 5719 5720 5721 static void btrfs_del_inode_from_root(struct btrfs_inode *inode) 5722 { 5723 struct btrfs_root *root = inode->root; 5724 struct btrfs_inode *entry; 5725 bool empty = false; 5726 5727 xa_lock(&root->inodes); 5728 /* 5729 * This btrfs_inode is being freed and has already been unhashed at this 5730 * point. It's possible that another btrfs_inode has already been 5731 * allocated for the same inode and inserted itself into the root, so 5732 * don't delete it in that case. 5733 * 5734 * Note that this shouldn't need to allocate memory, so the gfp flags 5735 * don't really matter. 
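 *
 * The compare-and-exchange below only removes the entry if it still
 * points at this exact btrfs_inode; if a newer inode for the same
 * number has already replaced it, the exchange fails and that entry is
 * left alone.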
5736 */ 5737 entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL, 5738 GFP_ATOMIC); 5739 if (entry == inode) 5740 empty = xa_empty(&root->inodes); 5741 xa_unlock(&root->inodes); 5742 5743 if (empty && btrfs_root_refs(&root->root_item) == 0) { 5744 xa_lock(&root->inodes); 5745 empty = xa_empty(&root->inodes); 5746 xa_unlock(&root->inodes); 5747 if (empty) 5748 btrfs_add_dead_root(root); 5749 } 5750 } 5751 5752 5753 static int btrfs_init_locked_inode(struct inode *inode, void *p) 5754 { 5755 struct btrfs_iget_args *args = p; 5756 5757 btrfs_set_inode_number(BTRFS_I(inode), args->ino); 5758 BTRFS_I(inode)->root = btrfs_grab_root(args->root); 5759 5760 if (args->root && args->root == args->root->fs_info->tree_root && 5761 args->ino != BTRFS_BTREE_INODE_OBJECTID) 5762 set_bit(BTRFS_INODE_FREE_SPACE_INODE, 5763 &BTRFS_I(inode)->runtime_flags); 5764 return 0; 5765 } 5766 5767 static int btrfs_find_actor(struct inode *inode, void *opaque) 5768 { 5769 struct btrfs_iget_args *args = opaque; 5770 5771 return args->ino == btrfs_ino(BTRFS_I(inode)) && 5772 args->root == BTRFS_I(inode)->root; 5773 } 5774 5775 static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) 5776 { 5777 struct inode *inode; 5778 struct btrfs_iget_args args; 5779 unsigned long hashval = btrfs_inode_hash(ino, root); 5780 5781 args.ino = ino; 5782 args.root = root; 5783 5784 inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, 5785 btrfs_init_locked_inode, 5786 (void *)&args); 5787 if (!inode) 5788 return NULL; 5789 return BTRFS_I(inode); 5790 } 5791 5792 /* 5793 * Get an inode object given its inode number and corresponding root. Path is 5794 * preallocated to prevent recursing back to iget through allocator. 5795 */ 5796 struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, 5797 struct btrfs_path *path) 5798 { 5799 struct btrfs_inode *inode; 5800 int ret; 5801 5802 inode = btrfs_iget_locked(ino, root); 5803 if (!inode) 5804 return ERR_PTR(-ENOMEM); 5805 5806 if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW)) 5807 return inode; 5808 5809 ret = btrfs_read_locked_inode(inode, path); 5810 if (ret) 5811 return ERR_PTR(ret); 5812 5813 unlock_new_inode(&inode->vfs_inode); 5814 return inode; 5815 } 5816 5817 /* 5818 * Get an inode object given its inode number and corresponding root. 
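 *
 * Unlike btrfs_iget_path(), this variant allocates a temporary path
 * internally. Typical usage (illustrative sketch only):
 *
 *	struct btrfs_inode *inode = btrfs_iget(ino, root);
 *
 *	if (IS_ERR(inode))
 *		return PTR_ERR(inode);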
5819 */ 5820 struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) 5821 { 5822 struct btrfs_inode *inode; 5823 struct btrfs_path *path; 5824 int ret; 5825 5826 inode = btrfs_iget_locked(ino, root); 5827 if (!inode) 5828 return ERR_PTR(-ENOMEM); 5829 5830 if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW)) 5831 return inode; 5832 5833 path = btrfs_alloc_path(); 5834 if (!path) { 5835 iget_failed(&inode->vfs_inode); 5836 return ERR_PTR(-ENOMEM); 5837 } 5838 5839 ret = btrfs_read_locked_inode(inode, path); 5840 btrfs_free_path(path); 5841 if (ret) 5842 return ERR_PTR(ret); 5843 5844 if (S_ISDIR(inode->vfs_inode.i_mode)) 5845 inode->vfs_inode.i_opflags |= IOP_FASTPERM_MAY_EXEC; 5846 unlock_new_inode(&inode->vfs_inode); 5847 return inode; 5848 } 5849 5850 static struct btrfs_inode *new_simple_dir(struct inode *dir, 5851 struct btrfs_key *key, 5852 struct btrfs_root *root) 5853 { 5854 struct timespec64 ts; 5855 struct inode *vfs_inode; 5856 struct btrfs_inode *inode; 5857 5858 vfs_inode = new_inode(dir->i_sb); 5859 if (!vfs_inode) 5860 return ERR_PTR(-ENOMEM); 5861 5862 inode = BTRFS_I(vfs_inode); 5863 inode->root = btrfs_grab_root(root); 5864 inode->ref_root_id = key->objectid; 5865 set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags); 5866 set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags); 5867 5868 btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); 5869 /* 5870 * We only need lookup, the rest is read-only and there's no inode 5871 * associated with the dentry 5872 */ 5873 vfs_inode->i_op = &simple_dir_inode_operations; 5874 vfs_inode->i_opflags &= ~IOP_XATTR; 5875 vfs_inode->i_fop = &simple_dir_operations; 5876 vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 5877 5878 ts = inode_set_ctime_current(vfs_inode); 5879 inode_set_mtime_to_ts(vfs_inode, ts); 5880 inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir)); 5881 inode->i_otime_sec = ts.tv_sec; 5882 inode->i_otime_nsec = ts.tv_nsec; 5883 5884 vfs_inode->i_uid = dir->i_uid; 5885 vfs_inode->i_gid = dir->i_gid; 5886 5887 return inode; 5888 } 5889 5890 static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN); 5891 static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE); 5892 static_assert(BTRFS_FT_DIR == FT_DIR); 5893 static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV); 5894 static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV); 5895 static_assert(BTRFS_FT_FIFO == FT_FIFO); 5896 static_assert(BTRFS_FT_SOCK == FT_SOCK); 5897 static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); 5898 5899 static inline u8 btrfs_inode_type(const struct btrfs_inode *inode) 5900 { 5901 return fs_umode_to_ftype(inode->vfs_inode.i_mode); 5902 } 5903 5904 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 5905 { 5906 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); 5907 struct btrfs_inode *inode; 5908 struct btrfs_root *root = BTRFS_I(dir)->root; 5909 struct btrfs_root *sub_root = root; 5910 struct btrfs_key location = { 0 }; 5911 u8 di_type = 0; 5912 int ret = 0; 5913 5914 if (dentry->d_name.len > BTRFS_NAME_LEN) 5915 return ERR_PTR(-ENAMETOOLONG); 5916 5917 ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type); 5918 if (ret < 0) 5919 return ERR_PTR(ret); 5920 5921 if (location.type == BTRFS_INODE_ITEM_KEY) { 5922 inode = btrfs_iget(location.objectid, root); 5923 if (IS_ERR(inode)) 5924 return ERR_CAST(inode); 5925 5926 /* Do extra check against inode mode with di_type */ 5927 if (unlikely(btrfs_inode_type(inode) != di_type)) { 5928 btrfs_crit(fs_info, 5929 "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir 
type=%u", 5930 inode->vfs_inode.i_mode, btrfs_inode_type(inode), 5931 di_type); 5932 iput(&inode->vfs_inode); 5933 return ERR_PTR(-EUCLEAN); 5934 } 5935 return &inode->vfs_inode; 5936 } 5937 5938 ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, 5939 &location, &sub_root); 5940 if (ret < 0) { 5941 if (ret != -ENOENT) 5942 inode = ERR_PTR(ret); 5943 else 5944 inode = new_simple_dir(dir, &location, root); 5945 } else { 5946 inode = btrfs_iget(location.objectid, sub_root); 5947 btrfs_put_root(sub_root); 5948 5949 if (IS_ERR(inode)) 5950 return ERR_CAST(inode); 5951 5952 down_read(&fs_info->cleanup_work_sem); 5953 if (!sb_rdonly(inode->vfs_inode.i_sb)) 5954 ret = btrfs_orphan_cleanup(sub_root); 5955 up_read(&fs_info->cleanup_work_sem); 5956 if (ret) { 5957 iput(&inode->vfs_inode); 5958 inode = ERR_PTR(ret); 5959 } 5960 } 5961 5962 if (IS_ERR(inode)) 5963 return ERR_CAST(inode); 5964 5965 return &inode->vfs_inode; 5966 } 5967 5968 static int btrfs_dentry_delete(const struct dentry *dentry) 5969 { 5970 struct btrfs_root *root; 5971 struct inode *inode = d_inode(dentry); 5972 5973 if (!inode && !IS_ROOT(dentry)) 5974 inode = d_inode(dentry->d_parent); 5975 5976 if (inode) { 5977 root = BTRFS_I(inode)->root; 5978 if (btrfs_root_refs(&root->root_item) == 0) 5979 return 1; 5980 5981 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 5982 return 1; 5983 } 5984 return 0; 5985 } 5986 5987 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 5988 unsigned int flags) 5989 { 5990 struct inode *inode = btrfs_lookup_dentry(dir, dentry); 5991 5992 if (inode == ERR_PTR(-ENOENT)) 5993 inode = NULL; 5994 return d_splice_alias(inode, dentry); 5995 } 5996 5997 /* 5998 * Find the highest existing sequence number in a directory and then set the 5999 * in-memory index_cnt variable to the first free sequence number. 6000 */ 6001 static int btrfs_set_inode_index_count(struct btrfs_inode *inode) 6002 { 6003 struct btrfs_root *root = inode->root; 6004 struct btrfs_key key, found_key; 6005 BTRFS_PATH_AUTO_FREE(path); 6006 struct extent_buffer *leaf; 6007 int ret; 6008 6009 key.objectid = btrfs_ino(inode); 6010 key.type = BTRFS_DIR_INDEX_KEY; 6011 key.offset = (u64)-1; 6012 6013 path = btrfs_alloc_path(); 6014 if (!path) 6015 return -ENOMEM; 6016 6017 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6018 if (ret < 0) 6019 return ret; 6020 /* FIXME: we should be able to handle this */ 6021 if (ret == 0) 6022 return ret; 6023 6024 if (path->slots[0] == 0) { 6025 inode->index_cnt = BTRFS_DIR_START_INDEX; 6026 return 0; 6027 } 6028 6029 path->slots[0]--; 6030 6031 leaf = path->nodes[0]; 6032 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6033 6034 if (found_key.objectid != btrfs_ino(inode) || 6035 found_key.type != BTRFS_DIR_INDEX_KEY) { 6036 inode->index_cnt = BTRFS_DIR_START_INDEX; 6037 return 0; 6038 } 6039 6040 inode->index_cnt = found_key.offset + 1; 6041 6042 return 0; 6043 } 6044 6045 static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) 6046 { 6047 int ret = 0; 6048 6049 btrfs_inode_lock(dir, 0); 6050 if (dir->index_cnt == (u64)-1) { 6051 ret = btrfs_inode_delayed_dir_index_count(dir); 6052 if (ret) { 6053 ret = btrfs_set_inode_index_count(dir); 6054 if (ret) 6055 goto out; 6056 } 6057 } 6058 6059 /* index_cnt is the index number of next new entry, so decrement it. 
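 * The value handed back is therefore the index of the last entry that
 * currently exists in the directory.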
*/ 6060 *index = dir->index_cnt - 1; 6061 out: 6062 btrfs_inode_unlock(dir, 0); 6063 6064 return ret; 6065 } 6066 6067 /* 6068 * All this infrastructure exists because dir_emit can fault, and we are holding 6069 * the tree lock when doing readdir. For now just allocate a buffer and copy 6070 * our information into that, and then dir_emit from the buffer. This is 6071 * similar to what NFS does, only we don't keep the buffer around in pagecache 6072 * because I'm afraid I'll mess that up. Long term we need to make filldir do 6073 * copy_to_user_inatomic so we don't have to worry about page faulting under the 6074 * tree lock. 6075 */ 6076 static int btrfs_opendir(struct inode *inode, struct file *file) 6077 { 6078 struct btrfs_file_private *private; 6079 u64 last_index; 6080 int ret; 6081 6082 ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index); 6083 if (ret) 6084 return ret; 6085 6086 private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); 6087 if (!private) 6088 return -ENOMEM; 6089 private->last_index = last_index; 6090 private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); 6091 if (!private->filldir_buf) { 6092 kfree(private); 6093 return -ENOMEM; 6094 } 6095 file->private_data = private; 6096 return 0; 6097 } 6098 6099 static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence) 6100 { 6101 struct btrfs_file_private *private = file->private_data; 6102 int ret; 6103 6104 ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)), 6105 &private->last_index); 6106 if (ret) 6107 return ret; 6108 6109 return generic_file_llseek(file, offset, whence); 6110 } 6111 6112 struct dir_entry { 6113 u64 ino; 6114 u64 offset; 6115 unsigned type; 6116 int name_len; 6117 }; 6118 6119 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) 6120 { 6121 while (entries--) { 6122 struct dir_entry *entry = addr; 6123 char *name = (char *)(entry + 1); 6124 6125 ctx->pos = get_unaligned(&entry->offset); 6126 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len), 6127 get_unaligned(&entry->ino), 6128 get_unaligned(&entry->type))) 6129 return 1; 6130 addr += sizeof(struct dir_entry) + 6131 get_unaligned(&entry->name_len); 6132 ctx->pos++; 6133 } 6134 return 0; 6135 } 6136 6137 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) 6138 { 6139 struct inode *inode = file_inode(file); 6140 struct btrfs_root *root = BTRFS_I(inode)->root; 6141 struct btrfs_file_private *private = file->private_data; 6142 struct btrfs_dir_item *di; 6143 struct btrfs_key key; 6144 struct btrfs_key found_key; 6145 BTRFS_PATH_AUTO_FREE(path); 6146 void *addr; 6147 LIST_HEAD(ins_list); 6148 LIST_HEAD(del_list); 6149 int ret; 6150 char *name_ptr; 6151 int name_len; 6152 int entries = 0; 6153 int total_len = 0; 6154 bool put = false; 6155 struct btrfs_key location; 6156 6157 if (!dir_emit_dots(file, ctx)) 6158 return 0; 6159 6160 path = btrfs_alloc_path(); 6161 if (!path) 6162 return -ENOMEM; 6163 6164 addr = private->filldir_buf; 6165 path->reada = READA_FORWARD; 6166 6167 put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index, 6168 &ins_list, &del_list); 6169 6170 again: 6171 key.type = BTRFS_DIR_INDEX_KEY; 6172 key.offset = ctx->pos; 6173 key.objectid = btrfs_ino(BTRFS_I(inode)); 6174 6175 btrfs_for_each_slot(root, &key, &found_key, path, ret) { 6176 struct dir_entry *entry; 6177 struct extent_buffer *leaf = path->nodes[0]; 6178 u8 ftype; 6179 6180 if (found_key.objectid != key.objectid) 6181 break; 6182 if (found_key.type != BTRFS_DIR_INDEX_KEY) 
6183 break; 6184 if (found_key.offset < ctx->pos) 6185 continue; 6186 if (found_key.offset > private->last_index) 6187 break; 6188 if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) 6189 continue; 6190 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); 6191 name_len = btrfs_dir_name_len(leaf, di); 6192 if ((total_len + sizeof(struct dir_entry) + name_len) >= 6193 PAGE_SIZE) { 6194 btrfs_release_path(path); 6195 ret = btrfs_filldir(private->filldir_buf, entries, ctx); 6196 if (ret) 6197 goto nopos; 6198 addr = private->filldir_buf; 6199 entries = 0; 6200 total_len = 0; 6201 goto again; 6202 } 6203 6204 ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di)); 6205 entry = addr; 6206 name_ptr = (char *)(entry + 1); 6207 read_extent_buffer(leaf, name_ptr, 6208 (unsigned long)(di + 1), name_len); 6209 put_unaligned(name_len, &entry->name_len); 6210 put_unaligned(fs_ftype_to_dtype(ftype), &entry->type); 6211 btrfs_dir_item_key_to_cpu(leaf, di, &location); 6212 put_unaligned(location.objectid, &entry->ino); 6213 put_unaligned(found_key.offset, &entry->offset); 6214 entries++; 6215 addr += sizeof(struct dir_entry) + name_len; 6216 total_len += sizeof(struct dir_entry) + name_len; 6217 } 6218 /* Catch error encountered during iteration */ 6219 if (ret < 0) 6220 goto err; 6221 6222 btrfs_release_path(path); 6223 6224 ret = btrfs_filldir(private->filldir_buf, entries, ctx); 6225 if (ret) 6226 goto nopos; 6227 6228 if (btrfs_readdir_delayed_dir_index(ctx, &ins_list)) 6229 goto nopos; 6230 6231 /* 6232 * Stop new entries from being returned after we return the last 6233 * entry. 6234 * 6235 * New directory entries are assigned a strictly increasing 6236 * offset. This means that new entries created during readdir 6237 * are *guaranteed* to be seen in the future by that readdir. 6238 * This has broken buggy programs which operate on names as 6239 * they're returned by readdir. Until we reuse freed offsets 6240 * we have this hack to stop new entries from being returned 6241 * under the assumption that they'll never reach this huge 6242 * offset. 6243 * 6244 * This is being careful not to overflow 32bit loff_t unless the 6245 * last entry requires it because doing so has broken 32bit apps 6246 * in the past. 6247 */ 6248 if (ctx->pos >= INT_MAX) 6249 ctx->pos = LLONG_MAX; 6250 else 6251 ctx->pos = INT_MAX; 6252 nopos: 6253 ret = 0; 6254 err: 6255 if (put) 6256 btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list); 6257 return ret; 6258 } 6259 6260 /* 6261 * This is somewhat expensive, updating the tree every time the 6262 * inode changes. But, it is most likely to find the inode in cache. 6263 * FIXME, needs more benchmarking...there are no reasons other than performance 6264 * to keep or drop this code. 
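 *
 * Note that the join-transaction fast path below can fail with -ENOSPC
 * or -EDQUOT; when that happens the update is retried with a full
 * btrfs_start_transaction() so that proper space reservation is done.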
6265 */ 6266 static int btrfs_dirty_inode(struct btrfs_inode *inode) 6267 { 6268 struct btrfs_root *root = inode->root; 6269 struct btrfs_fs_info *fs_info = root->fs_info; 6270 struct btrfs_trans_handle *trans; 6271 int ret; 6272 6273 if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags)) 6274 return 0; 6275 6276 trans = btrfs_join_transaction(root); 6277 if (IS_ERR(trans)) 6278 return PTR_ERR(trans); 6279 6280 ret = btrfs_update_inode(trans, inode); 6281 if (ret == -ENOSPC || ret == -EDQUOT) { 6282 /* whoops, lets try again with the full transaction */ 6283 btrfs_end_transaction(trans); 6284 trans = btrfs_start_transaction(root, 1); 6285 if (IS_ERR(trans)) 6286 return PTR_ERR(trans); 6287 6288 ret = btrfs_update_inode(trans, inode); 6289 } 6290 btrfs_end_transaction(trans); 6291 if (inode->delayed_node) 6292 btrfs_balance_delayed_items(fs_info); 6293 6294 return ret; 6295 } 6296 6297 /* 6298 * We need our own ->update_time so that we can return error on ENOSPC for 6299 * updating the inode in the case of file write and mmap writes. 6300 */ 6301 static int btrfs_update_time(struct inode *inode, int flags) 6302 { 6303 struct btrfs_root *root = BTRFS_I(inode)->root; 6304 bool dirty; 6305 6306 if (btrfs_root_readonly(root)) 6307 return -EROFS; 6308 6309 dirty = inode_update_timestamps(inode, flags); 6310 return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0; 6311 } 6312 6313 /* 6314 * helper to find a free sequence number in a given directory. This current 6315 * code is very simple, later versions will do smarter things in the btree 6316 */ 6317 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index) 6318 { 6319 int ret = 0; 6320 6321 if (dir->index_cnt == (u64)-1) { 6322 ret = btrfs_inode_delayed_dir_index_count(dir); 6323 if (ret) { 6324 ret = btrfs_set_inode_index_count(dir); 6325 if (ret) 6326 return ret; 6327 } 6328 } 6329 6330 *index = dir->index_cnt; 6331 dir->index_cnt++; 6332 6333 return ret; 6334 } 6335 6336 static int btrfs_insert_inode_locked(struct inode *inode) 6337 { 6338 struct btrfs_iget_args args; 6339 6340 args.ino = btrfs_ino(BTRFS_I(inode)); 6341 args.root = BTRFS_I(inode)->root; 6342 6343 return insert_inode_locked4(inode, 6344 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), 6345 btrfs_find_actor, &args); 6346 } 6347 6348 int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, 6349 unsigned int *trans_num_items) 6350 { 6351 struct inode *dir = args->dir; 6352 struct inode *inode = args->inode; 6353 int ret; 6354 6355 if (!args->orphan) { 6356 ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0, 6357 &args->fname); 6358 if (ret) 6359 return ret; 6360 } 6361 6362 ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); 6363 if (ret) { 6364 fscrypt_free_filename(&args->fname); 6365 return ret; 6366 } 6367 6368 /* 1 to add inode item */ 6369 *trans_num_items = 1; 6370 /* 1 to add compression property */ 6371 if (BTRFS_I(dir)->prop_compress) 6372 (*trans_num_items)++; 6373 /* 1 to add default ACL xattr */ 6374 if (args->default_acl) 6375 (*trans_num_items)++; 6376 /* 1 to add access ACL xattr */ 6377 if (args->acl) 6378 (*trans_num_items)++; 6379 #ifdef CONFIG_SECURITY 6380 /* 1 to add LSM xattr */ 6381 if (dir->i_security) 6382 (*trans_num_items)++; 6383 #endif 6384 if (args->orphan) { 6385 /* 1 to add orphan item */ 6386 (*trans_num_items)++; 6387 } else { 6388 /* 6389 * 1 to add dir item 6390 * 1 to add dir index 6391 * 1 to update parent inode item 6392 * 6393 * No need for 1 unit for the inode ref item because it is 6394 * 
inserted in a batch together with the inode item at 6395 * btrfs_create_new_inode(). 6396 */ 6397 *trans_num_items += 3; 6398 } 6399 return 0; 6400 } 6401 6402 void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) 6403 { 6404 posix_acl_release(args->acl); 6405 posix_acl_release(args->default_acl); 6406 fscrypt_free_filename(&args->fname); 6407 } 6408 6409 /* 6410 * Inherit flags from the parent inode. 6411 * 6412 * Currently only the compression flags and the cow flags are inherited. 6413 */ 6414 static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir) 6415 { 6416 unsigned int flags; 6417 6418 flags = dir->flags; 6419 6420 if (flags & BTRFS_INODE_NOCOMPRESS) { 6421 inode->flags &= ~BTRFS_INODE_COMPRESS; 6422 inode->flags |= BTRFS_INODE_NOCOMPRESS; 6423 } else if (flags & BTRFS_INODE_COMPRESS) { 6424 inode->flags &= ~BTRFS_INODE_NOCOMPRESS; 6425 inode->flags |= BTRFS_INODE_COMPRESS; 6426 } 6427 6428 if (flags & BTRFS_INODE_NODATACOW) { 6429 inode->flags |= BTRFS_INODE_NODATACOW; 6430 if (S_ISREG(inode->vfs_inode.i_mode)) 6431 inode->flags |= BTRFS_INODE_NODATASUM; 6432 } 6433 6434 btrfs_sync_inode_flags_to_i_flags(inode); 6435 } 6436 6437 int btrfs_create_new_inode(struct btrfs_trans_handle *trans, 6438 struct btrfs_new_inode_args *args) 6439 { 6440 struct timespec64 ts; 6441 struct inode *dir = args->dir; 6442 struct inode *inode = args->inode; 6443 const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name; 6444 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); 6445 struct btrfs_root *root; 6446 struct btrfs_inode_item *inode_item; 6447 struct btrfs_path *path; 6448 u64 objectid; 6449 struct btrfs_inode_ref *ref; 6450 struct btrfs_key key[2]; 6451 u32 sizes[2]; 6452 struct btrfs_item_batch batch; 6453 unsigned long ptr; 6454 int ret; 6455 bool xa_reserved = false; 6456 6457 path = btrfs_alloc_path(); 6458 if (!path) 6459 return -ENOMEM; 6460 6461 if (!args->subvol) 6462 BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); 6463 root = BTRFS_I(inode)->root; 6464 6465 ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); 6466 if (ret) 6467 goto out; 6468 6469 ret = btrfs_get_free_objectid(root, &objectid); 6470 if (ret) 6471 goto out; 6472 btrfs_set_inode_number(BTRFS_I(inode), objectid); 6473 6474 ret = xa_reserve(&root->inodes, objectid, GFP_NOFS); 6475 if (ret) 6476 goto out; 6477 xa_reserved = true; 6478 6479 if (args->orphan) { 6480 /* 6481 * O_TMPFILE, set link count to 0, so that after this point, we 6482 * fill in an inode item with the correct link count. 6483 */ 6484 set_nlink(inode, 0); 6485 } else { 6486 trace_btrfs_inode_request(dir); 6487 6488 ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index); 6489 if (ret) 6490 goto out; 6491 } 6492 6493 if (S_ISDIR(inode->i_mode)) 6494 BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; 6495 6496 BTRFS_I(inode)->generation = trans->transid; 6497 inode->i_generation = BTRFS_I(inode)->generation; 6498 6499 /* 6500 * We don't have any capability xattrs set here yet, shortcut any 6501 * queries for the xattrs here. If we add them later via the inode 6502 * security init path or any other path this flag will be cleared. 6503 */ 6504 set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); 6505 6506 /* 6507 * Subvolumes don't inherit flags from their parent directory. 6508 * Originally this was probably by accident, but we probably can't 6509 * change it now without compatibility issues. 
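 *
 * For ordinary new inodes, btrfs_inherit_iflags() below copies the
 * parent directory's COMPRESS/NOCOMPRESS hint and, if the parent is
 * NODATACOW, marks the new inode NODATACOW (plus NODATASUM for regular
 * files).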
6510 */ 6511 if (!args->subvol) 6512 btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); 6513 6514 btrfs_set_inode_mapping_order(BTRFS_I(inode)); 6515 if (S_ISREG(inode->i_mode)) { 6516 if (btrfs_test_opt(fs_info, NODATASUM)) 6517 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 6518 if (btrfs_test_opt(fs_info, NODATACOW)) 6519 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | 6520 BTRFS_INODE_NODATASUM; 6521 btrfs_update_inode_mapping_flags(BTRFS_I(inode)); 6522 } 6523 6524 ret = btrfs_insert_inode_locked(inode); 6525 if (ret < 0) { 6526 if (!args->orphan) 6527 BTRFS_I(dir)->index_cnt--; 6528 goto out; 6529 } 6530 6531 /* 6532 * We could have gotten an inode number from somebody who was fsynced 6533 * and then removed in this same transaction, so let's just set full 6534 * sync since it will be a full sync anyway and this will blow away the 6535 * old info in the log. 6536 */ 6537 btrfs_set_inode_full_sync(BTRFS_I(inode)); 6538 6539 key[0].objectid = objectid; 6540 key[0].type = BTRFS_INODE_ITEM_KEY; 6541 key[0].offset = 0; 6542 6543 sizes[0] = sizeof(struct btrfs_inode_item); 6544 6545 if (!args->orphan) { 6546 /* 6547 * Start new inodes with an inode_ref. This is slightly more 6548 * efficient for small numbers of hard links since they will 6549 * be packed into one item. Extended refs will kick in if we 6550 * add more hard links than can fit in the ref item. 6551 */ 6552 key[1].objectid = objectid; 6553 key[1].type = BTRFS_INODE_REF_KEY; 6554 if (args->subvol) { 6555 key[1].offset = objectid; 6556 sizes[1] = 2 + sizeof(*ref); 6557 } else { 6558 key[1].offset = btrfs_ino(BTRFS_I(dir)); 6559 sizes[1] = name->len + sizeof(*ref); 6560 } 6561 } 6562 6563 batch.keys = &key[0]; 6564 batch.data_sizes = &sizes[0]; 6565 batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]); 6566 batch.nr = args->orphan ? 1 : 2; 6567 ret = btrfs_insert_empty_items(trans, root, path, &batch); 6568 if (unlikely(ret != 0)) { 6569 btrfs_abort_transaction(trans, ret); 6570 goto discard; 6571 } 6572 6573 ts = simple_inode_init_ts(inode); 6574 BTRFS_I(inode)->i_otime_sec = ts.tv_sec; 6575 BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; 6576 6577 /* 6578 * We're going to fill the inode item now, so at this point the inode 6579 * must be fully initialized. 6580 */ 6581 6582 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 6583 struct btrfs_inode_item); 6584 memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item, 6585 sizeof(*inode_item)); 6586 fill_inode_item(trans, path->nodes[0], inode_item, inode); 6587 6588 if (!args->orphan) { 6589 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 6590 struct btrfs_inode_ref); 6591 ptr = (unsigned long)(ref + 1); 6592 if (args->subvol) { 6593 btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2); 6594 btrfs_set_inode_ref_index(path->nodes[0], ref, 0); 6595 write_extent_buffer(path->nodes[0], "..", ptr, 2); 6596 } else { 6597 btrfs_set_inode_ref_name_len(path->nodes[0], ref, 6598 name->len); 6599 btrfs_set_inode_ref_index(path->nodes[0], ref, 6600 BTRFS_I(inode)->dir_index); 6601 write_extent_buffer(path->nodes[0], name->name, ptr, 6602 name->len); 6603 } 6604 } 6605 6606 /* 6607 * We don't need the path anymore, plus inheriting properties, adding 6608 * ACLs, security xattrs, orphan item or adding the link, will result in 6609 * allocating yet another path. So just free our path. 
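 * (Clearing the pointer below also keeps the shared 'out' label, which
 * frees the path again, from doing a double free.)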
6610 */ 6611 btrfs_free_path(path); 6612 path = NULL; 6613 6614 if (args->subvol) { 6615 struct btrfs_inode *parent; 6616 6617 /* 6618 * Subvolumes inherit properties from their parent subvolume, 6619 * not the directory they were created in. 6620 */ 6621 parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root); 6622 if (IS_ERR(parent)) { 6623 ret = PTR_ERR(parent); 6624 } else { 6625 ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), 6626 parent); 6627 iput(&parent->vfs_inode); 6628 } 6629 } else { 6630 ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), 6631 BTRFS_I(dir)); 6632 } 6633 if (ret) { 6634 btrfs_err(fs_info, 6635 "error inheriting props for ino %llu (root %llu): %d", 6636 btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); 6637 } 6638 6639 /* 6640 * Subvolumes don't inherit ACLs or get passed to the LSM. This is 6641 * probably a bug. 6642 */ 6643 if (!args->subvol) { 6644 ret = btrfs_init_inode_security(trans, args); 6645 if (unlikely(ret)) { 6646 btrfs_abort_transaction(trans, ret); 6647 goto discard; 6648 } 6649 } 6650 6651 ret = btrfs_add_inode_to_root(BTRFS_I(inode), false); 6652 if (WARN_ON(ret)) { 6653 /* Shouldn't happen, we used xa_reserve() before. */ 6654 btrfs_abort_transaction(trans, ret); 6655 goto discard; 6656 } 6657 6658 trace_btrfs_inode_new(inode); 6659 btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); 6660 6661 btrfs_update_root_times(trans, root); 6662 6663 if (args->orphan) { 6664 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 6665 if (unlikely(ret)) { 6666 btrfs_abort_transaction(trans, ret); 6667 goto discard; 6668 } 6669 } else { 6670 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 6671 0, BTRFS_I(inode)->dir_index); 6672 if (unlikely(ret)) { 6673 btrfs_abort_transaction(trans, ret); 6674 goto discard; 6675 } 6676 } 6677 6678 return 0; 6679 6680 discard: 6681 /* 6682 * discard_new_inode() calls iput(), but the caller owns the reference 6683 * to the inode. 6684 */ 6685 ihold(inode); 6686 discard_new_inode(inode); 6687 out: 6688 if (xa_reserved) 6689 xa_release(&root->inodes, objectid); 6690 6691 btrfs_free_path(path); 6692 return ret; 6693 } 6694 6695 /* 6696 * utility function to add 'inode' into 'parent_inode' with 6697 * a give name and a given sequence number. 6698 * if 'add_backref' is true, also insert a backref from the 6699 * inode to the parent directory. 
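 *
 * Note that when 'inode' is a subvolume root (ino equals
 * BTRFS_FIRST_FREE_OBJECTID) the directory entry points at the
 * subvolume's root key and a root ref item is inserted instead of an
 * inode ref.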
6700 */ 6701 int btrfs_add_link(struct btrfs_trans_handle *trans, 6702 struct btrfs_inode *parent_inode, struct btrfs_inode *inode, 6703 const struct fscrypt_str *name, bool add_backref, u64 index) 6704 { 6705 int ret = 0; 6706 struct btrfs_key key; 6707 struct btrfs_root *root = parent_inode->root; 6708 u64 ino = btrfs_ino(inode); 6709 u64 parent_ino = btrfs_ino(parent_inode); 6710 6711 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6712 memcpy(&key, &inode->root->root_key, sizeof(key)); 6713 } else { 6714 key.objectid = ino; 6715 key.type = BTRFS_INODE_ITEM_KEY; 6716 key.offset = 0; 6717 } 6718 6719 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6720 ret = btrfs_add_root_ref(trans, key.objectid, 6721 btrfs_root_id(root), parent_ino, 6722 index, name); 6723 } else if (add_backref) { 6724 ret = btrfs_insert_inode_ref(trans, root, name, 6725 ino, parent_ino, index); 6726 } 6727 6728 /* Nothing to clean up yet */ 6729 if (ret) 6730 return ret; 6731 6732 ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, 6733 btrfs_inode_type(inode), index); 6734 if (ret == -EEXIST || ret == -EOVERFLOW) 6735 goto fail_dir_item; 6736 else if (unlikely(ret)) { 6737 btrfs_abort_transaction(trans, ret); 6738 return ret; 6739 } 6740 6741 btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + 6742 name->len * 2); 6743 inode_inc_iversion(&parent_inode->vfs_inode); 6744 update_time_after_link_or_unlink(parent_inode); 6745 6746 ret = btrfs_update_inode(trans, parent_inode); 6747 if (ret) 6748 btrfs_abort_transaction(trans, ret); 6749 return ret; 6750 6751 fail_dir_item: 6752 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6753 u64 local_index; 6754 int ret2; 6755 6756 ret2 = btrfs_del_root_ref(trans, key.objectid, btrfs_root_id(root), 6757 parent_ino, &local_index, name); 6758 if (ret2) 6759 btrfs_abort_transaction(trans, ret2); 6760 } else if (add_backref) { 6761 int ret2; 6762 6763 ret2 = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, NULL); 6764 if (ret2) 6765 btrfs_abort_transaction(trans, ret2); 6766 } 6767 6768 /* Return the original error code */ 6769 return ret; 6770 } 6771 6772 static int btrfs_create_common(struct inode *dir, struct dentry *dentry, 6773 struct inode *inode) 6774 { 6775 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); 6776 struct btrfs_root *root = BTRFS_I(dir)->root; 6777 struct btrfs_new_inode_args new_inode_args = { 6778 .dir = dir, 6779 .dentry = dentry, 6780 .inode = inode, 6781 }; 6782 unsigned int trans_num_items; 6783 struct btrfs_trans_handle *trans; 6784 int ret; 6785 6786 ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); 6787 if (ret) 6788 goto out_inode; 6789 6790 trans = btrfs_start_transaction(root, trans_num_items); 6791 if (IS_ERR(trans)) { 6792 ret = PTR_ERR(trans); 6793 goto out_new_inode_args; 6794 } 6795 6796 ret = btrfs_create_new_inode(trans, &new_inode_args); 6797 if (!ret) { 6798 if (S_ISDIR(inode->i_mode)) 6799 inode->i_opflags |= IOP_FASTPERM_MAY_EXEC; 6800 d_instantiate_new(dentry, inode); 6801 } 6802 6803 btrfs_end_transaction(trans); 6804 btrfs_btree_balance_dirty(fs_info); 6805 out_new_inode_args: 6806 btrfs_new_inode_args_destroy(&new_inode_args); 6807 out_inode: 6808 if (ret) 6809 iput(inode); 6810 return ret; 6811 } 6812 6813 static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir, 6814 struct dentry *dentry, umode_t mode, dev_t rdev) 6815 { 6816 struct inode *inode; 6817 6818 inode = new_inode(dir->i_sb); 6819 if (!inode) 6820 return -ENOMEM; 6821 inode_init_owner(idmap, inode, dir, mode); 
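	/* Device nodes, FIFOs and sockets all use the special inode operations. */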
6822 inode->i_op = &btrfs_special_inode_operations; 6823 init_special_inode(inode, inode->i_mode, rdev); 6824 return btrfs_create_common(dir, dentry, inode); 6825 } 6826 6827 static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir, 6828 struct dentry *dentry, umode_t mode, bool excl) 6829 { 6830 struct inode *inode; 6831 6832 inode = new_inode(dir->i_sb); 6833 if (!inode) 6834 return -ENOMEM; 6835 inode_init_owner(idmap, inode, dir, mode); 6836 inode->i_fop = &btrfs_file_operations; 6837 inode->i_op = &btrfs_file_inode_operations; 6838 inode->i_mapping->a_ops = &btrfs_aops; 6839 return btrfs_create_common(dir, dentry, inode); 6840 } 6841 6842 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 6843 struct dentry *dentry) 6844 { 6845 struct btrfs_trans_handle *trans = NULL; 6846 struct btrfs_root *root = BTRFS_I(dir)->root; 6847 struct inode *inode = d_inode(old_dentry); 6848 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 6849 struct fscrypt_name fname; 6850 u64 index; 6851 int ret; 6852 6853 /* do not allow sys_link's with other subvols of the same device */ 6854 if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root)) 6855 return -EXDEV; 6856 6857 if (inode->i_nlink >= BTRFS_LINK_MAX) 6858 return -EMLINK; 6859 6860 ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); 6861 if (ret) 6862 goto fail; 6863 6864 ret = btrfs_set_inode_index(BTRFS_I(dir), &index); 6865 if (ret) 6866 goto fail; 6867 6868 /* 6869 * 2 items for inode and inode ref 6870 * 2 items for dir items 6871 * 1 item for parent inode 6872 * 1 item for orphan item deletion if O_TMPFILE 6873 */ 6874 trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); 6875 if (IS_ERR(trans)) { 6876 ret = PTR_ERR(trans); 6877 trans = NULL; 6878 goto fail; 6879 } 6880 6881 /* There are several dir indexes for this inode, clear the cache. */ 6882 BTRFS_I(inode)->dir_index = 0ULL; 6883 inode_inc_iversion(inode); 6884 inode_set_ctime_current(inode); 6885 6886 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 6887 &fname.disk_name, 1, index); 6888 if (ret) 6889 goto fail; 6890 6891 /* Link added now we update the inode item with the new link count. */ 6892 inc_nlink(inode); 6893 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 6894 if (unlikely(ret)) { 6895 btrfs_abort_transaction(trans, ret); 6896 goto fail; 6897 } 6898 6899 if (inode->i_nlink == 1) { 6900 /* 6901 * If the new hard link count is 1, it's a file created with the 6902 * open(2) O_TMPFILE flag. 6903 */ 6904 ret = btrfs_orphan_del(trans, BTRFS_I(inode)); 6905 if (unlikely(ret)) { 6906 btrfs_abort_transaction(trans, ret); 6907 goto fail; 6908 } 6909 } 6910 6911 /* Grab reference for the new dentry passed to d_instantiate(). 
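 * d_instantiate() takes over one inode reference, and the only
 * reference available here is the one owned by old_dentry, so an extra
 * one is taken before handing the inode to the new dentry.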
*/ 6912 ihold(inode); 6913 d_instantiate(dentry, inode); 6914 btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent); 6915 6916 fail: 6917 fscrypt_free_filename(&fname); 6918 if (trans) 6919 btrfs_end_transaction(trans); 6920 btrfs_btree_balance_dirty(fs_info); 6921 return ret; 6922 } 6923 6924 static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, 6925 struct dentry *dentry, umode_t mode) 6926 { 6927 struct inode *inode; 6928 6929 inode = new_inode(dir->i_sb); 6930 if (!inode) 6931 return ERR_PTR(-ENOMEM); 6932 inode_init_owner(idmap, inode, dir, S_IFDIR | mode); 6933 inode->i_op = &btrfs_dir_inode_operations; 6934 inode->i_fop = &btrfs_dir_file_operations; 6935 return ERR_PTR(btrfs_create_common(dir, dentry, inode)); 6936 } 6937 6938 static noinline int uncompress_inline(struct btrfs_path *path, 6939 struct folio *folio, 6940 struct btrfs_file_extent_item *item) 6941 { 6942 int ret; 6943 struct extent_buffer *leaf = path->nodes[0]; 6944 const u32 blocksize = leaf->fs_info->sectorsize; 6945 char *tmp; 6946 size_t max_size; 6947 unsigned long inline_size; 6948 unsigned long ptr; 6949 int compress_type; 6950 6951 compress_type = btrfs_file_extent_compression(leaf, item); 6952 max_size = btrfs_file_extent_ram_bytes(leaf, item); 6953 inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); 6954 tmp = kmalloc(inline_size, GFP_NOFS); 6955 if (!tmp) 6956 return -ENOMEM; 6957 ptr = btrfs_file_extent_inline_start(item); 6958 6959 read_extent_buffer(leaf, tmp, ptr, inline_size); 6960 6961 max_size = min_t(unsigned long, blocksize, max_size); 6962 ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, 6963 max_size); 6964 6965 /* 6966 * decompression code contains a memset to fill in any space between the end 6967 * of the uncompressed data and the end of max_size in case the decompressed 6968 * data ends up shorter than ram_bytes. That doesn't cover the hole between 6969 * the end of an inline extent and the beginning of the next block, so we 6970 * cover that region here. 6971 */ 6972 6973 if (max_size < blocksize) 6974 folio_zero_range(folio, max_size, blocksize - max_size); 6975 kfree(tmp); 6976 return ret; 6977 } 6978 6979 static int read_inline_extent(struct btrfs_path *path, struct folio *folio) 6980 { 6981 const u32 blocksize = path->nodes[0]->fs_info->sectorsize; 6982 struct btrfs_file_extent_item *fi; 6983 void *kaddr; 6984 size_t copy_size; 6985 6986 if (!folio || folio_test_uptodate(folio)) 6987 return 0; 6988 6989 ASSERT(folio_pos(folio) == 0); 6990 6991 fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 6992 struct btrfs_file_extent_item); 6993 if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) 6994 return uncompress_inline(path, folio, fi); 6995 6996 copy_size = min_t(u64, blocksize, 6997 btrfs_file_extent_ram_bytes(path->nodes[0], fi)); 6998 kaddr = kmap_local_folio(folio, 0); 6999 read_extent_buffer(path->nodes[0], kaddr, 7000 btrfs_file_extent_inline_start(fi), copy_size); 7001 kunmap_local(kaddr); 7002 if (copy_size < blocksize) 7003 folio_zero_range(folio, copy_size, blocksize - copy_size); 7004 return 0; 7005 } 7006 7007 /* 7008 * Lookup the first extent overlapping a range in a file. 
7009 * 7010 * @inode: file to search in 7011 * @page: page to read extent data into if the extent is inline 7012 * @start: file offset 7013 * @len: length of range starting at @start 7014 * 7015 * Return the first &struct extent_map which overlaps the given range, reading 7016 * it from the B-tree and caching it if necessary. Note that there may be more 7017 * extents which overlap the given range after the returned extent_map. 7018 * 7019 * If @page is not NULL and the extent is inline, this also reads the extent 7020 * data directly into the page and marks the extent up to date in the io_tree. 7021 * 7022 * Return: ERR_PTR on error, non-NULL extent_map on success. 7023 */ 7024 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, 7025 struct folio *folio, u64 start, u64 len) 7026 { 7027 struct btrfs_fs_info *fs_info = inode->root->fs_info; 7028 int ret = 0; 7029 u64 extent_start = 0; 7030 u64 extent_end = 0; 7031 u64 objectid = btrfs_ino(inode); 7032 int extent_type = -1; 7033 struct btrfs_path *path = NULL; 7034 struct btrfs_root *root = inode->root; 7035 struct btrfs_file_extent_item *item; 7036 struct extent_buffer *leaf; 7037 struct btrfs_key found_key; 7038 struct extent_map *em = NULL; 7039 struct extent_map_tree *em_tree = &inode->extent_tree; 7040 7041 read_lock(&em_tree->lock); 7042 em = btrfs_lookup_extent_mapping(em_tree, start, len); 7043 read_unlock(&em_tree->lock); 7044 7045 if (em) { 7046 if (em->start > start || em->start + em->len <= start) 7047 btrfs_free_extent_map(em); 7048 else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) 7049 btrfs_free_extent_map(em); 7050 else 7051 goto out; 7052 } 7053 em = btrfs_alloc_extent_map(); 7054 if (!em) { 7055 ret = -ENOMEM; 7056 goto out; 7057 } 7058 em->start = EXTENT_MAP_HOLE; 7059 em->disk_bytenr = EXTENT_MAP_HOLE; 7060 em->len = (u64)-1; 7061 7062 path = btrfs_alloc_path(); 7063 if (!path) { 7064 ret = -ENOMEM; 7065 goto out; 7066 } 7067 7068 /* Chances are we'll be called again, so go ahead and do readahead */ 7069 path->reada = READA_FORWARD; 7070 7071 /* 7072 * The same explanation in load_free_space_cache applies here as well, 7073 * we only read when we're loading the free space cache, and at that 7074 * point the commit_root has everything we need. 7075 */ 7076 if (btrfs_is_free_space_inode(inode)) { 7077 path->search_commit_root = 1; 7078 path->skip_locking = 1; 7079 } 7080 7081 ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); 7082 if (ret < 0) { 7083 goto out; 7084 } else if (ret > 0) { 7085 if (path->slots[0] == 0) 7086 goto not_found; 7087 path->slots[0]--; 7088 ret = 0; 7089 } 7090 7091 leaf = path->nodes[0]; 7092 item = btrfs_item_ptr(leaf, path->slots[0], 7093 struct btrfs_file_extent_item); 7094 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 7095 if (found_key.objectid != objectid || 7096 found_key.type != BTRFS_EXTENT_DATA_KEY) { 7097 /* 7098 * If we backup past the first extent we want to move forward 7099 * and see if there is an extent in front of us, otherwise we'll 7100 * say there is a hole for our whole search range which can 7101 * cause problems. 
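 *
 * Setting extent_end to 'start' below makes the 'next:' block step
 * forward to the following item rather than treating the whole search
 * range as a hole.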
7102 */ 7103 extent_end = start; 7104 goto next; 7105 } 7106 7107 extent_type = btrfs_file_extent_type(leaf, item); 7108 extent_start = found_key.offset; 7109 extent_end = btrfs_file_extent_end(path); 7110 if (extent_type == BTRFS_FILE_EXTENT_REG || 7111 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 7112 /* Only regular file could have regular/prealloc extent */ 7113 if (unlikely(!S_ISREG(inode->vfs_inode.i_mode))) { 7114 ret = -EUCLEAN; 7115 btrfs_crit(fs_info, 7116 "regular/prealloc extent found for non-regular inode %llu", 7117 btrfs_ino(inode)); 7118 goto out; 7119 } 7120 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, 7121 extent_start); 7122 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 7123 trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, 7124 path->slots[0], 7125 extent_start); 7126 } 7127 next: 7128 if (start >= extent_end) { 7129 path->slots[0]++; 7130 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 7131 ret = btrfs_next_leaf(root, path); 7132 if (ret < 0) 7133 goto out; 7134 else if (ret > 0) 7135 goto not_found; 7136 7137 leaf = path->nodes[0]; 7138 } 7139 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 7140 if (found_key.objectid != objectid || 7141 found_key.type != BTRFS_EXTENT_DATA_KEY) 7142 goto not_found; 7143 if (start + len <= found_key.offset) 7144 goto not_found; 7145 if (start > found_key.offset) 7146 goto next; 7147 7148 /* New extent overlaps with existing one */ 7149 em->start = start; 7150 em->len = found_key.offset - start; 7151 em->disk_bytenr = EXTENT_MAP_HOLE; 7152 goto insert; 7153 } 7154 7155 btrfs_extent_item_to_extent_map(inode, path, item, em); 7156 7157 if (extent_type == BTRFS_FILE_EXTENT_REG || 7158 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 7159 goto insert; 7160 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 7161 /* 7162 * Inline extent can only exist at file offset 0. This is 7163 * ensured by tree-checker and inline extent creation path. 7164 * Thus all members representing file offsets should be zero. 7165 */ 7166 ASSERT(extent_start == 0); 7167 ASSERT(em->start == 0); 7168 7169 /* 7170 * btrfs_extent_item_to_extent_map() should have properly 7171 * initialized em members already. 7172 * 7173 * Other members are not utilized for inline extents. 7174 */ 7175 ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); 7176 ASSERT(em->len == fs_info->sectorsize); 7177 7178 ret = read_inline_extent(path, folio); 7179 if (ret < 0) 7180 goto out; 7181 goto insert; 7182 } 7183 not_found: 7184 em->start = start; 7185 em->len = len; 7186 em->disk_bytenr = EXTENT_MAP_HOLE; 7187 insert: 7188 ret = 0; 7189 btrfs_release_path(path); 7190 if (unlikely(em->start > start || btrfs_extent_map_end(em) <= start)) { 7191 btrfs_err(fs_info, 7192 "bad extent! 
em: [%llu %llu] passed [%llu %llu]",
7193 em->start, em->len, start, len);
7194 ret = -EIO;
7195 goto out;
7196 }
7197
7198 write_lock(&em_tree->lock);
7199 ret = btrfs_add_extent_mapping(inode, &em, start, len);
7200 write_unlock(&em_tree->lock);
7201 out:
7202 btrfs_free_path(path);
7203
7204 trace_btrfs_get_extent(root, inode, em);
7205
7206 if (ret) {
7207 btrfs_free_extent_map(em);
7208 return ERR_PTR(ret);
7209 }
7210 return em;
7211 }
7212
7213 static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
7214 {
7215 struct btrfs_block_group *block_group;
7216 bool readonly = false;
7217
7218 block_group = btrfs_lookup_block_group(fs_info, bytenr);
7219 if (!block_group || block_group->ro)
7220 readonly = true;
7221 if (block_group)
7222 btrfs_put_block_group(block_group);
7223 return readonly;
7224 }
7225
7226 /*
7227 * Check if we can do a nocow write into the range [@offset, @offset + @len)
7228 *
7229 * @offset: File offset
7230 * @len: The length to write, will be updated to the nocow writeable
7231 * range
7232 * @file_extent: (optional) Return a copy of the file extent found for the
7233 * nocow write
7234 * @nowait: Whether this is a non-blocking attempt (the btree search is
7235 * done with the path in nowait mode)
7236 * Return:
7237 * >0 and update @len if we can do a nocow write
7238 * 0 if we can't do a nocow write
7239 * <0 if an error happened
7240 *
7241 * NOTE: This only checks the file extents; the caller is responsible for
7242 * waiting for any ordered extents.
7243 */
7244 noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
7245 struct btrfs_file_extent *file_extent,
7246 bool nowait)
7247 {
7248 struct btrfs_root *root = inode->root;
7249 struct btrfs_fs_info *fs_info = root->fs_info;
7250 struct can_nocow_file_extent_args nocow_args = { 0 };
7251 BTRFS_PATH_AUTO_FREE(path);
7252 int ret;
7253 struct extent_buffer *leaf;
7254 struct extent_io_tree *io_tree = &inode->io_tree;
7255 struct btrfs_file_extent_item *fi;
7256 struct btrfs_key key;
7257 int found_type;
7258
7259 path = btrfs_alloc_path();
7260 if (!path)
7261 return -ENOMEM;
7262 path->nowait = nowait;
7263
7264 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
7265 offset, 0);
7266 if (ret < 0)
7267 return ret;
7268
7269 if (ret == 1) {
7270 if (path->slots[0] == 0) {
7271 /* Can't find the item, must COW. */
7272 return 0;
7273 }
7274 path->slots[0]--;
7275 }
7276 ret = 0;
7277 leaf = path->nodes[0];
7278 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7279 if (key.objectid != btrfs_ino(inode) ||
7280 key.type != BTRFS_EXTENT_DATA_KEY) {
7281 /* Not our file or wrong item type, must COW. */
7282 return 0;
7283 }
7284
7285 if (key.offset > offset) {
7286 /* Wrong offset, must COW. */
7287 return 0;
7288 }
7289
7290 if (btrfs_file_extent_end(path) <= offset)
7291 return 0;
7292
7293 fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
7294 found_type = btrfs_file_extent_type(leaf, fi);
7295
7296 nocow_args.start = offset;
7297 nocow_args.end = offset + *len - 1;
7298 nocow_args.free_path = true;
7299
7300 ret = can_nocow_file_extent(path, &key, inode, &nocow_args);
7301 /* can_nocow_file_extent() has freed the path. */
7302 path = NULL;
7303
7304 if (ret != 1) {
7305 /* Treat errors as not being able to NOCOW.
*/ 7306 return 0;
7307 }
7308
7309 if (btrfs_extent_readonly(fs_info,
7310 nocow_args.file_extent.disk_bytenr +
7311 nocow_args.file_extent.offset))
7312 return 0;
7313
7314 if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
7315 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7316 u64 range_end;
7317
7318 range_end = round_up(offset + nocow_args.file_extent.num_bytes,
7319 root->fs_info->sectorsize) - 1;
7320 ret = btrfs_test_range_bit_exists(io_tree, offset, range_end,
7321 EXTENT_DELALLOC);
7322 if (ret)
7323 return -EAGAIN;
7324 }
7325
7326 if (file_extent)
7327 memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
7328
7329 *len = nocow_args.file_extent.num_bytes;
7330
7331 return 1;
7332 }
7333
7334 /* The callers of this must take lock_extent() */
7335 struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
7336 const struct btrfs_file_extent *file_extent,
7337 int type)
7338 {
7339 struct extent_map *em;
7340 int ret;
7341
7342 /*
7343 * Note the missing NOCOW type.
7344 *
7345 * For pure NOCOW writes, we should not create an io extent map, but
7346 * just reuse the existing one.
7347 * Only PREALLOC writes (NOCOW write into preallocated range) can
7348 * create an io extent map.
7349 */
7350 ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7351 type == BTRFS_ORDERED_COMPRESSED ||
7352 type == BTRFS_ORDERED_REGULAR);
7353
7354 switch (type) {
7355 case BTRFS_ORDERED_PREALLOC:
7356 /* We're only referring to part of a larger preallocated extent. */
7357 ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
7358 break;
7359 case BTRFS_ORDERED_REGULAR:
7360 /* COW results in a new extent matching our file extent size. */
7361 ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes);
7362 ASSERT(file_extent->ram_bytes == file_extent->num_bytes);
7363
7364 /* Since it's a new extent, we should not have any offset. */
7365 ASSERT(file_extent->offset == 0);
7366 break;
7367 case BTRFS_ORDERED_COMPRESSED:
7368 /* Must be compressed. */
7369 ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);
7370
7371 /*
7372 * An encoded write can make us refer to only part of the
7373 * uncompressed extent.
7374 */
7375 ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
7376 break;
7377 }
7378
7379 em = btrfs_alloc_extent_map();
7380 if (!em)
7381 return ERR_PTR(-ENOMEM);
7382
7383 em->start = start;
7384 em->len = file_extent->num_bytes;
7385 em->disk_bytenr = file_extent->disk_bytenr;
7386 em->disk_num_bytes = file_extent->disk_num_bytes;
7387 em->ram_bytes = file_extent->ram_bytes;
7388 em->generation = -1;
7389 em->offset = file_extent->offset;
7390 em->flags |= EXTENT_FLAG_PINNED;
7391 if (type == BTRFS_ORDERED_COMPRESSED)
7392 btrfs_extent_map_set_compression(em, file_extent->compression);
7393
7394 ret = btrfs_replace_extent_map_range(inode, em, true);
7395 if (ret) {
7396 btrfs_free_extent_map(em);
7397 return ERR_PTR(ret);
7398 }
7399
7400 /* em got 2 refs now, the caller needs to do btrfs_free_extent_map() once. */
7401 return em;
7402 }
7403
7404 /*
7405 * For release_folio() and invalidate_folio() we have a race window where
7406 * folio_end_writeback() is called but the subpage spinlock is not yet released.
7407 * If we continue to release/invalidate the folio, we could cause a use-after-free
7408 * on the subpage spinlock. So this function spins and waits for the subpage
7409 * spinlock.
7410 */ 7411 static void wait_subpage_spinlock(struct folio *folio) 7412 { 7413 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 7414 struct btrfs_folio_state *bfs; 7415 7416 if (!btrfs_is_subpage(fs_info, folio)) 7417 return; 7418 7419 ASSERT(folio_test_private(folio) && folio_get_private(folio)); 7420 bfs = folio_get_private(folio); 7421 7422 /* 7423 * This may look insane as we just acquire the spinlock and release it, 7424 * without doing anything. But we just want to make sure no one is 7425 * still holding the subpage spinlock. 7426 * And since the page is not dirty nor writeback, and we have page 7427 * locked, the only possible way to hold a spinlock is from the endio 7428 * function to clear page writeback. 7429 * 7430 * Here we just acquire the spinlock so that all existing callers 7431 * should exit and we're safe to release/invalidate the page. 7432 */ 7433 spin_lock_irq(&bfs->lock); 7434 spin_unlock_irq(&bfs->lock); 7435 } 7436 7437 static int btrfs_launder_folio(struct folio *folio) 7438 { 7439 return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), 7440 folio_size(folio), NULL); 7441 } 7442 7443 static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) 7444 { 7445 if (try_release_extent_mapping(folio, gfp_flags)) { 7446 wait_subpage_spinlock(folio); 7447 clear_folio_extent_mapped(folio); 7448 return true; 7449 } 7450 return false; 7451 } 7452 7453 static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) 7454 { 7455 if (folio_test_writeback(folio) || folio_test_dirty(folio)) 7456 return false; 7457 return __btrfs_release_folio(folio, gfp_flags); 7458 } 7459 7460 #ifdef CONFIG_MIGRATION 7461 static int btrfs_migrate_folio(struct address_space *mapping, 7462 struct folio *dst, struct folio *src, 7463 enum migrate_mode mode) 7464 { 7465 int ret = filemap_migrate_folio(mapping, dst, src, mode); 7466 7467 if (ret) 7468 return ret; 7469 7470 if (folio_test_ordered(src)) { 7471 folio_clear_ordered(src); 7472 folio_set_ordered(dst); 7473 } 7474 7475 return 0; 7476 } 7477 #else 7478 #define btrfs_migrate_folio NULL 7479 #endif 7480 7481 static void btrfs_invalidate_folio(struct folio *folio, size_t offset, 7482 size_t length) 7483 { 7484 struct btrfs_inode *inode = folio_to_inode(folio); 7485 struct btrfs_fs_info *fs_info = inode->root->fs_info; 7486 struct extent_io_tree *tree = &inode->io_tree; 7487 struct extent_state *cached_state = NULL; 7488 u64 page_start = folio_pos(folio); 7489 u64 page_end = page_start + folio_size(folio) - 1; 7490 u64 cur; 7491 int inode_evicting = inode_state_read_once(&inode->vfs_inode) & I_FREEING; 7492 7493 /* 7494 * We have folio locked so no new ordered extent can be created on this 7495 * page, nor bio can be submitted for this folio. 7496 * 7497 * But already submitted bio can still be finished on this folio. 7498 * Furthermore, endio function won't skip folio which has Ordered 7499 * already cleared, so it's possible for endio and 7500 * invalidate_folio to do the same ordered extent accounting twice 7501 * on one folio. 7502 * 7503 * So here we wait for any submitted bios to finish, so that we won't 7504 * do double ordered extent accounting on the same folio. 7505 */ 7506 folio_wait_writeback(folio); 7507 wait_subpage_spinlock(folio); 7508 7509 /* 7510 * For subpage case, we have call sites like 7511 * btrfs_punch_hole_lock_range() which passes range not aligned to 7512 * sectorsize. 
7513 * If the range doesn't cover the full folio, we don't need to and 7514 * shouldn't clear page extent mapped, as folio->private can still 7515 * record subpage dirty bits for other part of the range. 7516 * 7517 * For cases that invalidate the full folio even the range doesn't 7518 * cover the full folio, like invalidating the last folio, we're 7519 * still safe to wait for ordered extent to finish. 7520 */ 7521 if (!(offset == 0 && length == folio_size(folio))) { 7522 btrfs_release_folio(folio, GFP_NOFS); 7523 return; 7524 } 7525 7526 if (!inode_evicting) 7527 btrfs_lock_extent(tree, page_start, page_end, &cached_state); 7528 7529 cur = page_start; 7530 while (cur < page_end) { 7531 struct btrfs_ordered_extent *ordered; 7532 u64 range_end; 7533 u32 range_len; 7534 u32 extra_flags = 0; 7535 7536 ordered = btrfs_lookup_first_ordered_range(inode, cur, 7537 page_end + 1 - cur); 7538 if (!ordered) { 7539 range_end = page_end; 7540 /* 7541 * No ordered extent covering this range, we are safe 7542 * to delete all extent states in the range. 7543 */ 7544 extra_flags = EXTENT_CLEAR_ALL_BITS; 7545 goto next; 7546 } 7547 if (ordered->file_offset > cur) { 7548 /* 7549 * There is a range between [cur, oe->file_offset) not 7550 * covered by any ordered extent. 7551 * We are safe to delete all extent states, and handle 7552 * the ordered extent in the next iteration. 7553 */ 7554 range_end = ordered->file_offset - 1; 7555 extra_flags = EXTENT_CLEAR_ALL_BITS; 7556 goto next; 7557 } 7558 7559 range_end = min(ordered->file_offset + ordered->num_bytes - 1, 7560 page_end); 7561 ASSERT(range_end + 1 - cur < U32_MAX); 7562 range_len = range_end + 1 - cur; 7563 if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) { 7564 /* 7565 * If Ordered is cleared, it means endio has 7566 * already been executed for the range. 7567 * We can't delete the extent states as 7568 * btrfs_finish_ordered_io() may still use some of them. 7569 */ 7570 goto next; 7571 } 7572 btrfs_folio_clear_ordered(fs_info, folio, cur, range_len); 7573 7574 /* 7575 * IO on this page will never be started, so we need to account 7576 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW 7577 * here, must leave that up for the ordered extent completion. 7578 * 7579 * This will also unlock the range for incoming 7580 * btrfs_finish_ordered_io(). 7581 */ 7582 if (!inode_evicting) 7583 btrfs_clear_extent_bit(tree, cur, range_end, 7584 EXTENT_DELALLOC | 7585 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 7586 EXTENT_DEFRAG, &cached_state); 7587 7588 spin_lock_irq(&inode->ordered_tree_lock); 7589 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 7590 ordered->truncated_len = min(ordered->truncated_len, 7591 cur - ordered->file_offset); 7592 spin_unlock_irq(&inode->ordered_tree_lock); 7593 7594 /* 7595 * If the ordered extent has finished, we're safe to delete all 7596 * the extent states of the range, otherwise 7597 * btrfs_finish_ordered_io() will get executed by endio for 7598 * other pages, so we can't delete extent states. 7599 */ 7600 if (btrfs_dec_test_ordered_pending(inode, &ordered, 7601 cur, range_end + 1 - cur)) { 7602 btrfs_finish_ordered_io(ordered); 7603 /* 7604 * The ordered extent has finished, now we're again 7605 * safe to delete all extent states of the range. 
7606 */ 7607 extra_flags = EXTENT_CLEAR_ALL_BITS; 7608 } 7609 next: 7610 if (ordered) 7611 btrfs_put_ordered_extent(ordered); 7612 /* 7613 * Qgroup reserved space handler 7614 * Sector(s) here will be either: 7615 * 7616 * 1) Already written to disk or bio already finished 7617 * Then its QGROUP_RESERVED bit in io_tree is already cleared. 7618 * Qgroup will be handled by its qgroup_record then. 7619 * btrfs_qgroup_free_data() call will do nothing here. 7620 * 7621 * 2) Not written to disk yet 7622 * Then btrfs_qgroup_free_data() call will clear the 7623 * QGROUP_RESERVED bit of its io_tree, and free the qgroup 7624 * reserved data space. 7625 * Since the IO will never happen for this page. 7626 */ 7627 btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); 7628 if (!inode_evicting) 7629 btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | 7630 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 7631 EXTENT_DEFRAG | extra_flags, 7632 &cached_state); 7633 cur = range_end + 1; 7634 } 7635 /* 7636 * We have iterated through all ordered extents of the page, the page 7637 * should not have Ordered anymore, or the above iteration 7638 * did something wrong. 7639 */ 7640 ASSERT(!folio_test_ordered(folio)); 7641 btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); 7642 if (!inode_evicting) 7643 __btrfs_release_folio(folio, GFP_NOFS); 7644 clear_folio_extent_mapped(folio); 7645 } 7646 7647 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) 7648 { 7649 struct btrfs_truncate_control control = { 7650 .inode = inode, 7651 .ino = btrfs_ino(inode), 7652 .min_type = BTRFS_EXTENT_DATA_KEY, 7653 .clear_extent_range = true, 7654 }; 7655 struct btrfs_root *root = inode->root; 7656 struct btrfs_fs_info *fs_info = root->fs_info; 7657 struct btrfs_block_rsv rsv; 7658 int ret; 7659 struct btrfs_trans_handle *trans; 7660 u64 mask = fs_info->sectorsize - 1; 7661 const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); 7662 7663 if (!skip_writeback) { 7664 ret = btrfs_wait_ordered_range(inode, 7665 inode->vfs_inode.i_size & (~mask), 7666 (u64)-1); 7667 if (ret) 7668 return ret; 7669 } 7670 7671 /* 7672 * Yes ladies and gentlemen, this is indeed ugly. We have a couple of 7673 * things going on here: 7674 * 7675 * 1) We need to reserve space to update our inode. 7676 * 7677 * 2) We need to have something to cache all the space that is going to 7678 * be free'd up by the truncate operation, but also have some slack 7679 * space reserved in case it uses space during the truncate (thank you 7680 * very much snapshotting). 7681 * 7682 * And we need these to be separate. The fact is we can use a lot of 7683 * space doing the truncate, and we have no earthly idea how much space 7684 * we will use, so we need the truncate reservation to be separate so it 7685 * doesn't end up using space reserved for updating the inode. We also 7686 * need to be able to stop the transaction and start a new one, which 7687 * means we need to be able to update the inode several times, and we 7688 * have no idea of knowing how many times that will be, so we can't just 7689 * reserve 1 item for the entirety of the operation, so that has to be 7690 * done separately as well. 7691 * 7692 * So that leaves us with 7693 * 7694 * 1) rsv - for the truncate reservation, which we will steal from the 7695 * transaction reservation. 7696 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for 7697 * updating the inode. 
7698 */ 7699 btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); 7700 rsv.size = min_size; 7701 rsv.failfast = true; 7702 7703 /* 7704 * 1 for the truncate slack space 7705 * 1 for updating the inode. 7706 */ 7707 trans = btrfs_start_transaction(root, 2); 7708 if (IS_ERR(trans)) { 7709 ret = PTR_ERR(trans); 7710 goto out; 7711 } 7712 7713 /* Migrate the slack space for the truncate to our reserve */ 7714 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv, 7715 min_size, false); 7716 /* 7717 * We have reserved 2 metadata units when we started the transaction and 7718 * min_size matches 1 unit, so this should never fail, but if it does, 7719 * it's not critical we just fail truncation. 7720 */ 7721 if (WARN_ON(ret)) { 7722 btrfs_end_transaction(trans); 7723 goto out; 7724 } 7725 7726 trans->block_rsv = &rsv; 7727 7728 while (1) { 7729 struct extent_state *cached_state = NULL; 7730 const u64 new_size = inode->vfs_inode.i_size; 7731 const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); 7732 7733 control.new_size = new_size; 7734 btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); 7735 /* 7736 * We want to drop from the next block forward in case this new 7737 * size is not block aligned since we will be keeping the last 7738 * block of the extent just the way it is. 7739 */ 7740 btrfs_drop_extent_map_range(inode, 7741 ALIGN(new_size, fs_info->sectorsize), 7742 (u64)-1, false); 7743 7744 ret = btrfs_truncate_inode_items(trans, root, &control); 7745 7746 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); 7747 btrfs_inode_safe_disk_i_size_write(inode, control.last_size); 7748 7749 btrfs_unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); 7750 7751 trans->block_rsv = &fs_info->trans_block_rsv; 7752 if (ret != -ENOSPC && ret != -EAGAIN) 7753 break; 7754 7755 ret = btrfs_update_inode(trans, inode); 7756 if (ret) 7757 break; 7758 7759 btrfs_end_transaction(trans); 7760 btrfs_btree_balance_dirty(fs_info); 7761 7762 trans = btrfs_start_transaction(root, 2); 7763 if (IS_ERR(trans)) { 7764 ret = PTR_ERR(trans); 7765 trans = NULL; 7766 break; 7767 } 7768 7769 btrfs_block_rsv_release(fs_info, &rsv, -1, NULL); 7770 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, 7771 &rsv, min_size, false); 7772 /* 7773 * We have reserved 2 metadata units when we started the 7774 * transaction and min_size matches 1 unit, so this should never 7775 * fail, but if it does, it's not critical we just fail truncation. 7776 */ 7777 if (WARN_ON(ret)) 7778 break; 7779 7780 trans->block_rsv = &rsv; 7781 } 7782 7783 /* 7784 * We can't call btrfs_truncate_block inside a trans handle as we could 7785 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we 7786 * know we've truncated everything except the last little bit, and can 7787 * do btrfs_truncate_block and then update the disk_i_size. 
7788 */ 7789 if (ret == BTRFS_NEED_TRUNCATE_BLOCK) { 7790 btrfs_end_transaction(trans); 7791 btrfs_btree_balance_dirty(fs_info); 7792 7793 ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 7794 inode->vfs_inode.i_size, (u64)-1); 7795 if (ret) 7796 goto out; 7797 trans = btrfs_start_transaction(root, 1); 7798 if (IS_ERR(trans)) { 7799 ret = PTR_ERR(trans); 7800 goto out; 7801 } 7802 btrfs_inode_safe_disk_i_size_write(inode, 0); 7803 } 7804 7805 if (trans) { 7806 int ret2; 7807 7808 trans->block_rsv = &fs_info->trans_block_rsv; 7809 ret2 = btrfs_update_inode(trans, inode); 7810 if (ret2 && !ret) 7811 ret = ret2; 7812 7813 ret2 = btrfs_end_transaction(trans); 7814 if (ret2 && !ret) 7815 ret = ret2; 7816 btrfs_btree_balance_dirty(fs_info); 7817 } 7818 out: 7819 btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); 7820 /* 7821 * So if we truncate and then write and fsync we normally would just 7822 * write the extents that changed, which is a problem if we need to 7823 * first truncate that entire inode. So set this flag so we write out 7824 * all of the extents in the inode to the sync log so we're completely 7825 * safe. 7826 * 7827 * If no extents were dropped or trimmed we don't need to force the next 7828 * fsync to truncate all the inode's items from the log and re-log them 7829 * all. This means the truncate operation did not change the file size, 7830 * or changed it to a smaller size but there was only an implicit hole 7831 * between the old i_size and the new i_size, and there were no prealloc 7832 * extents beyond i_size to drop. 7833 */ 7834 if (control.extents_found > 0) 7835 btrfs_set_inode_full_sync(inode); 7836 7837 return ret; 7838 } 7839 7840 struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap, 7841 struct inode *dir) 7842 { 7843 struct inode *inode; 7844 7845 inode = new_inode(dir->i_sb); 7846 if (inode) { 7847 /* 7848 * Subvolumes don't inherit the sgid bit or the parent's gid if 7849 * the parent's sgid bit is set. This is probably a bug. 7850 */ 7851 inode_init_owner(idmap, inode, NULL, 7852 S_IFDIR | (~current_umask() & S_IRWXUGO)); 7853 inode->i_op = &btrfs_dir_inode_operations; 7854 inode->i_fop = &btrfs_dir_file_operations; 7855 } 7856 return inode; 7857 } 7858 7859 struct inode *btrfs_alloc_inode(struct super_block *sb) 7860 { 7861 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 7862 struct btrfs_inode *ei; 7863 struct inode *inode; 7864 7865 ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); 7866 if (!ei) 7867 return NULL; 7868 7869 ei->root = NULL; 7870 ei->generation = 0; 7871 ei->last_trans = 0; 7872 ei->last_sub_trans = 0; 7873 ei->logged_trans = 0; 7874 ei->delalloc_bytes = 0; 7875 /* new_delalloc_bytes and last_dir_index_offset are in a union. */ 7876 ei->new_delalloc_bytes = 0; 7877 ei->defrag_bytes = 0; 7878 ei->disk_i_size = 0; 7879 ei->flags = 0; 7880 ei->ro_flags = 0; 7881 /* 7882 * ->index_cnt will be properly initialized later when creating a new 7883 * inode (btrfs_create_new_inode()) or when reading an existing inode 7884 * from disk (btrfs_read_locked_inode()). 
7885 */ 7886 ei->csum_bytes = 0; 7887 ei->dir_index = 0; 7888 ei->last_unlink_trans = 0; 7889 ei->last_reflink_trans = 0; 7890 ei->last_log_commit = 0; 7891 7892 spin_lock_init(&ei->lock); 7893 ei->outstanding_extents = 0; 7894 if (sb->s_magic != BTRFS_TEST_MAGIC) 7895 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, 7896 BTRFS_BLOCK_RSV_DELALLOC); 7897 ei->runtime_flags = 0; 7898 ei->prop_compress = BTRFS_COMPRESS_NONE; 7899 ei->defrag_compress = BTRFS_COMPRESS_NONE; 7900 7901 ei->delayed_node = NULL; 7902 7903 ei->i_otime_sec = 0; 7904 ei->i_otime_nsec = 0; 7905 7906 inode = &ei->vfs_inode; 7907 btrfs_extent_map_tree_init(&ei->extent_tree); 7908 7909 /* This io tree sets the valid inode. */ 7910 btrfs_extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); 7911 ei->io_tree.inode = ei; 7912 7913 ei->file_extent_tree = NULL; 7914 7915 mutex_init(&ei->log_mutex); 7916 spin_lock_init(&ei->ordered_tree_lock); 7917 ei->ordered_tree = RB_ROOT; 7918 ei->ordered_tree_last = NULL; 7919 INIT_LIST_HEAD(&ei->delalloc_inodes); 7920 INIT_LIST_HEAD(&ei->delayed_iput); 7921 init_rwsem(&ei->i_mmap_lock); 7922 7923 return inode; 7924 } 7925 7926 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 7927 void btrfs_test_destroy_inode(struct inode *inode) 7928 { 7929 btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); 7930 kfree(BTRFS_I(inode)->file_extent_tree); 7931 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 7932 } 7933 #endif 7934 7935 void btrfs_free_inode(struct inode *inode) 7936 { 7937 kfree(BTRFS_I(inode)->file_extent_tree); 7938 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 7939 } 7940 7941 void btrfs_destroy_inode(struct inode *vfs_inode) 7942 { 7943 struct btrfs_ordered_extent *ordered; 7944 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 7945 struct btrfs_root *root = inode->root; 7946 bool freespace_inode; 7947 7948 WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); 7949 WARN_ON(vfs_inode->i_data.nrpages); 7950 WARN_ON(inode->block_rsv.reserved); 7951 WARN_ON(inode->block_rsv.size); 7952 WARN_ON(inode->outstanding_extents); 7953 if (!S_ISDIR(vfs_inode->i_mode)) { 7954 WARN_ON(inode->delalloc_bytes); 7955 WARN_ON(inode->new_delalloc_bytes); 7956 WARN_ON(inode->csum_bytes); 7957 } 7958 if (!root || !btrfs_is_data_reloc_root(root)) 7959 WARN_ON(inode->defrag_bytes); 7960 7961 /* 7962 * This can happen where we create an inode, but somebody else also 7963 * created the same inode and we need to destroy the one we already 7964 * created. 7965 */ 7966 if (!root) 7967 return; 7968 7969 /* 7970 * If this is a free space inode do not take the ordered extents lockdep 7971 * map. 
7972 */ 7973 freespace_inode = btrfs_is_free_space_inode(inode); 7974 7975 while (1) { 7976 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 7977 if (!ordered) 7978 break; 7979 else { 7980 btrfs_err(root->fs_info, 7981 "found ordered extent %llu %llu on inode cleanup", 7982 ordered->file_offset, ordered->num_bytes); 7983 7984 if (!freespace_inode) 7985 btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent); 7986 7987 btrfs_remove_ordered_extent(inode, ordered); 7988 btrfs_put_ordered_extent(ordered); 7989 btrfs_put_ordered_extent(ordered); 7990 } 7991 } 7992 btrfs_qgroup_check_reserved_leak(inode); 7993 btrfs_del_inode_from_root(inode); 7994 btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); 7995 btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); 7996 btrfs_put_root(inode->root); 7997 } 7998 7999 int btrfs_drop_inode(struct inode *inode) 8000 { 8001 struct btrfs_root *root = BTRFS_I(inode)->root; 8002 8003 if (root == NULL) 8004 return 1; 8005 8006 /* the snap/subvol tree is on deleting */ 8007 if (btrfs_root_refs(&root->root_item) == 0) 8008 return 1; 8009 else 8010 return inode_generic_drop(inode); 8011 } 8012 8013 static void init_once(void *foo) 8014 { 8015 struct btrfs_inode *ei = foo; 8016 8017 inode_init_once(&ei->vfs_inode); 8018 #ifdef CONFIG_FS_VERITY 8019 ei->i_verity_info = NULL; 8020 #endif 8021 } 8022 8023 void __cold btrfs_destroy_cachep(void) 8024 { 8025 /* 8026 * Make sure all delayed rcu free inodes are flushed before we 8027 * destroy cache. 8028 */ 8029 rcu_barrier(); 8030 kmem_cache_destroy(btrfs_inode_cachep); 8031 } 8032 8033 int __init btrfs_init_cachep(void) 8034 { 8035 btrfs_inode_cachep = kmem_cache_create("btrfs_inode", 8036 sizeof(struct btrfs_inode), 0, 8037 SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, 8038 init_once); 8039 if (!btrfs_inode_cachep) 8040 return -ENOMEM; 8041 8042 return 0; 8043 } 8044 8045 static int btrfs_getattr(struct mnt_idmap *idmap, 8046 const struct path *path, struct kstat *stat, 8047 u32 request_mask, unsigned int flags) 8048 { 8049 u64 delalloc_bytes; 8050 u64 inode_bytes; 8051 struct inode *inode = d_inode(path->dentry); 8052 u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize; 8053 u32 bi_flags = BTRFS_I(inode)->flags; 8054 u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; 8055 8056 stat->result_mask |= STATX_BTIME; 8057 stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec; 8058 stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec; 8059 if (bi_flags & BTRFS_INODE_APPEND) 8060 stat->attributes |= STATX_ATTR_APPEND; 8061 if (bi_flags & BTRFS_INODE_COMPRESS) 8062 stat->attributes |= STATX_ATTR_COMPRESSED; 8063 if (bi_flags & BTRFS_INODE_IMMUTABLE) 8064 stat->attributes |= STATX_ATTR_IMMUTABLE; 8065 if (bi_flags & BTRFS_INODE_NODUMP) 8066 stat->attributes |= STATX_ATTR_NODUMP; 8067 if (bi_ro_flags & BTRFS_INODE_RO_VERITY) 8068 stat->attributes |= STATX_ATTR_VERITY; 8069 8070 stat->attributes_mask |= (STATX_ATTR_APPEND | 8071 STATX_ATTR_COMPRESSED | 8072 STATX_ATTR_IMMUTABLE | 8073 STATX_ATTR_NODUMP); 8074 8075 generic_fillattr(idmap, request_mask, inode, stat); 8076 stat->dev = BTRFS_I(inode)->root->anon_dev; 8077 8078 stat->subvol = btrfs_root_id(BTRFS_I(inode)->root); 8079 stat->result_mask |= STATX_SUBVOL; 8080 8081 spin_lock(&BTRFS_I(inode)->lock); 8082 delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; 8083 inode_bytes = inode_get_bytes(inode); 8084 spin_unlock(&BTRFS_I(inode)->lock); 8085 stat->blocks = (ALIGN(inode_bytes, blocksize) + 8086 ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT; 8087 return 0; 8088 } 8089 8090 
static int btrfs_rename_exchange(struct inode *old_dir, 8091 struct dentry *old_dentry, 8092 struct inode *new_dir, 8093 struct dentry *new_dentry) 8094 { 8095 struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); 8096 struct btrfs_trans_handle *trans; 8097 unsigned int trans_num_items; 8098 struct btrfs_root *root = BTRFS_I(old_dir)->root; 8099 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 8100 struct inode *new_inode = new_dentry->d_inode; 8101 struct inode *old_inode = old_dentry->d_inode; 8102 struct btrfs_rename_ctx old_rename_ctx; 8103 struct btrfs_rename_ctx new_rename_ctx; 8104 u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); 8105 u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); 8106 u64 old_idx = 0; 8107 u64 new_idx = 0; 8108 int ret; 8109 int ret2; 8110 bool need_abort = false; 8111 bool logs_pinned = false; 8112 struct fscrypt_name old_fname, new_fname; 8113 struct fscrypt_str *old_name, *new_name; 8114 8115 /* 8116 * For non-subvolumes allow exchange only within one subvolume, in the 8117 * same inode namespace. Two subvolumes (represented as directory) can 8118 * be exchanged as they're a logical link and have a fixed inode number. 8119 */ 8120 if (root != dest && 8121 (old_ino != BTRFS_FIRST_FREE_OBJECTID || 8122 new_ino != BTRFS_FIRST_FREE_OBJECTID)) 8123 return -EXDEV; 8124 8125 ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); 8126 if (ret) 8127 return ret; 8128 8129 ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); 8130 if (ret) { 8131 fscrypt_free_filename(&old_fname); 8132 return ret; 8133 } 8134 8135 old_name = &old_fname.disk_name; 8136 new_name = &new_fname.disk_name; 8137 8138 /* close the race window with snapshot create/destroy ioctl */ 8139 if (old_ino == BTRFS_FIRST_FREE_OBJECTID || 8140 new_ino == BTRFS_FIRST_FREE_OBJECTID) 8141 down_read(&fs_info->subvol_sem); 8142 8143 /* 8144 * For each inode: 8145 * 1 to remove old dir item 8146 * 1 to remove old dir index 8147 * 1 to add new dir item 8148 * 1 to add new dir index 8149 * 1 to update parent inode 8150 * 8151 * If the parents are the same, we only need to account for one 8152 */ 8153 trans_num_items = (old_dir == new_dir ? 9 : 10); 8154 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 8155 /* 8156 * 1 to remove old root ref 8157 * 1 to remove old root backref 8158 * 1 to add new root ref 8159 * 1 to add new root backref 8160 */ 8161 trans_num_items += 4; 8162 } else { 8163 /* 8164 * 1 to update inode item 8165 * 1 to remove old inode ref 8166 * 1 to add new inode ref 8167 */ 8168 trans_num_items += 3; 8169 } 8170 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) 8171 trans_num_items += 4; 8172 else 8173 trans_num_items += 3; 8174 trans = btrfs_start_transaction(root, trans_num_items); 8175 if (IS_ERR(trans)) { 8176 ret = PTR_ERR(trans); 8177 goto out_notrans; 8178 } 8179 8180 if (dest != root) { 8181 ret = btrfs_record_root_in_trans(trans, dest); 8182 if (ret) 8183 goto out_fail; 8184 } 8185 8186 /* 8187 * We need to find a free sequence number both in the source and 8188 * in the destination directory for the exchange. 8189 */ 8190 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx); 8191 if (ret) 8192 goto out_fail; 8193 ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx); 8194 if (ret) 8195 goto out_fail; 8196 8197 BTRFS_I(old_inode)->dir_index = 0ULL; 8198 BTRFS_I(new_inode)->dir_index = 0ULL; 8199 8200 /* Reference for the source. */ 8201 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 8202 /* force full log commit if subvolume involved. 
*/ 8203 btrfs_set_log_full_commit(trans);
8204 } else {
8205 ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
8206 btrfs_ino(BTRFS_I(new_dir)),
8207 old_idx);
8208 if (ret)
8209 goto out_fail;
8210 need_abort = true;
8211 }
8212
8213 /* And now for the dest. */
8214 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8215 /* force full log commit if subvolume involved. */
8216 btrfs_set_log_full_commit(trans);
8217 } else {
8218 ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
8219 btrfs_ino(BTRFS_I(old_dir)),
8220 new_idx);
8221 if (ret) {
8222 if (unlikely(need_abort))
8223 btrfs_abort_transaction(trans, ret);
8224 goto out_fail;
8225 }
8226 }
8227
8228 /* Update inode version and ctime/mtime. */
8229 inode_inc_iversion(old_dir);
8230 inode_inc_iversion(new_dir);
8231 inode_inc_iversion(old_inode);
8232 inode_inc_iversion(new_inode);
8233 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8234
8235 if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
8236 new_ino != BTRFS_FIRST_FREE_OBJECTID) {
8237 /*
8238 * If we are renaming in the same directory (and it's not for
8239 * root entries) pin the log early to prevent any concurrent
8240 * task from logging the directory after we removed the old
8241 * entries and before we add the new entries. Otherwise that
8242 * task could sync a log without any entry for the inodes we are
8243 * renaming, and if a power failure happens after syncing that
8244 * log, replaying it would result in deleting the
8245 * inodes.
8246 *
8247 * If the rename affects two different directories, we want to
8248 * make sure that there's no log commit that contains
8249 * updates for only one of the directories but not for the
8250 * other.
8251 *
8252 * If we are renaming an entry for a root, we don't care about
8253 * log updates since we called btrfs_set_log_full_commit().
8254 */ 8255 btrfs_pin_log_trans(root); 8256 btrfs_pin_log_trans(dest); 8257 logs_pinned = true; 8258 } 8259 8260 if (old_dentry->d_parent != new_dentry->d_parent) { 8261 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), 8262 BTRFS_I(old_inode), true); 8263 btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), 8264 BTRFS_I(new_inode), true); 8265 } 8266 8267 /* src is a subvolume */ 8268 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 8269 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); 8270 if (unlikely(ret)) { 8271 btrfs_abort_transaction(trans, ret); 8272 goto out_fail; 8273 } 8274 } else { /* src is an inode */ 8275 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), 8276 BTRFS_I(old_dentry->d_inode), 8277 old_name, &old_rename_ctx); 8278 if (unlikely(ret)) { 8279 btrfs_abort_transaction(trans, ret); 8280 goto out_fail; 8281 } 8282 ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); 8283 if (unlikely(ret)) { 8284 btrfs_abort_transaction(trans, ret); 8285 goto out_fail; 8286 } 8287 } 8288 8289 /* dest is a subvolume */ 8290 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { 8291 ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); 8292 if (unlikely(ret)) { 8293 btrfs_abort_transaction(trans, ret); 8294 goto out_fail; 8295 } 8296 } else { /* dest is an inode */ 8297 ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), 8298 BTRFS_I(new_dentry->d_inode), 8299 new_name, &new_rename_ctx); 8300 if (unlikely(ret)) { 8301 btrfs_abort_transaction(trans, ret); 8302 goto out_fail; 8303 } 8304 ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); 8305 if (unlikely(ret)) { 8306 btrfs_abort_transaction(trans, ret); 8307 goto out_fail; 8308 } 8309 } 8310 8311 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), 8312 new_name, 0, old_idx); 8313 if (unlikely(ret)) { 8314 btrfs_abort_transaction(trans, ret); 8315 goto out_fail; 8316 } 8317 8318 ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), 8319 old_name, 0, new_idx); 8320 if (unlikely(ret)) { 8321 btrfs_abort_transaction(trans, ret); 8322 goto out_fail; 8323 } 8324 8325 if (old_inode->i_nlink == 1) 8326 BTRFS_I(old_inode)->dir_index = old_idx; 8327 if (new_inode->i_nlink == 1) 8328 BTRFS_I(new_inode)->dir_index = new_idx; 8329 8330 /* 8331 * Do the log updates for all inodes. 8332 * 8333 * If either entry is for a root we don't need to update the logs since 8334 * we've called btrfs_set_log_full_commit() before. 8335 */ 8336 if (logs_pinned) { 8337 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), 8338 old_rename_ctx.index, new_dentry->d_parent); 8339 btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), 8340 new_rename_ctx.index, old_dentry->d_parent); 8341 } 8342 8343 out_fail: 8344 if (logs_pinned) { 8345 btrfs_end_log_trans(root); 8346 btrfs_end_log_trans(dest); 8347 } 8348 ret2 = btrfs_end_transaction(trans); 8349 ret = ret ? 
ret : ret2; 8350 out_notrans: 8351 if (new_ino == BTRFS_FIRST_FREE_OBJECTID || 8352 old_ino == BTRFS_FIRST_FREE_OBJECTID) 8353 up_read(&fs_info->subvol_sem); 8354 8355 fscrypt_free_filename(&new_fname); 8356 fscrypt_free_filename(&old_fname); 8357 return ret; 8358 } 8359 8360 static struct inode *new_whiteout_inode(struct mnt_idmap *idmap, 8361 struct inode *dir) 8362 { 8363 struct inode *inode; 8364 8365 inode = new_inode(dir->i_sb); 8366 if (inode) { 8367 inode_init_owner(idmap, inode, dir, 8368 S_IFCHR | WHITEOUT_MODE); 8369 inode->i_op = &btrfs_special_inode_operations; 8370 init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); 8371 } 8372 return inode; 8373 } 8374 8375 static int btrfs_rename(struct mnt_idmap *idmap, 8376 struct inode *old_dir, struct dentry *old_dentry, 8377 struct inode *new_dir, struct dentry *new_dentry, 8378 unsigned int flags) 8379 { 8380 struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); 8381 struct btrfs_new_inode_args whiteout_args = { 8382 .dir = old_dir, 8383 .dentry = old_dentry, 8384 }; 8385 struct btrfs_trans_handle *trans; 8386 unsigned int trans_num_items; 8387 struct btrfs_root *root = BTRFS_I(old_dir)->root; 8388 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 8389 struct inode *new_inode = d_inode(new_dentry); 8390 struct inode *old_inode = d_inode(old_dentry); 8391 struct btrfs_rename_ctx rename_ctx; 8392 u64 index = 0; 8393 int ret; 8394 int ret2; 8395 u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); 8396 struct fscrypt_name old_fname, new_fname; 8397 bool logs_pinned = false; 8398 8399 if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 8400 return -EPERM; 8401 8402 /* we only allow rename subvolume link between subvolumes */ 8403 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 8404 return -EXDEV; 8405 8406 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || 8407 (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID)) 8408 return -ENOTEMPTY; 8409 8410 if (S_ISDIR(old_inode->i_mode) && new_inode && 8411 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 8412 return -ENOTEMPTY; 8413 8414 ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); 8415 if (ret) 8416 return ret; 8417 8418 ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); 8419 if (ret) { 8420 fscrypt_free_filename(&old_fname); 8421 return ret; 8422 } 8423 8424 /* check for collisions, even if the name isn't there */ 8425 ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name); 8426 if (ret) { 8427 if (ret == -EEXIST) { 8428 /* we shouldn't get 8429 * eexist without a new_inode */ 8430 if (WARN_ON(!new_inode)) { 8431 goto out_fscrypt_names; 8432 } 8433 } else { 8434 /* maybe -EOVERFLOW */ 8435 goto out_fscrypt_names; 8436 } 8437 } 8438 ret = 0; 8439 8440 /* 8441 * we're using rename to replace one file with another. Start IO on it 8442 * now so we don't add too much work to the end of the transaction 8443 */ 8444 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) 8445 filemap_flush(old_inode->i_mapping); 8446 8447 if (flags & RENAME_WHITEOUT) { 8448 whiteout_args.inode = new_whiteout_inode(idmap, old_dir); 8449 if (!whiteout_args.inode) { 8450 ret = -ENOMEM; 8451 goto out_fscrypt_names; 8452 } 8453 ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items); 8454 if (ret) 8455 goto out_whiteout_inode; 8456 } else { 8457 /* 1 to update the old parent inode. 
*/ 8458 trans_num_items = 1;
8459 }
8460
8461 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8462 /* Close the race window with snapshot create/destroy ioctl */
8463 down_read(&fs_info->subvol_sem);
8464 /*
8465 * 1 to remove old root ref
8466 * 1 to remove old root backref
8467 * 1 to add new root ref
8468 * 1 to add new root backref
8469 */
8470 trans_num_items += 4;
8471 } else {
8472 /*
8473 * 1 to update inode
8474 * 1 to remove old inode ref
8475 * 1 to add new inode ref
8476 */
8477 trans_num_items += 3;
8478 }
8479 /*
8480 * 1 to remove old dir item
8481 * 1 to remove old dir index
8482 * 1 to add new dir item
8483 * 1 to add new dir index
8484 */
8485 trans_num_items += 4;
8486 /* 1 to update new parent inode if it's not the same as the old parent */
8487 if (new_dir != old_dir)
8488 trans_num_items++;
8489 if (new_inode) {
8490 /*
8491 * 1 to update inode
8492 * 1 to remove inode ref
8493 * 1 to remove dir item
8494 * 1 to remove dir index
8495 * 1 to possibly add orphan item
8496 */
8497 trans_num_items += 5;
8498 }
8499 trans = btrfs_start_transaction(root, trans_num_items);
8500 if (IS_ERR(trans)) {
8501 ret = PTR_ERR(trans);
8502 goto out_notrans;
8503 }
8504
8505 if (dest != root) {
8506 ret = btrfs_record_root_in_trans(trans, dest);
8507 if (ret)
8508 goto out_fail;
8509 }
8510
8511 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
8512 if (ret)
8513 goto out_fail;
8514
8515 BTRFS_I(old_inode)->dir_index = 0ULL;
8516 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8517 /* force full log commit if subvolume involved. */
8518 btrfs_set_log_full_commit(trans);
8519 } else {
8520 ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
8521 old_ino, btrfs_ino(BTRFS_I(new_dir)),
8522 index);
8523 if (ret)
8524 goto out_fail;
8525 }
8526
8527 inode_inc_iversion(old_dir);
8528 inode_inc_iversion(new_dir);
8529 inode_inc_iversion(old_inode);
8530 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8531
8532 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
8533 /*
8534 * If we are renaming in the same directory (and it's not a
8535 * root entry) pin the log to prevent any concurrent task from
8536 * logging the directory after we removed the old entry and
8537 * before we add the new entry. Otherwise that task could sync
8538 * a log without any entry for the inode we are renaming, and
8539 * if a power failure happens after syncing that log, replaying
8540 * it would result in deleting the inode.
8541 *
8542 * If the rename affects two different directories, we want to
8543 * make sure that there's no log commit that contains
8544 * updates for only one of the directories but not for the
8545 * other.
8546 *
8547 * If we are renaming an entry for a root, we don't care about
8548 * log updates since we called btrfs_set_log_full_commit().
8549 */ 8550 btrfs_pin_log_trans(root); 8551 btrfs_pin_log_trans(dest); 8552 logs_pinned = true; 8553 } 8554 8555 if (old_dentry->d_parent != new_dentry->d_parent) 8556 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), 8557 BTRFS_I(old_inode), true); 8558 8559 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 8560 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); 8561 if (unlikely(ret)) { 8562 btrfs_abort_transaction(trans, ret); 8563 goto out_fail; 8564 } 8565 } else { 8566 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), 8567 BTRFS_I(d_inode(old_dentry)), 8568 &old_fname.disk_name, &rename_ctx); 8569 if (unlikely(ret)) { 8570 btrfs_abort_transaction(trans, ret); 8571 goto out_fail; 8572 } 8573 ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); 8574 if (unlikely(ret)) { 8575 btrfs_abort_transaction(trans, ret); 8576 goto out_fail; 8577 } 8578 } 8579 8580 if (new_inode) { 8581 inode_inc_iversion(new_inode); 8582 if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == 8583 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 8584 ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); 8585 if (unlikely(ret)) { 8586 btrfs_abort_transaction(trans, ret); 8587 goto out_fail; 8588 } 8589 BUG_ON(new_inode->i_nlink == 0); 8590 } else { 8591 ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), 8592 BTRFS_I(d_inode(new_dentry)), 8593 &new_fname.disk_name); 8594 if (unlikely(ret)) { 8595 btrfs_abort_transaction(trans, ret); 8596 goto out_fail; 8597 } 8598 } 8599 if (new_inode->i_nlink == 0) { 8600 ret = btrfs_orphan_add(trans, 8601 BTRFS_I(d_inode(new_dentry))); 8602 if (unlikely(ret)) { 8603 btrfs_abort_transaction(trans, ret); 8604 goto out_fail; 8605 } 8606 } 8607 } 8608 8609 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), 8610 &new_fname.disk_name, 0, index); 8611 if (unlikely(ret)) { 8612 btrfs_abort_transaction(trans, ret); 8613 goto out_fail; 8614 } 8615 8616 if (old_inode->i_nlink == 1) 8617 BTRFS_I(old_inode)->dir_index = index; 8618 8619 if (logs_pinned) 8620 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), 8621 rename_ctx.index, new_dentry->d_parent); 8622 8623 if (flags & RENAME_WHITEOUT) { 8624 ret = btrfs_create_new_inode(trans, &whiteout_args); 8625 if (unlikely(ret)) { 8626 btrfs_abort_transaction(trans, ret); 8627 goto out_fail; 8628 } else { 8629 unlock_new_inode(whiteout_args.inode); 8630 iput(whiteout_args.inode); 8631 whiteout_args.inode = NULL; 8632 } 8633 } 8634 out_fail: 8635 if (logs_pinned) { 8636 btrfs_end_log_trans(root); 8637 btrfs_end_log_trans(dest); 8638 } 8639 ret2 = btrfs_end_transaction(trans); 8640 ret = ret ? 
ret : ret2; 8641 out_notrans: 8642 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 8643 up_read(&fs_info->subvol_sem); 8644 if (flags & RENAME_WHITEOUT) 8645 btrfs_new_inode_args_destroy(&whiteout_args); 8646 out_whiteout_inode: 8647 if (flags & RENAME_WHITEOUT) 8648 iput(whiteout_args.inode); 8649 out_fscrypt_names: 8650 fscrypt_free_filename(&old_fname); 8651 fscrypt_free_filename(&new_fname); 8652 return ret; 8653 } 8654 8655 static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, 8656 struct dentry *old_dentry, struct inode *new_dir, 8657 struct dentry *new_dentry, unsigned int flags) 8658 { 8659 int ret; 8660 8661 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 8662 return -EINVAL; 8663 8664 if (flags & RENAME_EXCHANGE) 8665 ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir, 8666 new_dentry); 8667 else 8668 ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir, 8669 new_dentry, flags); 8670 8671 btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); 8672 8673 return ret; 8674 } 8675 8676 struct btrfs_delalloc_work { 8677 struct inode *inode; 8678 struct completion completion; 8679 struct list_head list; 8680 struct btrfs_work work; 8681 }; 8682 8683 static void btrfs_run_delalloc_work(struct btrfs_work *work) 8684 { 8685 struct btrfs_delalloc_work *delalloc_work; 8686 struct inode *inode; 8687 8688 delalloc_work = container_of(work, struct btrfs_delalloc_work, 8689 work); 8690 inode = delalloc_work->inode; 8691 filemap_flush(inode->i_mapping); 8692 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 8693 &BTRFS_I(inode)->runtime_flags)) 8694 filemap_flush(inode->i_mapping); 8695 8696 iput(inode); 8697 complete(&delalloc_work->completion); 8698 } 8699 8700 static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode) 8701 { 8702 struct btrfs_delalloc_work *work; 8703 8704 work = kmalloc(sizeof(*work), GFP_NOFS); 8705 if (!work) 8706 return NULL; 8707 8708 init_completion(&work->completion); 8709 INIT_LIST_HEAD(&work->list); 8710 work->inode = inode; 8711 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL); 8712 8713 return work; 8714 } 8715 8716 /* 8717 * some fairly slow code that needs optimization. This walks the list 8718 * of all the inodes with pending delalloc and forces them to disk. 
8719 */ 8720 static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write, 8721 bool snapshot, bool in_reclaim_context) 8722 { 8723 struct btrfs_delalloc_work *work, *next; 8724 LIST_HEAD(works); 8725 LIST_HEAD(splice); 8726 int ret = 0; 8727 8728 mutex_lock(&root->delalloc_mutex); 8729 spin_lock(&root->delalloc_lock); 8730 list_splice_init(&root->delalloc_inodes, &splice); 8731 while (!list_empty(&splice)) { 8732 struct btrfs_inode *inode; 8733 struct inode *tmp_inode; 8734 8735 inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); 8736 8737 list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes); 8738 8739 if (in_reclaim_context && 8740 test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags)) 8741 continue; 8742 8743 tmp_inode = igrab(&inode->vfs_inode); 8744 if (!tmp_inode) { 8745 cond_resched_lock(&root->delalloc_lock); 8746 continue; 8747 } 8748 spin_unlock(&root->delalloc_lock); 8749 8750 if (snapshot) 8751 set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); 8752 if (nr_to_write == NULL) { 8753 work = btrfs_alloc_delalloc_work(tmp_inode); 8754 if (!work) { 8755 iput(tmp_inode); 8756 ret = -ENOMEM; 8757 goto out; 8758 } 8759 list_add_tail(&work->list, &works); 8760 btrfs_queue_work(root->fs_info->flush_workers, 8761 &work->work); 8762 } else { 8763 ret = filemap_flush_nr(tmp_inode->i_mapping, 8764 nr_to_write); 8765 btrfs_add_delayed_iput(inode); 8766 8767 if (ret || *nr_to_write <= 0) 8768 goto out; 8769 } 8770 cond_resched(); 8771 spin_lock(&root->delalloc_lock); 8772 } 8773 spin_unlock(&root->delalloc_lock); 8774 8775 out: 8776 list_for_each_entry_safe(work, next, &works, list) { 8777 list_del_init(&work->list); 8778 wait_for_completion(&work->completion); 8779 kfree(work); 8780 } 8781 8782 if (!list_empty(&splice)) { 8783 spin_lock(&root->delalloc_lock); 8784 list_splice_tail(&splice, &root->delalloc_inodes); 8785 spin_unlock(&root->delalloc_lock); 8786 } 8787 mutex_unlock(&root->delalloc_mutex); 8788 return ret; 8789 } 8790 8791 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) 8792 { 8793 struct btrfs_fs_info *fs_info = root->fs_info; 8794 8795 if (BTRFS_FS_ERROR(fs_info)) 8796 return -EROFS; 8797 return start_delalloc_inodes(root, NULL, true, in_reclaim_context); 8798 } 8799 8800 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, 8801 bool in_reclaim_context) 8802 { 8803 long *nr_to_write = nr == LONG_MAX ? 
NULL : &nr; 8804 struct btrfs_root *root; 8805 LIST_HEAD(splice); 8806 int ret; 8807 8808 if (BTRFS_FS_ERROR(fs_info)) 8809 return -EROFS; 8810 8811 mutex_lock(&fs_info->delalloc_root_mutex); 8812 spin_lock(&fs_info->delalloc_root_lock); 8813 list_splice_init(&fs_info->delalloc_roots, &splice); 8814 while (!list_empty(&splice)) { 8815 root = list_first_entry(&splice, struct btrfs_root, 8816 delalloc_root); 8817 root = btrfs_grab_root(root); 8818 BUG_ON(!root); 8819 list_move_tail(&root->delalloc_root, 8820 &fs_info->delalloc_roots); 8821 spin_unlock(&fs_info->delalloc_root_lock); 8822 8823 ret = start_delalloc_inodes(root, nr_to_write, false, 8824 in_reclaim_context); 8825 btrfs_put_root(root); 8826 if (ret < 0 || nr <= 0) 8827 goto out; 8828 spin_lock(&fs_info->delalloc_root_lock); 8829 } 8830 spin_unlock(&fs_info->delalloc_root_lock); 8831 8832 ret = 0; 8833 out: 8834 if (!list_empty(&splice)) { 8835 spin_lock(&fs_info->delalloc_root_lock); 8836 list_splice_tail(&splice, &fs_info->delalloc_roots); 8837 spin_unlock(&fs_info->delalloc_root_lock); 8838 } 8839 mutex_unlock(&fs_info->delalloc_root_mutex); 8840 return ret; 8841 } 8842 8843 static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, 8844 struct dentry *dentry, const char *symname) 8845 { 8846 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); 8847 struct btrfs_trans_handle *trans; 8848 struct btrfs_root *root = BTRFS_I(dir)->root; 8849 struct btrfs_path *path; 8850 struct btrfs_key key; 8851 struct inode *inode; 8852 struct btrfs_new_inode_args new_inode_args = { 8853 .dir = dir, 8854 .dentry = dentry, 8855 }; 8856 unsigned int trans_num_items; 8857 int ret; 8858 int name_len; 8859 int datasize; 8860 unsigned long ptr; 8861 struct btrfs_file_extent_item *ei; 8862 struct extent_buffer *leaf; 8863 8864 name_len = strlen(symname); 8865 /* 8866 * Symlinks utilize uncompressed inline extent data, which should not 8867 * reach block size. 
8868 */ 8869 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || 8870 name_len >= fs_info->sectorsize) 8871 return -ENAMETOOLONG; 8872 8873 inode = new_inode(dir->i_sb); 8874 if (!inode) 8875 return -ENOMEM; 8876 inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO); 8877 inode->i_op = &btrfs_symlink_inode_operations; 8878 inode_nohighmem(inode); 8879 inode->i_mapping->a_ops = &btrfs_aops; 8880 btrfs_i_size_write(BTRFS_I(inode), name_len); 8881 inode_set_bytes(inode, name_len); 8882 8883 new_inode_args.inode = inode; 8884 ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); 8885 if (ret) 8886 goto out_inode; 8887 /* 1 additional item for the inline extent */ 8888 trans_num_items++; 8889 8890 trans = btrfs_start_transaction(root, trans_num_items); 8891 if (IS_ERR(trans)) { 8892 ret = PTR_ERR(trans); 8893 goto out_new_inode_args; 8894 } 8895 8896 ret = btrfs_create_new_inode(trans, &new_inode_args); 8897 if (ret) 8898 goto out; 8899 8900 path = btrfs_alloc_path(); 8901 if (unlikely(!path)) { 8902 ret = -ENOMEM; 8903 btrfs_abort_transaction(trans, ret); 8904 discard_new_inode(inode); 8905 inode = NULL; 8906 goto out; 8907 } 8908 key.objectid = btrfs_ino(BTRFS_I(inode)); 8909 key.type = BTRFS_EXTENT_DATA_KEY; 8910 key.offset = 0; 8911 datasize = btrfs_file_extent_calc_inline_size(name_len); 8912 ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); 8913 if (unlikely(ret)) { 8914 btrfs_abort_transaction(trans, ret); 8915 btrfs_free_path(path); 8916 discard_new_inode(inode); 8917 inode = NULL; 8918 goto out; 8919 } 8920 leaf = path->nodes[0]; 8921 ei = btrfs_item_ptr(leaf, path->slots[0], 8922 struct btrfs_file_extent_item); 8923 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 8924 btrfs_set_file_extent_type(leaf, ei, 8925 BTRFS_FILE_EXTENT_INLINE); 8926 btrfs_set_file_extent_encryption(leaf, ei, 0); 8927 btrfs_set_file_extent_compression(leaf, ei, 0); 8928 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 8929 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); 8930 8931 ptr = btrfs_file_extent_inline_start(ei); 8932 write_extent_buffer(leaf, symname, ptr, name_len); 8933 btrfs_free_path(path); 8934 8935 d_instantiate_new(dentry, inode); 8936 ret = 0; 8937 out: 8938 btrfs_end_transaction(trans); 8939 btrfs_btree_balance_dirty(fs_info); 8940 out_new_inode_args: 8941 btrfs_new_inode_args_destroy(&new_inode_args); 8942 out_inode: 8943 if (ret) 8944 iput(inode); 8945 return ret; 8946 } 8947 8948 static struct btrfs_trans_handle *insert_prealloc_file_extent( 8949 struct btrfs_trans_handle *trans_in, 8950 struct btrfs_inode *inode, 8951 struct btrfs_key *ins, 8952 u64 file_offset) 8953 { 8954 struct btrfs_file_extent_item stack_fi; 8955 struct btrfs_replace_extent_info extent_info; 8956 struct btrfs_trans_handle *trans = trans_in; 8957 struct btrfs_path *path; 8958 u64 start = ins->objectid; 8959 u64 len = ins->offset; 8960 u64 qgroup_released = 0; 8961 int ret; 8962 8963 memset(&stack_fi, 0, sizeof(stack_fi)); 8964 8965 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC); 8966 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start); 8967 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len); 8968 btrfs_set_stack_file_extent_num_bytes(&stack_fi, len); 8969 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len); 8970 btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); 8971 /* Encryption and other encoding is reserved and all 0 */ 8972 8973 ret = btrfs_qgroup_release_data(inode, file_offset, len, 
&qgroup_released); 8974 if (ret < 0) 8975 return ERR_PTR(ret); 8976 8977 if (trans) { 8978 ret = insert_reserved_file_extent(trans, inode, 8979 file_offset, &stack_fi, 8980 true, qgroup_released); 8981 if (ret) 8982 goto free_qgroup; 8983 return trans; 8984 } 8985 8986 extent_info.disk_offset = start; 8987 extent_info.disk_len = len; 8988 extent_info.data_offset = 0; 8989 extent_info.data_len = len; 8990 extent_info.file_offset = file_offset; 8991 extent_info.extent_buf = (char *)&stack_fi; 8992 extent_info.is_new_extent = true; 8993 extent_info.update_times = true; 8994 extent_info.qgroup_reserved = qgroup_released; 8995 extent_info.insertions = 0; 8996 8997 path = btrfs_alloc_path(); 8998 if (!path) { 8999 ret = -ENOMEM; 9000 goto free_qgroup; 9001 } 9002 9003 ret = btrfs_replace_file_extents(inode, path, file_offset, 9004 file_offset + len - 1, &extent_info, 9005 &trans); 9006 btrfs_free_path(path); 9007 if (ret) 9008 goto free_qgroup; 9009 return trans; 9010 9011 free_qgroup: 9012 /* 9013 * We have released qgroup data range at the beginning of the function, 9014 * and normally qgroup_released bytes will be freed when committing 9015 * transaction. 9016 * But if we error out early, we have to free what we have released 9017 * or we leak qgroup data reservation. 9018 */ 9019 btrfs_qgroup_free_refroot(inode->root->fs_info, 9020 btrfs_root_id(inode->root), qgroup_released, 9021 BTRFS_QGROUP_RSV_DATA); 9022 return ERR_PTR(ret); 9023 } 9024 9025 static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 9026 u64 start, u64 num_bytes, u64 min_size, 9027 loff_t actual_len, u64 *alloc_hint, 9028 struct btrfs_trans_handle *trans) 9029 { 9030 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 9031 struct extent_map *em; 9032 struct btrfs_root *root = BTRFS_I(inode)->root; 9033 struct btrfs_key ins; 9034 u64 cur_offset = start; 9035 u64 clear_offset = start; 9036 u64 i_size; 9037 u64 cur_bytes; 9038 u64 last_alloc = (u64)-1; 9039 int ret = 0; 9040 bool own_trans = true; 9041 u64 end = start + num_bytes - 1; 9042 9043 if (trans) 9044 own_trans = false; 9045 while (num_bytes > 0) { 9046 cur_bytes = min_t(u64, num_bytes, SZ_256M); 9047 cur_bytes = max(cur_bytes, min_size); 9048 /* 9049 * If we are severely fragmented we could end up with really 9050 * small allocations, so if the allocator is returning small 9051 * chunks lets make its job easier by only searching for those 9052 * sized chunks. 9053 */ 9054 cur_bytes = min(cur_bytes, last_alloc); 9055 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, 9056 min_size, 0, *alloc_hint, &ins, 1, 0); 9057 if (ret) 9058 break; 9059 9060 /* 9061 * We've reserved this space, and thus converted it from 9062 * ->bytes_may_use to ->bytes_reserved. Any error that happens 9063 * from here on out we will only need to clear our reservation 9064 * for the remaining unreserved area, so advance our 9065 * clear_offset by our extent size. 9066 */ 9067 clear_offset += ins.offset; 9068 9069 last_alloc = ins.offset; 9070 trans = insert_prealloc_file_extent(trans, BTRFS_I(inode), 9071 &ins, cur_offset); 9072 /* 9073 * Now that we inserted the prealloc extent we can finally 9074 * decrement the number of reservations in the block group. 9075 * If we did it before, we could race with relocation and have 9076 * relocation miss the reserved extent, making it fail later. 
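 * Note that the reservation count is decremented even when
 * insert_prealloc_file_extent() fails; in that case the reserved extent
 * itself is freed right below.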
9077 */ 9078 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 9079 if (IS_ERR(trans)) { 9080 ret = PTR_ERR(trans); 9081 btrfs_free_reserved_extent(fs_info, ins.objectid, 9082 ins.offset, false); 9083 break; 9084 } 9085 9086 em = btrfs_alloc_extent_map(); 9087 if (!em) { 9088 btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset, 9089 cur_offset + ins.offset - 1, false); 9090 btrfs_set_inode_full_sync(BTRFS_I(inode)); 9091 goto next; 9092 } 9093 9094 em->start = cur_offset; 9095 em->len = ins.offset; 9096 em->disk_bytenr = ins.objectid; 9097 em->offset = 0; 9098 em->disk_num_bytes = ins.offset; 9099 em->ram_bytes = ins.offset; 9100 em->flags |= EXTENT_FLAG_PREALLOC; 9101 em->generation = trans->transid; 9102 9103 ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); 9104 btrfs_free_extent_map(em); 9105 next: 9106 num_bytes -= ins.offset; 9107 cur_offset += ins.offset; 9108 *alloc_hint = ins.objectid + ins.offset; 9109 9110 inode_inc_iversion(inode); 9111 inode_set_ctime_current(inode); 9112 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 9113 if (!(mode & FALLOC_FL_KEEP_SIZE) && 9114 (actual_len > inode->i_size) && 9115 (cur_offset > inode->i_size)) { 9116 if (cur_offset > actual_len) 9117 i_size = actual_len; 9118 else 9119 i_size = cur_offset; 9120 i_size_write(inode, i_size); 9121 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 9122 } 9123 9124 ret = btrfs_update_inode(trans, BTRFS_I(inode)); 9125 9126 if (unlikely(ret)) { 9127 btrfs_abort_transaction(trans, ret); 9128 if (own_trans) 9129 btrfs_end_transaction(trans); 9130 break; 9131 } 9132 9133 if (own_trans) { 9134 btrfs_end_transaction(trans); 9135 trans = NULL; 9136 } 9137 } 9138 if (clear_offset < end) 9139 btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, 9140 end - clear_offset + 1); 9141 return ret; 9142 } 9143 9144 int btrfs_prealloc_file_range(struct inode *inode, int mode, 9145 u64 start, u64 num_bytes, u64 min_size, 9146 loff_t actual_len, u64 *alloc_hint) 9147 { 9148 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 9149 min_size, actual_len, alloc_hint, 9150 NULL); 9151 } 9152 9153 int btrfs_prealloc_file_range_trans(struct inode *inode, 9154 struct btrfs_trans_handle *trans, int mode, 9155 u64 start, u64 num_bytes, u64 min_size, 9156 loff_t actual_len, u64 *alloc_hint) 9157 { 9158 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 9159 min_size, actual_len, alloc_hint, trans); 9160 } 9161 9162 /* 9163 * NOTE: in case you are adding MAY_EXEC check for directories: 9164 * we are marking them with IOP_FASTPERM_MAY_EXEC, allowing path lookup to 9165 * elide calls here. 
9166 */ 9167 static int btrfs_permission(struct mnt_idmap *idmap, 9168 struct inode *inode, int mask) 9169 { 9170 struct btrfs_root *root = BTRFS_I(inode)->root; 9171 umode_t mode = inode->i_mode; 9172 9173 if (mask & MAY_WRITE && 9174 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { 9175 if (btrfs_root_readonly(root)) 9176 return -EROFS; 9177 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) 9178 return -EACCES; 9179 } 9180 return generic_permission(idmap, inode, mask); 9181 } 9182 9183 static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, 9184 struct file *file, umode_t mode) 9185 { 9186 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); 9187 struct btrfs_trans_handle *trans; 9188 struct btrfs_root *root = BTRFS_I(dir)->root; 9189 struct inode *inode; 9190 struct btrfs_new_inode_args new_inode_args = { 9191 .dir = dir, 9192 .dentry = file->f_path.dentry, 9193 .orphan = true, 9194 }; 9195 unsigned int trans_num_items; 9196 int ret; 9197 9198 inode = new_inode(dir->i_sb); 9199 if (!inode) 9200 return -ENOMEM; 9201 inode_init_owner(idmap, inode, dir, mode); 9202 inode->i_fop = &btrfs_file_operations; 9203 inode->i_op = &btrfs_file_inode_operations; 9204 inode->i_mapping->a_ops = &btrfs_aops; 9205 9206 new_inode_args.inode = inode; 9207 ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); 9208 if (ret) 9209 goto out_inode; 9210 9211 trans = btrfs_start_transaction(root, trans_num_items); 9212 if (IS_ERR(trans)) { 9213 ret = PTR_ERR(trans); 9214 goto out_new_inode_args; 9215 } 9216 9217 ret = btrfs_create_new_inode(trans, &new_inode_args); 9218 9219 /* 9220 * We set number of links to 0 in btrfs_create_new_inode(), and here we 9221 * set it to 1 because d_tmpfile() will issue a warning if the count is 9222 * 0, through: 9223 * 9224 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() 9225 */ 9226 set_nlink(inode, 1); 9227 9228 if (!ret) { 9229 d_tmpfile(file, inode); 9230 unlock_new_inode(inode); 9231 mark_inode_dirty(inode); 9232 } 9233 9234 btrfs_end_transaction(trans); 9235 btrfs_btree_balance_dirty(fs_info); 9236 out_new_inode_args: 9237 btrfs_new_inode_args_destroy(&new_inode_args); 9238 out_inode: 9239 if (ret) 9240 iput(inode); 9241 return finish_open_simple(file, ret); 9242 } 9243 9244 int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, 9245 int compress_type) 9246 { 9247 switch (compress_type) { 9248 case BTRFS_COMPRESS_NONE: 9249 return BTRFS_ENCODED_IO_COMPRESSION_NONE; 9250 case BTRFS_COMPRESS_ZLIB: 9251 return BTRFS_ENCODED_IO_COMPRESSION_ZLIB; 9252 case BTRFS_COMPRESS_LZO: 9253 /* 9254 * The LZO format depends on the sector size. 64K is the maximum 9255 * sector size that we support. 
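 * For example, with 4K sectors sectorsize_bits is 12 and this maps to
 * BTRFS_ENCODED_IO_COMPRESSION_LZO_4K, 16K sectors (sectorsize_bits 14)
 * map to BTRFS_ENCODED_IO_COMPRESSION_LZO_16K, and 64K sectors map to
 * BTRFS_ENCODED_IO_COMPRESSION_LZO_64K.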
9256 */ 9257 if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K) 9258 return -EINVAL; 9259 return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 9260 (fs_info->sectorsize_bits - 12); 9261 case BTRFS_COMPRESS_ZSTD: 9262 return BTRFS_ENCODED_IO_COMPRESSION_ZSTD; 9263 default: 9264 return -EUCLEAN; 9265 } 9266 } 9267 9268 static ssize_t btrfs_encoded_read_inline( 9269 struct kiocb *iocb, 9270 struct iov_iter *iter, u64 start, 9271 u64 lockend, 9272 struct extent_state **cached_state, 9273 u64 extent_start, size_t count, 9274 struct btrfs_ioctl_encoded_io_args *encoded, 9275 bool *unlocked) 9276 { 9277 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 9278 struct btrfs_root *root = inode->root; 9279 struct btrfs_fs_info *fs_info = root->fs_info; 9280 struct extent_io_tree *io_tree = &inode->io_tree; 9281 BTRFS_PATH_AUTO_FREE(path); 9282 struct extent_buffer *leaf; 9283 struct btrfs_file_extent_item *item; 9284 u64 ram_bytes; 9285 unsigned long ptr; 9286 void *tmp; 9287 ssize_t ret; 9288 const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); 9289 9290 path = btrfs_alloc_path(); 9291 if (!path) 9292 return -ENOMEM; 9293 9294 path->nowait = nowait; 9295 9296 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), 9297 extent_start, 0); 9298 if (ret) { 9299 if (unlikely(ret > 0)) { 9300 /* The extent item disappeared? */ 9301 return -EIO; 9302 } 9303 return ret; 9304 } 9305 leaf = path->nodes[0]; 9306 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 9307 9308 ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); 9309 ptr = btrfs_file_extent_inline_start(item); 9310 9311 encoded->len = min_t(u64, extent_start + ram_bytes, 9312 inode->vfs_inode.i_size) - iocb->ki_pos; 9313 ret = btrfs_encoded_io_compression_from_extent(fs_info, 9314 btrfs_file_extent_compression(leaf, item)); 9315 if (ret < 0) 9316 return ret; 9317 encoded->compression = ret; 9318 if (encoded->compression) { 9319 size_t inline_size; 9320 9321 inline_size = btrfs_file_extent_inline_item_len(leaf, 9322 path->slots[0]); 9323 if (inline_size > count) 9324 return -ENOBUFS; 9325 9326 count = inline_size; 9327 encoded->unencoded_len = ram_bytes; 9328 encoded->unencoded_offset = iocb->ki_pos - extent_start; 9329 } else { 9330 count = min_t(u64, count, encoded->len); 9331 encoded->len = count; 9332 encoded->unencoded_len = count; 9333 ptr += iocb->ki_pos - extent_start; 9334 } 9335 9336 tmp = kmalloc(count, GFP_NOFS); 9337 if (!tmp) 9338 return -ENOMEM; 9339 9340 read_extent_buffer(leaf, tmp, ptr, count); 9341 btrfs_release_path(path); 9342 btrfs_unlock_extent(io_tree, start, lockend, cached_state); 9343 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 9344 *unlocked = true; 9345 9346 ret = copy_to_iter(tmp, count, iter); 9347 if (ret != count) 9348 ret = -EFAULT; 9349 kfree(tmp); 9350 9351 return ret; 9352 } 9353 9354 struct btrfs_encoded_read_private { 9355 struct completion *sync_reads; 9356 void *uring_ctx; 9357 refcount_t pending_refs; 9358 blk_status_t status; 9359 }; 9360 9361 static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) 9362 { 9363 struct btrfs_encoded_read_private *priv = bbio->private; 9364 9365 if (bbio->bio.bi_status) { 9366 /* 9367 * The memory barrier implied by the refcount_dec_and_test() here 9368 * pairs with the memory barrier implied by the refcount_dec_and_test() 9369 * in btrfs_encoded_read_regular_fill_pages() to ensure that 9370 * this write is observed before the load of status in 9371 * btrfs_encoded_read_regular_fill_pages(). 
9372 */ 9373 WRITE_ONCE(priv->status, bbio->bio.bi_status); 9374 } 9375 if (refcount_dec_and_test(&priv->pending_refs)) { 9376 int err = blk_status_to_errno(READ_ONCE(priv->status)); 9377 9378 if (priv->uring_ctx) { 9379 btrfs_uring_read_extent_endio(priv->uring_ctx, err); 9380 kfree(priv); 9381 } else { 9382 complete(priv->sync_reads); 9383 } 9384 } 9385 bio_put(&bbio->bio); 9386 } 9387 9388 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, 9389 u64 disk_bytenr, u64 disk_io_size, 9390 struct page **pages, void *uring_ctx) 9391 { 9392 struct btrfs_fs_info *fs_info = inode->root->fs_info; 9393 struct btrfs_encoded_read_private *priv, sync_priv; 9394 struct completion sync_reads; 9395 unsigned long i = 0; 9396 struct btrfs_bio *bbio; 9397 int ret; 9398 9399 /* 9400 * Fast path for synchronous reads which completes in this call, io_uring 9401 * needs longer time span. 9402 */ 9403 if (uring_ctx) { 9404 priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); 9405 if (!priv) 9406 return -ENOMEM; 9407 } else { 9408 priv = &sync_priv; 9409 init_completion(&sync_reads); 9410 priv->sync_reads = &sync_reads; 9411 } 9412 9413 refcount_set(&priv->pending_refs, 1); 9414 priv->status = 0; 9415 priv->uring_ctx = uring_ctx; 9416 9417 bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, 9418 btrfs_encoded_read_endio, priv); 9419 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; 9420 bbio->inode = inode; 9421 9422 do { 9423 size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); 9424 9425 if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { 9426 refcount_inc(&priv->pending_refs); 9427 btrfs_submit_bbio(bbio, 0); 9428 9429 bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, 9430 btrfs_encoded_read_endio, priv); 9431 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; 9432 bbio->inode = inode; 9433 continue; 9434 } 9435 9436 i++; 9437 disk_bytenr += bytes; 9438 disk_io_size -= bytes; 9439 } while (disk_io_size); 9440 9441 refcount_inc(&priv->pending_refs); 9442 btrfs_submit_bbio(bbio, 0); 9443 9444 if (uring_ctx) { 9445 if (refcount_dec_and_test(&priv->pending_refs)) { 9446 ret = blk_status_to_errno(READ_ONCE(priv->status)); 9447 btrfs_uring_read_extent_endio(uring_ctx, ret); 9448 kfree(priv); 9449 return ret; 9450 } 9451 9452 return -EIOCBQUEUED; 9453 } else { 9454 if (!refcount_dec_and_test(&priv->pending_refs)) 9455 wait_for_completion_io(&sync_reads); 9456 /* See btrfs_encoded_read_endio() for ordering. 
*/ 9457 return blk_status_to_errno(READ_ONCE(priv->status)); 9458 } 9459 } 9460 9461 ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, 9462 u64 start, u64 lockend, 9463 struct extent_state **cached_state, 9464 u64 disk_bytenr, u64 disk_io_size, 9465 size_t count, bool compressed, bool *unlocked) 9466 { 9467 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 9468 struct extent_io_tree *io_tree = &inode->io_tree; 9469 struct page **pages; 9470 unsigned long nr_pages, i; 9471 u64 cur; 9472 size_t page_offset; 9473 ssize_t ret; 9474 9475 nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); 9476 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); 9477 if (!pages) 9478 return -ENOMEM; 9479 ret = btrfs_alloc_page_array(nr_pages, pages, false); 9480 if (ret) { 9481 ret = -ENOMEM; 9482 goto out; 9483 } 9484 9485 ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, 9486 disk_io_size, pages, NULL); 9487 if (ret) 9488 goto out; 9489 9490 btrfs_unlock_extent(io_tree, start, lockend, cached_state); 9491 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 9492 *unlocked = true; 9493 9494 if (compressed) { 9495 i = 0; 9496 page_offset = 0; 9497 } else { 9498 i = (iocb->ki_pos - start) >> PAGE_SHIFT; 9499 page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1); 9500 } 9501 cur = 0; 9502 while (cur < count) { 9503 size_t bytes = min_t(size_t, count - cur, 9504 PAGE_SIZE - page_offset); 9505 9506 if (copy_page_to_iter(pages[i], page_offset, bytes, 9507 iter) != bytes) { 9508 ret = -EFAULT; 9509 goto out; 9510 } 9511 i++; 9512 cur += bytes; 9513 page_offset = 0; 9514 } 9515 ret = count; 9516 out: 9517 for (i = 0; i < nr_pages; i++) { 9518 if (pages[i]) 9519 __free_page(pages[i]); 9520 } 9521 kfree(pages); 9522 return ret; 9523 } 9524 9525 ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, 9526 struct btrfs_ioctl_encoded_io_args *encoded, 9527 struct extent_state **cached_state, 9528 u64 *disk_bytenr, u64 *disk_io_size) 9529 { 9530 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 9531 struct btrfs_fs_info *fs_info = inode->root->fs_info; 9532 struct extent_io_tree *io_tree = &inode->io_tree; 9533 ssize_t ret; 9534 size_t count = iov_iter_count(iter); 9535 u64 start, lockend; 9536 struct extent_map *em; 9537 const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); 9538 bool unlocked = false; 9539 9540 file_accessed(iocb->ki_filp); 9541 9542 ret = btrfs_inode_lock(inode, 9543 BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0)); 9544 if (ret) 9545 return ret; 9546 9547 if (iocb->ki_pos >= inode->vfs_inode.i_size) { 9548 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 9549 return 0; 9550 } 9551 start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); 9552 /* 9553 * We don't know how long the extent containing iocb->ki_pos is, but if 9554 * it's compressed we know that it won't be longer than this. 
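 * (BTRFS_MAX_UNCOMPRESSED is currently 128K, so the range locked below
 * spans 128K from the sector-aligned start.)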
9555 */ 9556 lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; 9557 9558 if (nowait) { 9559 struct btrfs_ordered_extent *ordered; 9560 9561 if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping, 9562 start, lockend)) { 9563 ret = -EAGAIN; 9564 goto out_unlock_inode; 9565 } 9566 9567 if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) { 9568 ret = -EAGAIN; 9569 goto out_unlock_inode; 9570 } 9571 9572 ordered = btrfs_lookup_ordered_range(inode, start, 9573 lockend - start + 1); 9574 if (ordered) { 9575 btrfs_put_ordered_extent(ordered); 9576 btrfs_unlock_extent(io_tree, start, lockend, cached_state); 9577 ret = -EAGAIN; 9578 goto out_unlock_inode; 9579 } 9580 } else { 9581 for (;;) { 9582 struct btrfs_ordered_extent *ordered; 9583 9584 ret = btrfs_wait_ordered_range(inode, start, 9585 lockend - start + 1); 9586 if (ret) 9587 goto out_unlock_inode; 9588 9589 btrfs_lock_extent(io_tree, start, lockend, cached_state); 9590 ordered = btrfs_lookup_ordered_range(inode, start, 9591 lockend - start + 1); 9592 if (!ordered) 9593 break; 9594 btrfs_put_ordered_extent(ordered); 9595 btrfs_unlock_extent(io_tree, start, lockend, cached_state); 9596 cond_resched(); 9597 } 9598 } 9599 9600 em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); 9601 if (IS_ERR(em)) { 9602 ret = PTR_ERR(em); 9603 goto out_unlock_extent; 9604 } 9605 9606 if (em->disk_bytenr == EXTENT_MAP_INLINE) { 9607 u64 extent_start = em->start; 9608 9609 /* 9610 * For inline extents we get everything we need out of the 9611 * extent item. 9612 */ 9613 btrfs_free_extent_map(em); 9614 em = NULL; 9615 ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, 9616 cached_state, extent_start, 9617 count, encoded, &unlocked); 9618 goto out_unlock_extent; 9619 } 9620 9621 /* 9622 * We only want to return up to EOF even if the extent extends beyond 9623 * that. 9624 */ 9625 encoded->len = min_t(u64, btrfs_extent_map_end(em), 9626 inode->vfs_inode.i_size) - iocb->ki_pos; 9627 if (em->disk_bytenr == EXTENT_MAP_HOLE || 9628 (em->flags & EXTENT_FLAG_PREALLOC)) { 9629 *disk_bytenr = EXTENT_MAP_HOLE; 9630 count = min_t(u64, count, encoded->len); 9631 encoded->len = count; 9632 encoded->unencoded_len = count; 9633 } else if (btrfs_extent_map_is_compressed(em)) { 9634 *disk_bytenr = em->disk_bytenr; 9635 /* 9636 * Bail if the buffer isn't large enough to return the whole 9637 * compressed extent. 9638 */ 9639 if (em->disk_num_bytes > count) { 9640 ret = -ENOBUFS; 9641 goto out_em; 9642 } 9643 *disk_io_size = em->disk_num_bytes; 9644 count = em->disk_num_bytes; 9645 encoded->unencoded_len = em->ram_bytes; 9646 encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); 9647 ret = btrfs_encoded_io_compression_from_extent(fs_info, 9648 btrfs_extent_map_compression(em)); 9649 if (ret < 0) 9650 goto out_em; 9651 encoded->compression = ret; 9652 } else { 9653 *disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start); 9654 if (encoded->len > count) 9655 encoded->len = count; 9656 /* 9657 * Don't read beyond what we locked. This also limits the page 9658 * allocations that we'll do. 
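 * For example, assuming a 4K sector size, ki_pos at 6K (so start is 4K)
 * and encoded->len of 10K: disk_io_size becomes min(lockend + 1, 16K) -
 * 4K = 12K, count becomes 4K + 12K - 6K = 10K, and the final ALIGN()
 * leaves disk_io_size at 12K as it is already sector aligned.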
9659 */ 9660 *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; 9661 count = start + *disk_io_size - iocb->ki_pos; 9662 encoded->len = count; 9663 encoded->unencoded_len = count; 9664 *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize); 9665 } 9666 btrfs_free_extent_map(em); 9667 em = NULL; 9668 9669 if (*disk_bytenr == EXTENT_MAP_HOLE) { 9670 btrfs_unlock_extent(io_tree, start, lockend, cached_state); 9671 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 9672 unlocked = true; 9673 ret = iov_iter_zero(count, iter); 9674 if (ret != count) 9675 ret = -EFAULT; 9676 } else { 9677 ret = -EIOCBQUEUED; 9678 goto out_unlock_extent; 9679 } 9680 9681 out_em: 9682 btrfs_free_extent_map(em); 9683 out_unlock_extent: 9684 /* Leave inode and extent locked if we need to do a read. */ 9685 if (!unlocked && ret != -EIOCBQUEUED) 9686 btrfs_unlock_extent(io_tree, start, lockend, cached_state); 9687 out_unlock_inode: 9688 if (!unlocked && ret != -EIOCBQUEUED) 9689 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 9690 return ret; 9691 } 9692 9693 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, 9694 const struct btrfs_ioctl_encoded_io_args *encoded) 9695 { 9696 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 9697 struct btrfs_root *root = inode->root; 9698 struct btrfs_fs_info *fs_info = root->fs_info; 9699 struct extent_io_tree *io_tree = &inode->io_tree; 9700 struct extent_changeset *data_reserved = NULL; 9701 struct extent_state *cached_state = NULL; 9702 struct btrfs_ordered_extent *ordered; 9703 struct btrfs_file_extent file_extent; 9704 int compression; 9705 size_t orig_count; 9706 u64 start, end; 9707 u64 num_bytes, ram_bytes, disk_num_bytes; 9708 unsigned long nr_folios, i; 9709 struct folio **folios; 9710 struct btrfs_key ins; 9711 bool extent_reserved = false; 9712 struct extent_map *em; 9713 ssize_t ret; 9714 9715 switch (encoded->compression) { 9716 case BTRFS_ENCODED_IO_COMPRESSION_ZLIB: 9717 compression = BTRFS_COMPRESS_ZLIB; 9718 break; 9719 case BTRFS_ENCODED_IO_COMPRESSION_ZSTD: 9720 compression = BTRFS_COMPRESS_ZSTD; 9721 break; 9722 case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K: 9723 case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K: 9724 case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K: 9725 case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K: 9726 case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K: 9727 /* The sector size must match for LZO. */ 9728 if (encoded->compression - 9729 BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 != 9730 fs_info->sectorsize_bits) 9731 return -EINVAL; 9732 compression = BTRFS_COMPRESS_LZO; 9733 break; 9734 default: 9735 return -EINVAL; 9736 } 9737 if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) 9738 return -EINVAL; 9739 9740 /* 9741 * Compressed extents should always have checksums, so error out if we 9742 * have a NOCOW file or inode was created while mounted with NODATASUM. 9743 */ 9744 if (inode->flags & BTRFS_INODE_NODATASUM) 9745 return -EINVAL; 9746 9747 orig_count = iov_iter_count(from); 9748 9749 /* The extent size must be sane. */ 9750 if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED || 9751 orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0) 9752 return -EINVAL; 9753 9754 /* 9755 * The compressed data must be smaller than the decompressed data. 9756 * 9757 * It's of course possible for data to compress to larger or the same 9758 * size, but the buffered I/O path falls back to no compression for such 9759 * data, and we don't want to break any assumptions by creating these 9760 * extents. 
9761 * 9762 * Note that this is less strict than the current check we have that the 9763 * compressed data must be at least one sector smaller than the 9764 * decompressed data. We only want to enforce the weaker requirement 9765 * from old kernels that it is at least one byte smaller. 9766 */ 9767 if (orig_count >= encoded->unencoded_len) 9768 return -EINVAL; 9769 9770 /* The extent must start on a sector boundary. */ 9771 start = iocb->ki_pos; 9772 if (!IS_ALIGNED(start, fs_info->sectorsize)) 9773 return -EINVAL; 9774 9775 /* 9776 * The extent must end on a sector boundary. However, we allow a write 9777 * which ends at or extends i_size to have an unaligned length; we round 9778 * up the extent size and set i_size to the unaligned end. 9779 */ 9780 if (start + encoded->len < inode->vfs_inode.i_size && 9781 !IS_ALIGNED(start + encoded->len, fs_info->sectorsize)) 9782 return -EINVAL; 9783 9784 /* Finally, the offset in the unencoded data must be sector-aligned. */ 9785 if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize)) 9786 return -EINVAL; 9787 9788 num_bytes = ALIGN(encoded->len, fs_info->sectorsize); 9789 ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize); 9790 end = start + num_bytes - 1; 9791 9792 /* 9793 * If the extent cannot be inline, the compressed data on disk must be 9794 * sector-aligned. For convenience, we extend it with zeroes if it 9795 * isn't. 9796 */ 9797 disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); 9798 nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); 9799 folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT); 9800 if (!folios) 9801 return -ENOMEM; 9802 for (i = 0; i < nr_folios; i++) { 9803 size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); 9804 char *kaddr; 9805 9806 folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0); 9807 if (!folios[i]) { 9808 ret = -ENOMEM; 9809 goto out_folios; 9810 } 9811 kaddr = kmap_local_folio(folios[i], 0); 9812 if (copy_from_iter(kaddr, bytes, from) != bytes) { 9813 kunmap_local(kaddr); 9814 ret = -EFAULT; 9815 goto out_folios; 9816 } 9817 if (bytes < PAGE_SIZE) 9818 memset(kaddr + bytes, 0, PAGE_SIZE - bytes); 9819 kunmap_local(kaddr); 9820 } 9821 9822 for (;;) { 9823 struct btrfs_ordered_extent *ordered; 9824 9825 ret = btrfs_wait_ordered_range(inode, start, num_bytes); 9826 if (ret) 9827 goto out_folios; 9828 ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, 9829 start >> PAGE_SHIFT, 9830 end >> PAGE_SHIFT); 9831 if (ret) 9832 goto out_folios; 9833 btrfs_lock_extent(io_tree, start, end, &cached_state); 9834 ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); 9835 if (!ordered && 9836 !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) 9837 break; 9838 if (ordered) 9839 btrfs_put_ordered_extent(ordered); 9840 btrfs_unlock_extent(io_tree, start, end, &cached_state); 9841 cond_resched(); 9842 } 9843 9844 /* 9845 * We don't use the higher-level delalloc space functions because our 9846 * num_bytes and disk_num_bytes are different. 9847 */ 9848 ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes); 9849 if (ret) 9850 goto out_unlock; 9851 ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); 9852 if (ret) 9853 goto out_free_data_space; 9854 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes, 9855 false); 9856 if (ret) 9857 goto out_qgroup_free_data; 9858 9859 /* Try an inline extent first. 
*/ 9860 if (encoded->unencoded_len == encoded->len && 9861 encoded->unencoded_offset == 0 && 9862 can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { 9863 ret = __cow_file_range_inline(inode, encoded->len, 9864 orig_count, compression, folios[0], 9865 true); 9866 if (ret <= 0) { 9867 if (ret == 0) 9868 ret = orig_count; 9869 goto out_delalloc_release; 9870 } 9871 } 9872 9873 ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, 9874 disk_num_bytes, 0, 0, &ins, 1, 1); 9875 if (ret) 9876 goto out_delalloc_release; 9877 extent_reserved = true; 9878 9879 file_extent.disk_bytenr = ins.objectid; 9880 file_extent.disk_num_bytes = ins.offset; 9881 file_extent.num_bytes = num_bytes; 9882 file_extent.ram_bytes = ram_bytes; 9883 file_extent.offset = encoded->unencoded_offset; 9884 file_extent.compression = compression; 9885 em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); 9886 if (IS_ERR(em)) { 9887 ret = PTR_ERR(em); 9888 goto out_free_reserved; 9889 } 9890 btrfs_free_extent_map(em); 9891 9892 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, 9893 (1U << BTRFS_ORDERED_ENCODED) | 9894 (1U << BTRFS_ORDERED_COMPRESSED)); 9895 if (IS_ERR(ordered)) { 9896 btrfs_drop_extent_map_range(inode, start, end, false); 9897 ret = PTR_ERR(ordered); 9898 goto out_free_reserved; 9899 } 9900 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 9901 9902 if (start + encoded->len > inode->vfs_inode.i_size) 9903 i_size_write(&inode->vfs_inode, start + encoded->len); 9904 9905 btrfs_unlock_extent(io_tree, start, end, &cached_state); 9906 9907 btrfs_delalloc_release_extents(inode, num_bytes); 9908 9909 btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false); 9910 ret = orig_count; 9911 goto out; 9912 9913 out_free_reserved: 9914 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 9915 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); 9916 out_delalloc_release: 9917 btrfs_delalloc_release_extents(inode, num_bytes); 9918 btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); 9919 out_qgroup_free_data: 9920 if (ret < 0) 9921 btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL); 9922 out_free_data_space: 9923 /* 9924 * If btrfs_reserve_extent() succeeded, then we already decremented 9925 * bytes_may_use. 9926 */ 9927 if (!extent_reserved) 9928 btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes); 9929 out_unlock: 9930 btrfs_unlock_extent(io_tree, start, end, &cached_state); 9931 out_folios: 9932 for (i = 0; i < nr_folios; i++) { 9933 if (folios[i]) 9934 folio_put(folios[i]); 9935 } 9936 kvfree(folios); 9937 out: 9938 if (ret >= 0) 9939 iocb->ki_pos += encoded->len; 9940 return ret; 9941 } 9942 9943 #ifdef CONFIG_SWAP 9944 /* 9945 * Add an entry indicating a block group or device which is pinned by a 9946 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a 9947 * negative errno on failure. 
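 * Entries are keyed by (ptr, inode) in the fs_info->swapfile_pins rbtree;
 * re-adding an existing block group entry only bumps its bg_extent_count.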
9948 */ 9949 static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, 9950 bool is_block_group) 9951 { 9952 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 9953 struct btrfs_swapfile_pin *sp, *entry; 9954 struct rb_node **p; 9955 struct rb_node *parent = NULL; 9956 9957 sp = kmalloc(sizeof(*sp), GFP_NOFS); 9958 if (!sp) 9959 return -ENOMEM; 9960 sp->ptr = ptr; 9961 sp->inode = inode; 9962 sp->is_block_group = is_block_group; 9963 sp->bg_extent_count = 1; 9964 9965 spin_lock(&fs_info->swapfile_pins_lock); 9966 p = &fs_info->swapfile_pins.rb_node; 9967 while (*p) { 9968 parent = *p; 9969 entry = rb_entry(parent, struct btrfs_swapfile_pin, node); 9970 if (sp->ptr < entry->ptr || 9971 (sp->ptr == entry->ptr && sp->inode < entry->inode)) { 9972 p = &(*p)->rb_left; 9973 } else if (sp->ptr > entry->ptr || 9974 (sp->ptr == entry->ptr && sp->inode > entry->inode)) { 9975 p = &(*p)->rb_right; 9976 } else { 9977 if (is_block_group) 9978 entry->bg_extent_count++; 9979 spin_unlock(&fs_info->swapfile_pins_lock); 9980 kfree(sp); 9981 return 1; 9982 } 9983 } 9984 rb_link_node(&sp->node, parent, p); 9985 rb_insert_color(&sp->node, &fs_info->swapfile_pins); 9986 spin_unlock(&fs_info->swapfile_pins_lock); 9987 return 0; 9988 } 9989 9990 /* Free all of the entries pinned by this swapfile. */ 9991 static void btrfs_free_swapfile_pins(struct inode *inode) 9992 { 9993 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 9994 struct btrfs_swapfile_pin *sp; 9995 struct rb_node *node, *next; 9996 9997 spin_lock(&fs_info->swapfile_pins_lock); 9998 node = rb_first(&fs_info->swapfile_pins); 9999 while (node) { 10000 next = rb_next(node); 10001 sp = rb_entry(node, struct btrfs_swapfile_pin, node); 10002 if (sp->inode == inode) { 10003 rb_erase(&sp->node, &fs_info->swapfile_pins); 10004 if (sp->is_block_group) { 10005 btrfs_dec_block_group_swap_extents(sp->ptr, 10006 sp->bg_extent_count); 10007 btrfs_put_block_group(sp->ptr); 10008 } 10009 kfree(sp); 10010 } 10011 node = next; 10012 } 10013 spin_unlock(&fs_info->swapfile_pins_lock); 10014 } 10015 10016 struct btrfs_swap_info { 10017 u64 start; 10018 u64 block_start; 10019 u64 block_len; 10020 u64 lowest_ppage; 10021 u64 highest_ppage; 10022 unsigned long nr_pages; 10023 int nr_extents; 10024 }; 10025 10026 static int btrfs_add_swap_extent(struct swap_info_struct *sis, 10027 struct btrfs_swap_info *bsi) 10028 { 10029 unsigned long nr_pages; 10030 unsigned long max_pages; 10031 u64 first_ppage, first_ppage_reported, next_ppage; 10032 int ret; 10033 10034 /* 10035 * Our swapfile may have had its size extended after the swap header was 10036 * written. In that case activating the swapfile should not go beyond 10037 * the max size set in the swap header. 
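 * Also note that only physical pages fully covered by this extent are
 * handed to the swap layer: the start is rounded up and the end rounded
 * down to page boundaries, and an extent that does not cover a full page
 * is skipped entirely.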
10038 */ 10039 if (bsi->nr_pages >= sis->max) 10040 return 0; 10041 10042 max_pages = sis->max - bsi->nr_pages; 10043 first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; 10044 next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; 10045 10046 if (first_ppage >= next_ppage) 10047 return 0; 10048 nr_pages = next_ppage - first_ppage; 10049 nr_pages = min(nr_pages, max_pages); 10050 10051 first_ppage_reported = first_ppage; 10052 if (bsi->start == 0) 10053 first_ppage_reported++; 10054 if (bsi->lowest_ppage > first_ppage_reported) 10055 bsi->lowest_ppage = first_ppage_reported; 10056 if (bsi->highest_ppage < (next_ppage - 1)) 10057 bsi->highest_ppage = next_ppage - 1; 10058 10059 ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); 10060 if (ret < 0) 10061 return ret; 10062 bsi->nr_extents += ret; 10063 bsi->nr_pages += nr_pages; 10064 return 0; 10065 } 10066 10067 static void btrfs_swap_deactivate(struct file *file) 10068 { 10069 struct inode *inode = file_inode(file); 10070 10071 btrfs_free_swapfile_pins(inode); 10072 atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); 10073 } 10074 10075 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, 10076 sector_t *span) 10077 { 10078 struct inode *inode = file_inode(file); 10079 struct btrfs_root *root = BTRFS_I(inode)->root; 10080 struct btrfs_fs_info *fs_info = root->fs_info; 10081 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 10082 struct extent_state *cached_state = NULL; 10083 struct btrfs_chunk_map *map = NULL; 10084 struct btrfs_device *device = NULL; 10085 struct btrfs_swap_info bsi = { 10086 .lowest_ppage = (sector_t)-1ULL, 10087 }; 10088 struct btrfs_backref_share_check_ctx *backref_ctx = NULL; 10089 struct btrfs_path *path = NULL; 10090 int ret = 0; 10091 u64 isize; 10092 u64 prev_extent_end = 0; 10093 10094 /* 10095 * Acquire the inode's mmap lock to prevent races with memory mapped 10096 * writes, as they could happen after we flush delalloc below and before 10097 * we lock the extent range further below. The inode was already locked 10098 * up in the call chain. 10099 */ 10100 btrfs_assert_inode_locked(BTRFS_I(inode)); 10101 down_write(&BTRFS_I(inode)->i_mmap_lock); 10102 10103 /* 10104 * If the swap file was just created, make sure delalloc is done. If the 10105 * file changes again after this, the user is doing something stupid and 10106 * we don't really care. 10107 */ 10108 ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); 10109 if (ret) 10110 goto out_unlock_mmap; 10111 10112 /* 10113 * The inode is locked, so these flags won't change after we check them. 10114 */ 10115 if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { 10116 btrfs_warn(fs_info, "swapfile must not be compressed"); 10117 ret = -EINVAL; 10118 goto out_unlock_mmap; 10119 } 10120 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { 10121 btrfs_warn(fs_info, "swapfile must not be copy-on-write"); 10122 ret = -EINVAL; 10123 goto out_unlock_mmap; 10124 } 10125 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 10126 btrfs_warn(fs_info, "swapfile must not be checksummed"); 10127 ret = -EINVAL; 10128 goto out_unlock_mmap; 10129 } 10130 10131 path = btrfs_alloc_path(); 10132 backref_ctx = btrfs_alloc_backref_share_check_ctx(); 10133 if (!path || !backref_ctx) { 10134 ret = -ENOMEM; 10135 goto out_unlock_mmap; 10136 } 10137 10138 /* 10139 * Balance or device remove/replace/resize can move stuff around from 10140 * under us. 
The exclop protection makes sure they aren't running/won't 10141 * run concurrently while we are mapping the swap extents, and 10142 * fs_info->swapfile_pins prevents them from running while the swap 10143 * file is active and moving the extents. Note that this also prevents 10144 * a concurrent device add which isn't actually necessary, but it's not 10145 * really worth the trouble to allow it. 10146 */ 10147 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { 10148 btrfs_warn(fs_info, 10149 "cannot activate swapfile while exclusive operation is running"); 10150 ret = -EBUSY; 10151 goto out_unlock_mmap; 10152 } 10153 10154 /* 10155 * Prevent snapshot creation while we are activating the swap file. 10156 * We do not want to race with snapshot creation. If snapshot creation 10157 * already started before we bumped nr_swapfiles from 0 to 1 and 10158 * completes before the first write into the swap file after it is 10159 * activated, then that write would fall back to COW. 10160 */ 10161 if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) { 10162 btrfs_exclop_finish(fs_info); 10163 btrfs_warn(fs_info, 10164 "cannot activate swapfile because snapshot creation is in progress"); 10165 ret = -EINVAL; 10166 goto out_unlock_mmap; 10167 } 10168 /* 10169 * Snapshots can create extents which require COW even if NODATACOW is 10170 * set. We use this counter to prevent snapshots. We must increment it 10171 * before walking the extents because we don't want a concurrent 10172 * snapshot to run after we've already checked the extents. 10173 * 10174 * It is possible that the subvolume is marked for deletion but not yet 10175 * removed. To prevent this race, we check the root status before 10176 * activating the swapfile. 10177 */ 10178 spin_lock(&root->root_item_lock); 10179 if (btrfs_root_dead(root)) { 10180 spin_unlock(&root->root_item_lock); 10181 10182 btrfs_drew_write_unlock(&root->snapshot_lock); 10183 btrfs_exclop_finish(fs_info); 10184 btrfs_warn(fs_info, 10185 "cannot activate swapfile because subvolume %llu is being deleted", 10186 btrfs_root_id(root)); 10187 ret = -EPERM; 10188 goto out_unlock_mmap; 10189 } 10190 atomic_inc(&root->nr_swapfiles); 10191 spin_unlock(&root->root_item_lock); 10192 10193 isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); 10194 10195 btrfs_lock_extent(io_tree, 0, isize - 1, &cached_state); 10196 while (prev_extent_end < isize) { 10197 struct btrfs_key key; 10198 struct extent_buffer *leaf; 10199 struct btrfs_file_extent_item *ei; 10200 struct btrfs_block_group *bg; 10201 u64 logical_block_start; 10202 u64 physical_block_start; 10203 u64 extent_gen; 10204 u64 disk_bytenr; 10205 u64 len; 10206 10207 key.objectid = btrfs_ino(BTRFS_I(inode)); 10208 key.type = BTRFS_EXTENT_DATA_KEY; 10209 key.offset = prev_extent_end; 10210 10211 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 10212 if (ret < 0) 10213 goto out; 10214 10215 /* 10216 * If the key is not found it means we have an implicit hole (NO_HOLES 10217 * is enabled).
10218 */ 10219 if (ret > 0) { 10220 btrfs_warn(fs_info, "swapfile must not have holes"); 10221 ret = -EINVAL; 10222 goto out; 10223 } 10224 10225 leaf = path->nodes[0]; 10226 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 10227 10228 if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { 10229 /* 10230 * It's unlikely we'll ever actually find ourselves 10231 * here, as a file small enough to fit inline won't be 10232 * big enough to store more than the swap header, but in 10233 * case something changes in the future, let's catch it 10234 * here rather than later. 10235 */ 10236 btrfs_warn(fs_info, "swapfile must not be inline"); 10237 ret = -EINVAL; 10238 goto out; 10239 } 10240 10241 if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { 10242 btrfs_warn(fs_info, "swapfile must not be compressed"); 10243 ret = -EINVAL; 10244 goto out; 10245 } 10246 10247 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); 10248 if (disk_bytenr == 0) { 10249 btrfs_warn(fs_info, "swapfile must not have holes"); 10250 ret = -EINVAL; 10251 goto out; 10252 } 10253 10254 logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); 10255 extent_gen = btrfs_file_extent_generation(leaf, ei); 10256 prev_extent_end = btrfs_file_extent_end(path); 10257 10258 if (prev_extent_end > isize) 10259 len = isize - key.offset; 10260 else 10261 len = btrfs_file_extent_num_bytes(leaf, ei); 10262 10263 backref_ctx->curr_leaf_bytenr = leaf->start; 10264 10265 /* 10266 * Don't need the path anymore, release to avoid deadlocks when 10267 * calling btrfs_is_data_extent_shared() because when joining a 10268 * transaction it can block waiting for the current one's commit 10269 * which in turn may be trying to lock the same leaf to flush 10270 * delayed items for example. 10271 */ 10272 btrfs_release_path(path); 10273 10274 ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, 10275 extent_gen, backref_ctx); 10276 if (ret < 0) { 10277 goto out; 10278 } else if (ret > 0) { 10279 btrfs_warn(fs_info, 10280 "swapfile must not be copy-on-write"); 10281 ret = -EINVAL; 10282 goto out; 10283 } 10284 10285 map = btrfs_get_chunk_map(fs_info, logical_block_start, len); 10286 if (IS_ERR(map)) { 10287 ret = PTR_ERR(map); 10288 goto out; 10289 } 10290 10291 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 10292 btrfs_warn(fs_info, 10293 "swapfile must have single data profile"); 10294 ret = -EINVAL; 10295 goto out; 10296 } 10297 10298 if (device == NULL) { 10299 device = map->stripes[0].dev; 10300 ret = btrfs_add_swapfile_pin(inode, device, false); 10301 if (ret == 1) 10302 ret = 0; 10303 else if (ret) 10304 goto out; 10305 } else if (device != map->stripes[0].dev) { 10306 btrfs_warn(fs_info, "swapfile must be on one device"); 10307 ret = -EINVAL; 10308 goto out; 10309 } 10310 10311 physical_block_start = (map->stripes[0].physical + 10312 (logical_block_start - map->start)); 10313 btrfs_free_chunk_map(map); 10314 map = NULL; 10315 10316 bg = btrfs_lookup_block_group(fs_info, logical_block_start); 10317 if (!bg) { 10318 btrfs_warn(fs_info, 10319 "could not find block group containing swapfile"); 10320 ret = -EINVAL; 10321 goto out; 10322 } 10323 10324 if (!btrfs_inc_block_group_swap_extents(bg)) { 10325 btrfs_warn(fs_info, 10326 "block group for swapfile at %llu is read-only%s", 10327 bg->start, 10328 atomic_read(&fs_info->scrubs_running) ? 
10329 " (scrub running)" : ""); 10330 btrfs_put_block_group(bg); 10331 ret = -EINVAL; 10332 goto out; 10333 } 10334 10335 ret = btrfs_add_swapfile_pin(inode, bg, true); 10336 if (ret) { 10337 btrfs_put_block_group(bg); 10338 if (ret == 1) 10339 ret = 0; 10340 else 10341 goto out; 10342 } 10343 10344 if (bsi.block_len && 10345 bsi.block_start + bsi.block_len == physical_block_start) { 10346 bsi.block_len += len; 10347 } else { 10348 if (bsi.block_len) { 10349 ret = btrfs_add_swap_extent(sis, &bsi); 10350 if (ret) 10351 goto out; 10352 } 10353 bsi.start = key.offset; 10354 bsi.block_start = physical_block_start; 10355 bsi.block_len = len; 10356 } 10357 10358 if (fatal_signal_pending(current)) { 10359 ret = -EINTR; 10360 goto out; 10361 } 10362 10363 cond_resched(); 10364 } 10365 10366 if (bsi.block_len) 10367 ret = btrfs_add_swap_extent(sis, &bsi); 10368 10369 out: 10370 if (!IS_ERR_OR_NULL(map)) 10371 btrfs_free_chunk_map(map); 10372 10373 btrfs_unlock_extent(io_tree, 0, isize - 1, &cached_state); 10374 10375 if (ret) 10376 btrfs_swap_deactivate(file); 10377 10378 btrfs_drew_write_unlock(&root->snapshot_lock); 10379 10380 btrfs_exclop_finish(fs_info); 10381 10382 out_unlock_mmap: 10383 up_write(&BTRFS_I(inode)->i_mmap_lock); 10384 btrfs_free_backref_share_ctx(backref_ctx); 10385 btrfs_free_path(path); 10386 if (ret) 10387 return ret; 10388 10389 if (device) 10390 sis->bdev = device->bdev; 10391 *span = bsi.highest_ppage - bsi.lowest_ppage + 1; 10392 sis->max = bsi.nr_pages; 10393 sis->pages = bsi.nr_pages - 1; 10394 return bsi.nr_extents; 10395 } 10396 #else 10397 static void btrfs_swap_deactivate(struct file *file) 10398 { 10399 } 10400 10401 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, 10402 sector_t *span) 10403 { 10404 return -EOPNOTSUPP; 10405 } 10406 #endif 10407 10408 /* 10409 * Update the number of bytes used in the VFS' inode. When we replace extents in 10410 * a range (clone, dedupe, fallocate's zero range), we must update the number of 10411 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls 10412 * always get a correct value. 10413 */ 10414 void btrfs_update_inode_bytes(struct btrfs_inode *inode, 10415 const u64 add_bytes, 10416 const u64 del_bytes) 10417 { 10418 if (add_bytes == del_bytes) 10419 return; 10420 10421 spin_lock(&inode->lock); 10422 if (del_bytes > 0) 10423 inode_sub_bytes(&inode->vfs_inode, del_bytes); 10424 if (add_bytes > 0) 10425 inode_add_bytes(&inode->vfs_inode, add_bytes); 10426 spin_unlock(&inode->lock); 10427 } 10428 10429 /* 10430 * Verify that there are no ordered extents for a given file range. 10431 * 10432 * @inode: The target inode. 10433 * @start: Start offset of the file range, should be sector size aligned. 10434 * @end: End offset (inclusive) of the file range, its value +1 should be 10435 * sector size aligned. 10436 * 10437 * This should typically be used for cases where we locked an inode's VFS lock in 10438 * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode, 10439 * we have flushed all delalloc in the range, we have waited for all ordered 10440 * extents in the range to complete and finally we have locked the file range in 10441 * the inode's io_tree. 
10442 */ 10443 void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end) 10444 { 10445 struct btrfs_root *root = inode->root; 10446 struct btrfs_ordered_extent *ordered; 10447 10448 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) 10449 return; 10450 10451 ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start); 10452 if (ordered) { 10453 btrfs_err(root->fs_info, 10454 "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", 10455 start, end, btrfs_ino(inode), btrfs_root_id(root), 10456 ordered->file_offset, 10457 ordered->file_offset + ordered->num_bytes - 1); 10458 btrfs_put_ordered_extent(ordered); 10459 } 10460 10461 ASSERT(ordered == NULL); 10462 } 10463 10464 /* 10465 * Find the first inode with a minimum number. 10466 * 10467 * @root: The root to search for. 10468 * @min_ino: The minimum inode number. 10469 * 10470 * Find the first inode in the @root with a number >= @min_ino and return it. 10471 * Returns NULL if no such inode found. 10472 */ 10473 struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino) 10474 { 10475 struct btrfs_inode *inode; 10476 unsigned long from = min_ino; 10477 10478 xa_lock(&root->inodes); 10479 while (true) { 10480 inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); 10481 if (!inode) 10482 break; 10483 if (igrab(&inode->vfs_inode)) 10484 break; 10485 10486 from = btrfs_ino(inode) + 1; 10487 cond_resched_lock(&root->inodes.xa_lock); 10488 } 10489 xa_unlock(&root->inodes); 10490 10491 return inode; 10492 } 10493 10494 static const struct inode_operations btrfs_dir_inode_operations = { 10495 .getattr = btrfs_getattr, 10496 .lookup = btrfs_lookup, 10497 .create = btrfs_create, 10498 .unlink = btrfs_unlink, 10499 .link = btrfs_link, 10500 .mkdir = btrfs_mkdir, 10501 .rmdir = btrfs_rmdir, 10502 .rename = btrfs_rename2, 10503 .symlink = btrfs_symlink, 10504 .setattr = btrfs_setattr, 10505 .mknod = btrfs_mknod, 10506 .listxattr = btrfs_listxattr, 10507 .permission = btrfs_permission, 10508 .get_inode_acl = btrfs_get_acl, 10509 .set_acl = btrfs_set_acl, 10510 .update_time = btrfs_update_time, 10511 .tmpfile = btrfs_tmpfile, 10512 .fileattr_get = btrfs_fileattr_get, 10513 .fileattr_set = btrfs_fileattr_set, 10514 }; 10515 10516 static const struct file_operations btrfs_dir_file_operations = { 10517 .llseek = btrfs_dir_llseek, 10518 .read = generic_read_dir, 10519 .iterate_shared = btrfs_real_readdir, 10520 .open = btrfs_opendir, 10521 .unlocked_ioctl = btrfs_ioctl, 10522 #ifdef CONFIG_COMPAT 10523 .compat_ioctl = btrfs_compat_ioctl, 10524 #endif 10525 .release = btrfs_release_file, 10526 .fsync = btrfs_sync_file, 10527 }; 10528 10529 /* 10530 * btrfs doesn't support the bmap operation because swapfiles 10531 * use bmap to make a mapping of extents in the file. They assume 10532 * these extents won't change over the life of the file and they 10533 * use the bmap result to do IO directly to the drive. 10534 * 10535 * the btrfs bmap call would return logical addresses that aren't 10536 * suitable for IO and they also will change frequently as COW 10537 * operations happen. So, swapfile + btrfs == corruption. 10538 * 10539 * For now we're avoiding this by dropping bmap. 
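 * Instead, swapfile support is implemented through the ->swap_activate
 * and ->swap_deactivate address space operations below.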
10540 */ 10541 static const struct address_space_operations btrfs_aops = { 10542 .read_folio = btrfs_read_folio, 10543 .writepages = btrfs_writepages, 10544 .readahead = btrfs_readahead, 10545 .invalidate_folio = btrfs_invalidate_folio, 10546 .launder_folio = btrfs_launder_folio, 10547 .release_folio = btrfs_release_folio, 10548 .migrate_folio = btrfs_migrate_folio, 10549 .dirty_folio = filemap_dirty_folio, 10550 .error_remove_folio = generic_error_remove_folio, 10551 .swap_activate = btrfs_swap_activate, 10552 .swap_deactivate = btrfs_swap_deactivate, 10553 }; 10554 10555 static const struct inode_operations btrfs_file_inode_operations = { 10556 .getattr = btrfs_getattr, 10557 .setattr = btrfs_setattr, 10558 .listxattr = btrfs_listxattr, 10559 .permission = btrfs_permission, 10560 .fiemap = btrfs_fiemap, 10561 .get_inode_acl = btrfs_get_acl, 10562 .set_acl = btrfs_set_acl, 10563 .update_time = btrfs_update_time, 10564 .fileattr_get = btrfs_fileattr_get, 10565 .fileattr_set = btrfs_fileattr_set, 10566 }; 10567 static const struct inode_operations btrfs_special_inode_operations = { 10568 .getattr = btrfs_getattr, 10569 .setattr = btrfs_setattr, 10570 .permission = btrfs_permission, 10571 .listxattr = btrfs_listxattr, 10572 .get_inode_acl = btrfs_get_acl, 10573 .set_acl = btrfs_set_acl, 10574 .update_time = btrfs_update_time, 10575 }; 10576 static const struct inode_operations btrfs_symlink_inode_operations = { 10577 .get_link = page_get_link, 10578 .getattr = btrfs_getattr, 10579 .setattr = btrfs_setattr, 10580 .permission = btrfs_permission, 10581 .listxattr = btrfs_listxattr, 10582 .update_time = btrfs_update_time, 10583 }; 10584 10585 const struct dentry_operations btrfs_dentry_operations = { 10586 .d_delete = btrfs_dentry_delete, 10587 }; 10588