// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "space-info.h"

struct btrfs_iget_args {
        u64 ino;
        struct btrfs_root *root;
};

struct btrfs_dio_data {
        u64 reserve;
        loff_t length;
        ssize_t submitted;
        struct extent_changeset *data_reserved;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
struct kmem_cache *btrfs_free_space_bitmap_cachep;

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct btrfs_inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written, int unlock);
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
                                       u64 len, u64 orig_start, u64 block_start,
                                       u64 block_len, u64 orig_block_len,
                                       u64 ram_bytes, int compress_type,
                                       int type);

static void __endio_write_update_ordered(struct btrfs_inode *inode,
                                         const u64 offset, const u64 bytes,
                                         const bool uptodate);

/*
 * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
 *
 * ilock_flags can have the following bits set:
 *
 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
 * BTRFS_ILOCK_TRY - try to acquire the lock; if it fails on the first attempt,
 *                   return -EAGAIN
 */
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
{
        if (ilock_flags & BTRFS_ILOCK_SHARED) {
                if (ilock_flags & BTRFS_ILOCK_TRY) {
                        if (!inode_trylock_shared(inode))
                                return -EAGAIN;
                        else
                                return 0;
                }
                inode_lock_shared(inode);
        } else {
                if (ilock_flags & BTRFS_ILOCK_TRY) {
                        if (!inode_trylock(inode))
                                return -EAGAIN;
                        else
                                return 0;
                }
                inode_lock(inode);
        }
        return 0;
}

/*
 * btrfs_inode_unlock - unlock inode i_rwsem
 *
 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
 * to decide whether the lock acquired is shared or exclusive.
 */
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
{
        if (ilock_flags & BTRFS_ILOCK_SHARED)
                inode_unlock_shared(inode);
        else
                inode_unlock(inode);
}
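
/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * file): a caller that must not sleep waiting for i_rwsem could pair the two
 * helpers above like this, using the BTRFS_ILOCK_* flags they document:
 *
 *      ret = btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED | BTRFS_ILOCK_TRY);
 *      if (ret)
 *              return ret;     (ret is -EAGAIN when the trylock fails)
 *      ... read-mostly work under the shared lock ...
 *      btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 */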

/*
 * Cleanup all submitted ordered extents in specified range to handle errors
 * from the btrfs_run_delalloc_range() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()).
 */
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
                                                 struct page *locked_page,
                                                 u64 offset, u64 bytes)
{
        unsigned long index = offset >> PAGE_SHIFT;
        unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
        u64 page_start = page_offset(locked_page);
        u64 page_end = page_start + PAGE_SIZE - 1;

        struct page *page;

        while (index <= end_index) {
                page = find_get_page(inode->vfs_inode.i_mapping, index);
                index++;
                if (!page)
                        continue;
                ClearPagePrivate2(page);
                put_page(page);
        }

        /*
         * In case this page belongs to the delalloc range being instantiated
         * then skip it, since the first page of a range is going to be
         * properly cleaned up by the caller of run_delalloc_range
         */
        if (page_start >= offset && page_end <= (offset + bytes - 1)) {
                offset += PAGE_SIZE;
                bytes -= PAGE_SIZE;
        }

        return __endio_write_update_ordered(inode, offset, bytes, false);
}

static int btrfs_dirty_inode(struct inode *inode);

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode, struct inode *dir,
                                     const struct qstr *qstr)
{
        int err;

        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
                err = btrfs_xattr_security_init(trans, inode, dir, qstr);
        return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path, bool extent_inserted,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
                                int compress_type,
                                struct page **compressed_pages)
{
        struct extent_buffer *leaf;
        struct page *page = NULL;
        char *kaddr;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        int ret;
        size_t cur_size = size;
        unsigned long offset;

        ASSERT((compressed_size > 0 && compressed_pages) ||
               (compressed_size == 0 && !compressed_pages));

        if (compressed_size && compressed_pages)
                cur_size = compressed_size;

        if (!extent_inserted) {
                struct btrfs_key key;
                size_t datasize;

                key.objectid = btrfs_ino(BTRFS_I(inode));
                key.offset = start;
                key.type = BTRFS_EXTENT_DATA_KEY;

                datasize = btrfs_file_extent_calc_inline_size(cur_size);
                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              datasize);
                if (ret)
                        goto fail;
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
                        cpage = compressed_pages[i];
                        cur_size = min_t(unsigned long, compressed_size,
                                         PAGE_SIZE);

                        kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
                        kunmap_atomic(kaddr);

                        i++;
                        ptr += cur_size;
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_atomic(page);
                offset = offset_in_page(start);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
                kunmap_atomic(kaddr);
                put_page(page);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);

        /*
         * We align size to sectorsize for inline extents just for simplicity
         * sake.
         */
        size = ALIGN(size, root->fs_info->sectorsize);
        ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
        if (ret)
                goto fail;

        /*
         * we're an inline extent, so nobody can
         * extend the file past i_size without locking
         * a page we already have locked.
         *
         * We must do any isize and inode updates
         * before we unlock the pages.  Otherwise we
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
fail:
        return ret;
}
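
/*
 * Worked example (editorial illustration, not in the original source): for an
 * uncompressed 3000 byte inline write at offset 0, the item above stores
 * ram_bytes = 3000, the leaf item is sized by
 * btrfs_file_extent_calc_inline_size(3000), and the cached file extent range
 * is rounded up to ALIGN(3000, sectorsize), i.e. 4096 on a 4K filesystem.
 */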

/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
                                          u64 end, size_t compressed_size,
                                          int compress_type,
                                          struct page **compressed_pages)
{
        struct btrfs_drop_extents_args drop_args = { 0 };
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        u64 isize = i_size_read(&inode->vfs_inode);
        u64 actual_end = min(end + 1, isize);
        u64 inline_len = actual_end - start;
        u64 aligned_end = ALIGN(end, fs_info->sectorsize);
        u64 data_len = inline_len;
        int ret;
        struct btrfs_path *path;

        if (compressed_size)
                data_len = compressed_size;

        if (start > 0 ||
            actual_end > fs_info->sectorsize ||
            data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
            (!compressed_size &&
            (actual_end & (fs_info->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
            data_len > fs_info->max_inline) {
                return 1;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        trans->block_rsv = &inode->block_rsv;

        drop_args.path = path;
        drop_args.start = start;
        drop_args.end = aligned_end;
        drop_args.drop_cache = true;
        drop_args.replace_extent = true;

        if (compressed_size && compressed_pages)
                drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
                   compressed_size);
        else
                drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
                    inline_len);

        ret = btrfs_drop_extents(trans, root, inode, &drop_args);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
        ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
                                   root, &inode->vfs_inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
        if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        } else if (ret == -ENOSPC) {
                ret = 1;
                goto out;
        }

        btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
        ret = btrfs_update_inode(trans, root, inode);
        if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        } else if (ret == -ENOSPC) {
                ret = 1;
                goto out;
        }

        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
out:
        /*
         * Don't forget to free the reserved space, as for inlined extent
         * it won't count as data extent, free them directly here.
         * And at reserve time, it's always aligned to page size, so
         * just free one page here.
         */
        btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans);
        return ret;
}
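
/*
 * Illustration of the early bail-out above (editorial addition, assuming a 4K
 * sectorsize and the common max_inline default of 2048): a 1500 byte write at
 * offset 0 of an empty file passes every check and is inlined, while a 3000
 * byte write fails the data_len > fs_info->max_inline test, so the function
 * returns 1 and the caller falls back to a regular COW allocation.
 */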

struct async_extent {
        u64 start;
        u64 ram_size;
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
        int compress_type;
        struct list_head list;
};

struct async_chunk {
        struct inode *inode;
        struct page *locked_page;
        u64 start;
        u64 end;
        unsigned int write_flags;
        struct list_head extents;
        struct cgroup_subsys_state *blkcg_css;
        struct btrfs_work work;
        atomic_t *pending;
};

struct async_cow {
        /* Number of chunks in flight; must be first in the structure */
        atomic_t num_chunks;
        struct async_chunk chunks[];
};

static noinline int add_async_extent(struct async_chunk *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
                                     unsigned long nr_pages,
                                     int compress_type)
{
        struct async_extent *async_extent;

        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
        BUG_ON(!async_extent); /* -ENOMEM */
        async_extent->start = start;
        async_extent->ram_size = ram_size;
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
        async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
}

/*
 * Check if the inode has flags compatible with compression
 */
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
        if (inode->flags & BTRFS_INODE_NODATACOW ||
            inode->flags & BTRFS_INODE_NODATASUM)
                return false;
        return true;
}

/*
 * Check if the inode needs to be submitted to compression, based on mount
 * options, defragmentation, properties or heuristics.
 */
static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
                                      u64 end)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;

        if (!inode_can_compress(inode)) {
                WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
                        KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
                        btrfs_ino(inode));
                return 0;
        }
        /* force compress */
        if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
                return 1;
        /* defrag ioctl */
        if (inode->defrag_compress)
                return 1;
        /* bad compression ratios */
        if (inode->flags & BTRFS_INODE_NOCOMPRESS)
                return 0;
        if (btrfs_test_opt(fs_info, COMPRESS) ||
            inode->flags & BTRFS_INODE_COMPRESS ||
            inode->prop_compress)
                return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
        return 0;
}
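
/*
 * Example of the resulting precedence (editorial addition): with
 * "compress-force" the FORCE_COMPRESS check above returns 1 even for an inode
 * already flagged BTRFS_INODE_NOCOMPRESS; with plain "compress" that flagged
 * inode returns 0, and an unflagged inode is only compressed when
 * btrfs_compress_heuristic() expects the range to shrink.
 */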

static inline void inode_should_defrag(struct btrfs_inode *inode,
                u64 start, u64 end, u64 num_bytes, u64 small_write)
{
        /* If this is a small write inside eof, kick off a defrag */
        if (num_bytes < small_write &&
            (start > 0 || end + 1 < inode->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline int compress_file_range(struct async_chunk *async_chunk)
{
        struct inode *inode = async_chunk->inode;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 blocksize = fs_info->sectorsize;
        u64 start = async_chunk->start;
        u64 end = async_chunk->end;
        u64 actual_end;
        u64 i_size;
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        int i;
        int will_compress;
        int compress_type = fs_info->compress_type;
        int compressed_extents = 0;
        int redirty = 0;

        inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
                        SZ_16K);

        /*
         * We need to save i_size before now because it could change in between
         * us evaluating the size and assigning it.  This is because we lock and
         * unlock the page in truncate and fallocate, and then modify the i_size
         * later on.
         *
         * The barriers are to emulate READ_ONCE, remove that once i_size_read
         * does that for us.
         */
        barrier();
        i_size = i_size_read(inode);
        barrier();
        actual_end = min_t(u64, i_size, end + 1);
again:
        will_compress = 0;
        nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
        BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
        nr_pages = min_t(unsigned long, nr_pages,
                        BTRFS_MAX_COMPRESSED / PAGE_SIZE);

        /*
         * we don't want to send crud past the end of i_size through
         * compression, that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /*
         * skip compression for a small file range(<=blocksize) that
         * isn't an inline extent, since it doesn't save disk space at all.
         */
        if (total_compressed <= blocksize &&
           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                goto cleanup_and_bail_uncompressed;

        total_compressed = min_t(unsigned long, total_compressed,
                        BTRFS_MAX_UNCOMPRESSED);
        total_in = 0;
        ret = 0;

        /*
         * we do compression for mount -o compress and when the
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
        if (inode_need_compress(BTRFS_I(inode), start, end)) {
                WARN_ON(pages);
                pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        nr_pages = 0;
                        goto cont;
                }

                if (BTRFS_I(inode)->defrag_compress)
                        compress_type = BTRFS_I(inode)->defrag_compress;
                else if (BTRFS_I(inode)->prop_compress)
                        compress_type = BTRFS_I(inode)->prop_compress;

                /*
                 * we need to call clear_page_dirty_for_io on each
                 * page in the range.  Otherwise applications with the file
                 * mmap'd can wander in and change the page contents while
                 * we are compressing them.
619 * 620 * If the compression fails for any reason, we set the pages 621 * dirty again later on. 622 * 623 * Note that the remaining part is redirtied, the start pointer 624 * has moved, the end is the original one. 625 */ 626 if (!redirty) { 627 extent_range_clear_dirty_for_io(inode, start, end); 628 redirty = 1; 629 } 630 631 /* Compression level is applied here and only here */ 632 ret = btrfs_compress_pages( 633 compress_type | (fs_info->compress_level << 4), 634 inode->i_mapping, start, 635 pages, 636 &nr_pages, 637 &total_in, 638 &total_compressed); 639 640 if (!ret) { 641 unsigned long offset = offset_in_page(total_compressed); 642 struct page *page = pages[nr_pages - 1]; 643 char *kaddr; 644 645 /* zero the tail end of the last page, we might be 646 * sending it down to disk 647 */ 648 if (offset) { 649 kaddr = kmap_atomic(page); 650 memset(kaddr + offset, 0, 651 PAGE_SIZE - offset); 652 kunmap_atomic(kaddr); 653 } 654 will_compress = 1; 655 } 656 } 657 cont: 658 if (start == 0) { 659 /* lets try to make an inline extent */ 660 if (ret || total_in < actual_end) { 661 /* we didn't compress the entire range, try 662 * to make an uncompressed inline extent. 663 */ 664 ret = cow_file_range_inline(BTRFS_I(inode), start, end, 665 0, BTRFS_COMPRESS_NONE, 666 NULL); 667 } else { 668 /* try making a compressed inline extent */ 669 ret = cow_file_range_inline(BTRFS_I(inode), start, end, 670 total_compressed, 671 compress_type, pages); 672 } 673 if (ret <= 0) { 674 unsigned long clear_flags = EXTENT_DELALLOC | 675 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | 676 EXTENT_DO_ACCOUNTING; 677 unsigned long page_error_op; 678 679 page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; 680 681 /* 682 * inline extent creation worked or returned error, 683 * we don't need to create any more async work items. 684 * Unlock and free up our temp pages. 685 * 686 * We use DO_ACCOUNTING here because we need the 687 * delalloc_release_metadata to be done _after_ we drop 688 * our outstanding extent for clearing delalloc for this 689 * range. 690 */ 691 extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, 692 NULL, 693 clear_flags, 694 PAGE_UNLOCK | 695 PAGE_CLEAR_DIRTY | 696 PAGE_SET_WRITEBACK | 697 page_error_op | 698 PAGE_END_WRITEBACK); 699 700 /* 701 * Ensure we only free the compressed pages if we have 702 * them allocated, as we can still reach here with 703 * inode_need_compress() == false. 704 */ 705 if (pages) { 706 for (i = 0; i < nr_pages; i++) { 707 WARN_ON(pages[i]->mapping); 708 put_page(pages[i]); 709 } 710 kfree(pages); 711 } 712 return 0; 713 } 714 } 715 716 if (will_compress) { 717 /* 718 * we aren't doing an inline extent round the compressed size 719 * up to a block size boundary so the allocator does sane 720 * things 721 */ 722 total_compressed = ALIGN(total_compressed, blocksize); 723 724 /* 725 * one last check to make sure the compression is really a 726 * win, compare the page count read with the blocks on disk, 727 * compression must free at least one sector size 728 */ 729 total_in = ALIGN(total_in, PAGE_SIZE); 730 if (total_compressed + blocksize <= total_in) { 731 compressed_extents++; 732 733 /* 734 * The async work queues will take care of doing actual 735 * allocation on disk for these compressed pages, and 736 * will submit them to the elevator. 
737 */ 738 add_async_extent(async_chunk, start, total_in, 739 total_compressed, pages, nr_pages, 740 compress_type); 741 742 if (start + total_in < end) { 743 start += total_in; 744 pages = NULL; 745 cond_resched(); 746 goto again; 747 } 748 return compressed_extents; 749 } 750 } 751 if (pages) { 752 /* 753 * the compression code ran but failed to make things smaller, 754 * free any pages it allocated and our page pointer array 755 */ 756 for (i = 0; i < nr_pages; i++) { 757 WARN_ON(pages[i]->mapping); 758 put_page(pages[i]); 759 } 760 kfree(pages); 761 pages = NULL; 762 total_compressed = 0; 763 nr_pages = 0; 764 765 /* flag the file so we don't compress in the future */ 766 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && 767 !(BTRFS_I(inode)->prop_compress)) { 768 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 769 } 770 } 771 cleanup_and_bail_uncompressed: 772 /* 773 * No compression, but we still need to write the pages in the file 774 * we've been given so far. redirty the locked page if it corresponds 775 * to our extent and set things up for the async work queue to run 776 * cow_file_range to do the normal delalloc dance. 777 */ 778 if (async_chunk->locked_page && 779 (page_offset(async_chunk->locked_page) >= start && 780 page_offset(async_chunk->locked_page)) <= end) { 781 __set_page_dirty_nobuffers(async_chunk->locked_page); 782 /* unlocked later on in the async handlers */ 783 } 784 785 if (redirty) 786 extent_range_redirty_for_io(inode, start, end); 787 add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, 788 BTRFS_COMPRESS_NONE); 789 compressed_extents++; 790 791 return compressed_extents; 792 } 793 794 static void free_async_extent_pages(struct async_extent *async_extent) 795 { 796 int i; 797 798 if (!async_extent->pages) 799 return; 800 801 for (i = 0; i < async_extent->nr_pages; i++) { 802 WARN_ON(async_extent->pages[i]->mapping); 803 put_page(async_extent->pages[i]); 804 } 805 kfree(async_extent->pages); 806 async_extent->nr_pages = 0; 807 async_extent->pages = NULL; 808 } 809 810 /* 811 * phase two of compressed writeback. This is the ordered portion 812 * of the code, which only gets called in the order the work was 813 * queued. We walk all the async extents created by compress_file_range 814 * and send them down to the disk. 815 */ 816 static noinline void submit_compressed_extents(struct async_chunk *async_chunk) 817 { 818 struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); 819 struct btrfs_fs_info *fs_info = inode->root->fs_info; 820 struct async_extent *async_extent; 821 u64 alloc_hint = 0; 822 struct btrfs_key ins; 823 struct extent_map *em; 824 struct btrfs_root *root = inode->root; 825 struct extent_io_tree *io_tree = &inode->io_tree; 826 int ret = 0; 827 828 again: 829 while (!list_empty(&async_chunk->extents)) { 830 async_extent = list_entry(async_chunk->extents.next, 831 struct async_extent, list); 832 list_del(&async_extent->list); 833 834 retry: 835 lock_extent(io_tree, async_extent->start, 836 async_extent->start + async_extent->ram_size - 1); 837 /* did the compression code fall back to uncompressed IO? 
*/ 838 if (!async_extent->pages) { 839 int page_started = 0; 840 unsigned long nr_written = 0; 841 842 /* allocate blocks */ 843 ret = cow_file_range(inode, async_chunk->locked_page, 844 async_extent->start, 845 async_extent->start + 846 async_extent->ram_size - 1, 847 &page_started, &nr_written, 0); 848 849 /* JDM XXX */ 850 851 /* 852 * if page_started, cow_file_range inserted an 853 * inline extent and took care of all the unlocking 854 * and IO for us. Otherwise, we need to submit 855 * all those pages down to the drive. 856 */ 857 if (!page_started && !ret) 858 extent_write_locked_range(&inode->vfs_inode, 859 async_extent->start, 860 async_extent->start + 861 async_extent->ram_size - 1, 862 WB_SYNC_ALL); 863 else if (ret && async_chunk->locked_page) 864 unlock_page(async_chunk->locked_page); 865 kfree(async_extent); 866 cond_resched(); 867 continue; 868 } 869 870 ret = btrfs_reserve_extent(root, async_extent->ram_size, 871 async_extent->compressed_size, 872 async_extent->compressed_size, 873 0, alloc_hint, &ins, 1, 1); 874 if (ret) { 875 free_async_extent_pages(async_extent); 876 877 if (ret == -ENOSPC) { 878 unlock_extent(io_tree, async_extent->start, 879 async_extent->start + 880 async_extent->ram_size - 1); 881 882 /* 883 * we need to redirty the pages if we decide to 884 * fallback to uncompressed IO, otherwise we 885 * will not submit these pages down to lower 886 * layers. 887 */ 888 extent_range_redirty_for_io(&inode->vfs_inode, 889 async_extent->start, 890 async_extent->start + 891 async_extent->ram_size - 1); 892 893 goto retry; 894 } 895 goto out_free; 896 } 897 /* 898 * here we're doing allocation and writeback of the 899 * compressed pages 900 */ 901 em = create_io_em(inode, async_extent->start, 902 async_extent->ram_size, /* len */ 903 async_extent->start, /* orig_start */ 904 ins.objectid, /* block_start */ 905 ins.offset, /* block_len */ 906 ins.offset, /* orig_block_len */ 907 async_extent->ram_size, /* ram_bytes */ 908 async_extent->compress_type, 909 BTRFS_ORDERED_COMPRESSED); 910 if (IS_ERR(em)) 911 /* ret value is not necessary due to void function */ 912 goto out_free_reserve; 913 free_extent_map(em); 914 915 ret = btrfs_add_ordered_extent_compress(inode, 916 async_extent->start, 917 ins.objectid, 918 async_extent->ram_size, 919 ins.offset, 920 BTRFS_ORDERED_COMPRESSED, 921 async_extent->compress_type); 922 if (ret) { 923 btrfs_drop_extent_cache(inode, async_extent->start, 924 async_extent->start + 925 async_extent->ram_size - 1, 0); 926 goto out_free_reserve; 927 } 928 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 929 930 /* 931 * clear dirty, set writeback and unlock the pages. 
932 */ 933 extent_clear_unlock_delalloc(inode, async_extent->start, 934 async_extent->start + 935 async_extent->ram_size - 1, 936 NULL, EXTENT_LOCKED | EXTENT_DELALLOC, 937 PAGE_UNLOCK | PAGE_CLEAR_DIRTY | 938 PAGE_SET_WRITEBACK); 939 if (btrfs_submit_compressed_write(inode, async_extent->start, 940 async_extent->ram_size, 941 ins.objectid, 942 ins.offset, async_extent->pages, 943 async_extent->nr_pages, 944 async_chunk->write_flags, 945 async_chunk->blkcg_css)) { 946 struct page *p = async_extent->pages[0]; 947 const u64 start = async_extent->start; 948 const u64 end = start + async_extent->ram_size - 1; 949 950 p->mapping = inode->vfs_inode.i_mapping; 951 btrfs_writepage_endio_finish_ordered(p, start, end, 0); 952 953 p->mapping = NULL; 954 extent_clear_unlock_delalloc(inode, start, end, NULL, 0, 955 PAGE_END_WRITEBACK | 956 PAGE_SET_ERROR); 957 free_async_extent_pages(async_extent); 958 } 959 alloc_hint = ins.objectid + ins.offset; 960 kfree(async_extent); 961 cond_resched(); 962 } 963 return; 964 out_free_reserve: 965 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 966 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); 967 out_free: 968 extent_clear_unlock_delalloc(inode, async_extent->start, 969 async_extent->start + 970 async_extent->ram_size - 1, 971 NULL, EXTENT_LOCKED | EXTENT_DELALLOC | 972 EXTENT_DELALLOC_NEW | 973 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, 974 PAGE_UNLOCK | PAGE_CLEAR_DIRTY | 975 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | 976 PAGE_SET_ERROR); 977 free_async_extent_pages(async_extent); 978 kfree(async_extent); 979 goto again; 980 } 981 982 static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, 983 u64 num_bytes) 984 { 985 struct extent_map_tree *em_tree = &inode->extent_tree; 986 struct extent_map *em; 987 u64 alloc_hint = 0; 988 989 read_lock(&em_tree->lock); 990 em = search_extent_mapping(em_tree, start, num_bytes); 991 if (em) { 992 /* 993 * if block start isn't an actual block number then find the 994 * first block in this inode and use that as a hint. If that 995 * block is also bogus then just don't worry about it. 996 */ 997 if (em->block_start >= EXTENT_MAP_LAST_BYTE) { 998 free_extent_map(em); 999 em = search_extent_mapping(em_tree, 0, 0); 1000 if (em && em->block_start < EXTENT_MAP_LAST_BYTE) 1001 alloc_hint = em->block_start; 1002 if (em) 1003 free_extent_map(em); 1004 } else { 1005 alloc_hint = em->block_start; 1006 free_extent_map(em); 1007 } 1008 } 1009 read_unlock(&em_tree->lock); 1010 1011 return alloc_hint; 1012 } 1013 1014 /* 1015 * when extent_io.c finds a delayed allocation range in the file, 1016 * the call backs end up in this code. The basic idea is to 1017 * allocate extents on disk for the range, and create ordered data structs 1018 * in ram to track those extents. 1019 * 1020 * locked_page is the page that writepage had locked already. We use 1021 * it to make sure we don't do extra locks or unlocks. 1022 * 1023 * *page_started is set to one if we unlock locked_page and do everything 1024 * required to start IO on it. It may be clean and already done with 1025 * IO when we return. 
1026 */ 1027 static noinline int cow_file_range(struct btrfs_inode *inode, 1028 struct page *locked_page, 1029 u64 start, u64 end, int *page_started, 1030 unsigned long *nr_written, int unlock) 1031 { 1032 struct btrfs_root *root = inode->root; 1033 struct btrfs_fs_info *fs_info = root->fs_info; 1034 u64 alloc_hint = 0; 1035 u64 num_bytes; 1036 unsigned long ram_size; 1037 u64 cur_alloc_size = 0; 1038 u64 min_alloc_size; 1039 u64 blocksize = fs_info->sectorsize; 1040 struct btrfs_key ins; 1041 struct extent_map *em; 1042 unsigned clear_bits; 1043 unsigned long page_ops; 1044 bool extent_reserved = false; 1045 int ret = 0; 1046 1047 if (btrfs_is_free_space_inode(inode)) { 1048 WARN_ON_ONCE(1); 1049 ret = -EINVAL; 1050 goto out_unlock; 1051 } 1052 1053 num_bytes = ALIGN(end - start + 1, blocksize); 1054 num_bytes = max(blocksize, num_bytes); 1055 ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); 1056 1057 inode_should_defrag(inode, start, end, num_bytes, SZ_64K); 1058 1059 if (start == 0) { 1060 /* lets try to make an inline extent */ 1061 ret = cow_file_range_inline(inode, start, end, 0, 1062 BTRFS_COMPRESS_NONE, NULL); 1063 if (ret == 0) { 1064 /* 1065 * We use DO_ACCOUNTING here because we need the 1066 * delalloc_release_metadata to be run _after_ we drop 1067 * our outstanding extent for clearing delalloc for this 1068 * range. 1069 */ 1070 extent_clear_unlock_delalloc(inode, start, end, NULL, 1071 EXTENT_LOCKED | EXTENT_DELALLOC | 1072 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | 1073 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | 1074 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | 1075 PAGE_END_WRITEBACK); 1076 *nr_written = *nr_written + 1077 (end - start + PAGE_SIZE) / PAGE_SIZE; 1078 *page_started = 1; 1079 goto out; 1080 } else if (ret < 0) { 1081 goto out_unlock; 1082 } 1083 } 1084 1085 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 1086 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 1087 1088 /* 1089 * Relocation relies on the relocated extents to have exactly the same 1090 * size as the original extents. Normally writeback for relocation data 1091 * extents follows a NOCOW path because relocation preallocates the 1092 * extents. However, due to an operation such as scrub turning a block 1093 * group to RO mode, it may fallback to COW mode, so we must make sure 1094 * an extent allocated during COW has exactly the requested size and can 1095 * not be split into smaller extents, otherwise relocation breaks and 1096 * fails during the stage where it updates the bytenr of file extent 1097 * items. 
1098 */ 1099 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 1100 min_alloc_size = num_bytes; 1101 else 1102 min_alloc_size = fs_info->sectorsize; 1103 1104 while (num_bytes > 0) { 1105 cur_alloc_size = num_bytes; 1106 ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, 1107 min_alloc_size, 0, alloc_hint, 1108 &ins, 1, 1); 1109 if (ret < 0) 1110 goto out_unlock; 1111 cur_alloc_size = ins.offset; 1112 extent_reserved = true; 1113 1114 ram_size = ins.offset; 1115 em = create_io_em(inode, start, ins.offset, /* len */ 1116 start, /* orig_start */ 1117 ins.objectid, /* block_start */ 1118 ins.offset, /* block_len */ 1119 ins.offset, /* orig_block_len */ 1120 ram_size, /* ram_bytes */ 1121 BTRFS_COMPRESS_NONE, /* compress_type */ 1122 BTRFS_ORDERED_REGULAR /* type */); 1123 if (IS_ERR(em)) { 1124 ret = PTR_ERR(em); 1125 goto out_reserve; 1126 } 1127 free_extent_map(em); 1128 1129 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 1130 ram_size, cur_alloc_size, 0); 1131 if (ret) 1132 goto out_drop_extent_cache; 1133 1134 if (root->root_key.objectid == 1135 BTRFS_DATA_RELOC_TREE_OBJECTID) { 1136 ret = btrfs_reloc_clone_csums(inode, start, 1137 cur_alloc_size); 1138 /* 1139 * Only drop cache here, and process as normal. 1140 * 1141 * We must not allow extent_clear_unlock_delalloc() 1142 * at out_unlock label to free meta of this ordered 1143 * extent, as its meta should be freed by 1144 * btrfs_finish_ordered_io(). 1145 * 1146 * So we must continue until @start is increased to 1147 * skip current ordered extent. 1148 */ 1149 if (ret) 1150 btrfs_drop_extent_cache(inode, start, 1151 start + ram_size - 1, 0); 1152 } 1153 1154 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1155 1156 /* we're not doing compressed IO, don't unlock the first 1157 * page (which the caller expects to stay locked), don't 1158 * clear any dirty bits and don't set any writeback bits 1159 * 1160 * Do set the Private2 bit so we know this page was properly 1161 * setup for writepage 1162 */ 1163 page_ops = unlock ? PAGE_UNLOCK : 0; 1164 page_ops |= PAGE_SET_PRIVATE2; 1165 1166 extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, 1167 locked_page, 1168 EXTENT_LOCKED | EXTENT_DELALLOC, 1169 page_ops); 1170 if (num_bytes < cur_alloc_size) 1171 num_bytes = 0; 1172 else 1173 num_bytes -= cur_alloc_size; 1174 alloc_hint = ins.objectid + ins.offset; 1175 start += cur_alloc_size; 1176 extent_reserved = false; 1177 1178 /* 1179 * btrfs_reloc_clone_csums() error, since start is increased 1180 * extent_clear_unlock_delalloc() at out_unlock label won't 1181 * free metadata of current ordered extent, we're OK to exit. 
1182 */ 1183 if (ret) 1184 goto out_unlock; 1185 } 1186 out: 1187 return ret; 1188 1189 out_drop_extent_cache: 1190 btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); 1191 out_reserve: 1192 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1193 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); 1194 out_unlock: 1195 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | 1196 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; 1197 page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | 1198 PAGE_END_WRITEBACK; 1199 /* 1200 * If we reserved an extent for our delalloc range (or a subrange) and 1201 * failed to create the respective ordered extent, then it means that 1202 * when we reserved the extent we decremented the extent's size from 1203 * the data space_info's bytes_may_use counter and incremented the 1204 * space_info's bytes_reserved counter by the same amount. We must make 1205 * sure extent_clear_unlock_delalloc() does not try to decrement again 1206 * the data space_info's bytes_may_use counter, therefore we do not pass 1207 * it the flag EXTENT_CLEAR_DATA_RESV. 1208 */ 1209 if (extent_reserved) { 1210 extent_clear_unlock_delalloc(inode, start, 1211 start + cur_alloc_size - 1, 1212 locked_page, 1213 clear_bits, 1214 page_ops); 1215 start += cur_alloc_size; 1216 if (start >= end) 1217 goto out; 1218 } 1219 extent_clear_unlock_delalloc(inode, start, end, locked_page, 1220 clear_bits | EXTENT_CLEAR_DATA_RESV, 1221 page_ops); 1222 goto out; 1223 } 1224 1225 /* 1226 * work queue call back to started compression on a file and pages 1227 */ 1228 static noinline void async_cow_start(struct btrfs_work *work) 1229 { 1230 struct async_chunk *async_chunk; 1231 int compressed_extents; 1232 1233 async_chunk = container_of(work, struct async_chunk, work); 1234 1235 compressed_extents = compress_file_range(async_chunk); 1236 if (compressed_extents == 0) { 1237 btrfs_add_delayed_iput(async_chunk->inode); 1238 async_chunk->inode = NULL; 1239 } 1240 } 1241 1242 /* 1243 * work queue call back to submit previously compressed pages 1244 */ 1245 static noinline void async_cow_submit(struct btrfs_work *work) 1246 { 1247 struct async_chunk *async_chunk = container_of(work, struct async_chunk, 1248 work); 1249 struct btrfs_fs_info *fs_info = btrfs_work_owner(work); 1250 unsigned long nr_pages; 1251 1252 nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> 1253 PAGE_SHIFT; 1254 1255 /* atomic_sub_return implies a barrier */ 1256 if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < 1257 5 * SZ_1M) 1258 cond_wake_up_nomb(&fs_info->async_submit_wait); 1259 1260 /* 1261 * ->inode could be NULL if async_chunk_start has failed to compress, 1262 * in which case we don't have anything to submit, yet we need to 1263 * always adjust ->async_delalloc_pages as its paired with the init 1264 * happening in cow_file_range_async 1265 */ 1266 if (async_chunk->inode) 1267 submit_compressed_extents(async_chunk); 1268 } 1269 1270 static noinline void async_cow_free(struct btrfs_work *work) 1271 { 1272 struct async_chunk *async_chunk; 1273 1274 async_chunk = container_of(work, struct async_chunk, work); 1275 if (async_chunk->inode) 1276 btrfs_add_delayed_iput(async_chunk->inode); 1277 if (async_chunk->blkcg_css) 1278 css_put(async_chunk->blkcg_css); 1279 /* 1280 * Since the pointer to 'pending' is at the beginning of the array of 1281 * async_chunk's, freeing it ensures the whole array has been freed. 
1282 */ 1283 if (atomic_dec_and_test(async_chunk->pending)) 1284 kvfree(async_chunk->pending); 1285 } 1286 1287 static int cow_file_range_async(struct btrfs_inode *inode, 1288 struct writeback_control *wbc, 1289 struct page *locked_page, 1290 u64 start, u64 end, int *page_started, 1291 unsigned long *nr_written) 1292 { 1293 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1294 struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); 1295 struct async_cow *ctx; 1296 struct async_chunk *async_chunk; 1297 unsigned long nr_pages; 1298 u64 cur_end; 1299 u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); 1300 int i; 1301 bool should_compress; 1302 unsigned nofs_flag; 1303 const unsigned int write_flags = wbc_to_write_flags(wbc); 1304 1305 unlock_extent(&inode->io_tree, start, end); 1306 1307 if (inode->flags & BTRFS_INODE_NOCOMPRESS && 1308 !btrfs_test_opt(fs_info, FORCE_COMPRESS)) { 1309 num_chunks = 1; 1310 should_compress = false; 1311 } else { 1312 should_compress = true; 1313 } 1314 1315 nofs_flag = memalloc_nofs_save(); 1316 ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL); 1317 memalloc_nofs_restore(nofs_flag); 1318 1319 if (!ctx) { 1320 unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | 1321 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | 1322 EXTENT_DO_ACCOUNTING; 1323 unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | 1324 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | 1325 PAGE_SET_ERROR; 1326 1327 extent_clear_unlock_delalloc(inode, start, end, locked_page, 1328 clear_bits, page_ops); 1329 return -ENOMEM; 1330 } 1331 1332 async_chunk = ctx->chunks; 1333 atomic_set(&ctx->num_chunks, num_chunks); 1334 1335 for (i = 0; i < num_chunks; i++) { 1336 if (should_compress) 1337 cur_end = min(end, start + SZ_512K - 1); 1338 else 1339 cur_end = end; 1340 1341 /* 1342 * igrab is called higher up in the call chain, take only the 1343 * lightweight reference for the callback lifetime 1344 */ 1345 ihold(&inode->vfs_inode); 1346 async_chunk[i].pending = &ctx->num_chunks; 1347 async_chunk[i].inode = &inode->vfs_inode; 1348 async_chunk[i].start = start; 1349 async_chunk[i].end = cur_end; 1350 async_chunk[i].write_flags = write_flags; 1351 INIT_LIST_HEAD(&async_chunk[i].extents); 1352 1353 /* 1354 * The locked_page comes all the way from writepage and its 1355 * the original page we were actually given. As we spread 1356 * this large delalloc region across multiple async_chunk 1357 * structs, only the first struct needs a pointer to locked_page 1358 * 1359 * This way we don't need racey decisions about who is supposed 1360 * to unlock it. 1361 */ 1362 if (locked_page) { 1363 /* 1364 * Depending on the compressibility, the pages might or 1365 * might not go through async. We want all of them to 1366 * be accounted against wbc once. Let's do it here 1367 * before the paths diverge. wbc accounting is used 1368 * only for foreign writeback detection and doesn't 1369 * need full accuracy. Just account the whole thing 1370 * against the first page. 
1371 */ 1372 wbc_account_cgroup_owner(wbc, locked_page, 1373 cur_end - start); 1374 async_chunk[i].locked_page = locked_page; 1375 locked_page = NULL; 1376 } else { 1377 async_chunk[i].locked_page = NULL; 1378 } 1379 1380 if (blkcg_css != blkcg_root_css) { 1381 css_get(blkcg_css); 1382 async_chunk[i].blkcg_css = blkcg_css; 1383 } else { 1384 async_chunk[i].blkcg_css = NULL; 1385 } 1386 1387 btrfs_init_work(&async_chunk[i].work, async_cow_start, 1388 async_cow_submit, async_cow_free); 1389 1390 nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); 1391 atomic_add(nr_pages, &fs_info->async_delalloc_pages); 1392 1393 btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); 1394 1395 *nr_written += nr_pages; 1396 start = cur_end + 1; 1397 } 1398 *page_started = 1; 1399 return 0; 1400 } 1401 1402 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, 1403 u64 bytenr, u64 num_bytes) 1404 { 1405 int ret; 1406 struct btrfs_ordered_sum *sums; 1407 LIST_HEAD(list); 1408 1409 ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr, 1410 bytenr + num_bytes - 1, &list, 0); 1411 if (ret == 0 && list_empty(&list)) 1412 return 0; 1413 1414 while (!list_empty(&list)) { 1415 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 1416 list_del(&sums->list); 1417 kfree(sums); 1418 } 1419 if (ret < 0) 1420 return ret; 1421 return 1; 1422 } 1423 1424 static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, 1425 const u64 start, const u64 end, 1426 int *page_started, unsigned long *nr_written) 1427 { 1428 const bool is_space_ino = btrfs_is_free_space_inode(inode); 1429 const bool is_reloc_ino = (inode->root->root_key.objectid == 1430 BTRFS_DATA_RELOC_TREE_OBJECTID); 1431 const u64 range_bytes = end + 1 - start; 1432 struct extent_io_tree *io_tree = &inode->io_tree; 1433 u64 range_start = start; 1434 u64 count; 1435 1436 /* 1437 * If EXTENT_NORESERVE is set it means that when the buffered write was 1438 * made we had not enough available data space and therefore we did not 1439 * reserve data space for it, since we though we could do NOCOW for the 1440 * respective file range (either there is prealloc extent or the inode 1441 * has the NOCOW bit set). 1442 * 1443 * However when we need to fallback to COW mode (because for example the 1444 * block group for the corresponding extent was turned to RO mode by a 1445 * scrub or relocation) we need to do the following: 1446 * 1447 * 1) We increment the bytes_may_use counter of the data space info. 1448 * If COW succeeds, it allocates a new data extent and after doing 1449 * that it decrements the space info's bytes_may_use counter and 1450 * increments its bytes_reserved counter by the same amount (we do 1451 * this at btrfs_add_reserved_bytes()). So we need to increment the 1452 * bytes_may_use counter to compensate (when space is reserved at 1453 * buffered write time, the bytes_may_use counter is incremented); 1454 * 1455 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so 1456 * that if the COW path fails for any reason, it decrements (through 1457 * extent_clear_unlock_delalloc()) the bytes_may_use counter of the 1458 * data space info, which we incremented in the step above. 1459 * 1460 * If we need to fallback to cow and the inode corresponds to a free 1461 * space cache inode or an inode of the data relocation tree, we must 1462 * also increment bytes_may_use of the data space_info for the same 1463 * reason. 
         * Space caches and relocated data extents always get a prealloc
         * extent for them, however scrub or balance may have set the block
         * group that contains that extent to RO mode and therefore force COW
         * when starting writeback.
         */
        count = count_range_bits(io_tree, &range_start, end, range_bytes,
                                 EXTENT_NORESERVE, 0);
        if (count > 0 || is_space_ino || is_reloc_ino) {
                u64 bytes = count;
                struct btrfs_fs_info *fs_info = inode->root->fs_info;
                struct btrfs_space_info *sinfo = fs_info->data_sinfo;

                if (is_space_ino || is_reloc_ino)
                        bytes = range_bytes;

                spin_lock(&sinfo->lock);
                btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
                spin_unlock(&sinfo->lock);

                if (count > 0)
                        clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
                                         0, 0, NULL);
        }

        return cow_file_range(inode, locked_page, start, end, page_started,
                              nr_written, 1);
}

/*
 * Called for the nocow writeback path.  This checks for snapshots or COW
 * copies of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
                                       struct page *locked_page,
                                       const u64 start, const u64 end,
                                       int *page_started, int force,
                                       unsigned long *nr_written)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_root *root = inode->root;
        struct btrfs_path *path;
        u64 cow_start = (u64)-1;
        u64 cur_offset = start;
        int ret;
        bool check_prev = true;
        const bool freespace_inode = btrfs_is_free_space_inode(inode);
        u64 ino = btrfs_ino(inode);
        bool nocow = false;
        u64 disk_bytenr = 0;

        path = btrfs_alloc_path();
        if (!path) {
                extent_clear_unlock_delalloc(inode, start, end, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC |
                                             EXTENT_DO_ACCOUNTING |
                                             EXTENT_DEFRAG, PAGE_UNLOCK |
                                             PAGE_CLEAR_DIRTY |
                                             PAGE_SET_WRITEBACK |
                                             PAGE_END_WRITEBACK);
                return -ENOMEM;
        }

        while (1) {
                struct btrfs_key found_key;
                struct btrfs_file_extent_item *fi;
                struct extent_buffer *leaf;
                u64 extent_end;
                u64 extent_offset;
                u64 num_bytes = 0;
                u64 disk_num_bytes;
                u64 ram_bytes;
                int extent_type;

                nocow = false;

                ret = btrfs_lookup_file_extent(NULL, root, path, ino,
                                               cur_offset, 0);
                if (ret < 0)
                        goto error;

                /*
                 * If there is no extent for our range when doing the initial
                 * search, then go back to the previous slot as it will be the
                 * one containing the search offset
                 */
                if (ret > 0 && path->slots[0] > 0 && check_prev) {
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0] - 1);
                        if (found_key.objectid == ino &&
                            found_key.type == BTRFS_EXTENT_DATA_KEY)
                                path->slots[0]--;
                }
                check_prev = false;
next_slot:
                /* Go to next leaf if we have exhausted the current one */
                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0) {
                                if (cow_start != (u64)-1)
                                        cur_offset = cow_start;
                                goto error;
                        }
                        if (ret > 0)
                                break;
                        leaf = path->nodes[0];
                }

                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

                /* Didn't find anything for our
INO */ 1578 if (found_key.objectid > ino) 1579 break; 1580 /* 1581 * Keep searching until we find an EXTENT_ITEM or there are no 1582 * more extents for this inode 1583 */ 1584 if (WARN_ON_ONCE(found_key.objectid < ino) || 1585 found_key.type < BTRFS_EXTENT_DATA_KEY) { 1586 path->slots[0]++; 1587 goto next_slot; 1588 } 1589 1590 /* Found key is not EXTENT_DATA_KEY or starts after req range */ 1591 if (found_key.type > BTRFS_EXTENT_DATA_KEY || 1592 found_key.offset > end) 1593 break; 1594 1595 /* 1596 * If the found extent starts after requested offset, then 1597 * adjust extent_end to be right before this extent begins 1598 */ 1599 if (found_key.offset > cur_offset) { 1600 extent_end = found_key.offset; 1601 extent_type = 0; 1602 goto out_check; 1603 } 1604 1605 /* 1606 * Found extent which begins before our range and potentially 1607 * intersect it 1608 */ 1609 fi = btrfs_item_ptr(leaf, path->slots[0], 1610 struct btrfs_file_extent_item); 1611 extent_type = btrfs_file_extent_type(leaf, fi); 1612 1613 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 1614 if (extent_type == BTRFS_FILE_EXTENT_REG || 1615 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1616 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1617 extent_offset = btrfs_file_extent_offset(leaf, fi); 1618 extent_end = found_key.offset + 1619 btrfs_file_extent_num_bytes(leaf, fi); 1620 disk_num_bytes = 1621 btrfs_file_extent_disk_num_bytes(leaf, fi); 1622 /* 1623 * If the extent we got ends before our current offset, 1624 * skip to the next extent. 1625 */ 1626 if (extent_end <= cur_offset) { 1627 path->slots[0]++; 1628 goto next_slot; 1629 } 1630 /* Skip holes */ 1631 if (disk_bytenr == 0) 1632 goto out_check; 1633 /* Skip compressed/encrypted/encoded extents */ 1634 if (btrfs_file_extent_compression(leaf, fi) || 1635 btrfs_file_extent_encryption(leaf, fi) || 1636 btrfs_file_extent_other_encoding(leaf, fi)) 1637 goto out_check; 1638 /* 1639 * If extent is created before the last volume's snapshot 1640 * this implies the extent is shared, hence we can't do 1641 * nocow. This is the same check as in 1642 * btrfs_cross_ref_exist but without calling 1643 * btrfs_search_slot. 1644 */ 1645 if (!freespace_inode && 1646 btrfs_file_extent_generation(leaf, fi) <= 1647 btrfs_root_last_snapshot(&root->root_item)) 1648 goto out_check; 1649 if (extent_type == BTRFS_FILE_EXTENT_REG && !force) 1650 goto out_check; 1651 1652 /* 1653 * The following checks can be expensive, as they need to 1654 * take other locks and do btree or rbtree searches, so 1655 * release the path to avoid blocking other tasks for too 1656 * long. 1657 */ 1658 btrfs_release_path(path); 1659 1660 /* If extent is RO, we must COW it */ 1661 if (btrfs_extent_readonly(fs_info, disk_bytenr)) 1662 goto out_check; 1663 ret = btrfs_cross_ref_exist(root, ino, 1664 found_key.offset - 1665 extent_offset, disk_bytenr, false); 1666 if (ret) { 1667 /* 1668 * ret could be -EIO if the above fails to read 1669 * metadata. 1670 */ 1671 if (ret < 0) { 1672 if (cow_start != (u64)-1) 1673 cur_offset = cow_start; 1674 goto error; 1675 } 1676 1677 WARN_ON_ONCE(freespace_inode); 1678 goto out_check; 1679 } 1680 disk_bytenr += extent_offset; 1681 disk_bytenr += cur_offset - found_key.offset; 1682 num_bytes = min(end + 1, extent_end) - cur_offset; 1683 /* 1684 * If there are pending snapshots for this root, we 1685 * fall into common COW way 1686 */ 1687 if (!freespace_inode && atomic_read(&root->snapshot_force_cow)) 1688 goto out_check; 1689 /* 1690 * force cow if csum exists in the range. 
1691 * this ensure that csum for a given extent are 1692 * either valid or do not exist. 1693 */ 1694 ret = csum_exist_in_range(fs_info, disk_bytenr, 1695 num_bytes); 1696 if (ret) { 1697 /* 1698 * ret could be -EIO if the above fails to read 1699 * metadata. 1700 */ 1701 if (ret < 0) { 1702 if (cow_start != (u64)-1) 1703 cur_offset = cow_start; 1704 goto error; 1705 } 1706 WARN_ON_ONCE(freespace_inode); 1707 goto out_check; 1708 } 1709 if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) 1710 goto out_check; 1711 nocow = true; 1712 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 1713 extent_end = found_key.offset + ram_bytes; 1714 extent_end = ALIGN(extent_end, fs_info->sectorsize); 1715 /* Skip extents outside of our requested range */ 1716 if (extent_end <= start) { 1717 path->slots[0]++; 1718 goto next_slot; 1719 } 1720 } else { 1721 /* If this triggers then we have a memory corruption */ 1722 BUG(); 1723 } 1724 out_check: 1725 /* 1726 * If nocow is false then record the beginning of the range 1727 * that needs to be COWed 1728 */ 1729 if (!nocow) { 1730 if (cow_start == (u64)-1) 1731 cow_start = cur_offset; 1732 cur_offset = extent_end; 1733 if (cur_offset > end) 1734 break; 1735 if (!path->nodes[0]) 1736 continue; 1737 path->slots[0]++; 1738 goto next_slot; 1739 } 1740 1741 /* 1742 * COW range from cow_start to found_key.offset - 1. As the key 1743 * will contain the beginning of the first extent that can be 1744 * NOCOW, following one which needs to be COW'ed 1745 */ 1746 if (cow_start != (u64)-1) { 1747 ret = fallback_to_cow(inode, locked_page, 1748 cow_start, found_key.offset - 1, 1749 page_started, nr_written); 1750 if (ret) 1751 goto error; 1752 cow_start = (u64)-1; 1753 } 1754 1755 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1756 u64 orig_start = found_key.offset - extent_offset; 1757 struct extent_map *em; 1758 1759 em = create_io_em(inode, cur_offset, num_bytes, 1760 orig_start, 1761 disk_bytenr, /* block_start */ 1762 num_bytes, /* block_len */ 1763 disk_num_bytes, /* orig_block_len */ 1764 ram_bytes, BTRFS_COMPRESS_NONE, 1765 BTRFS_ORDERED_PREALLOC); 1766 if (IS_ERR(em)) { 1767 ret = PTR_ERR(em); 1768 goto error; 1769 } 1770 free_extent_map(em); 1771 ret = btrfs_add_ordered_extent(inode, cur_offset, 1772 disk_bytenr, num_bytes, 1773 num_bytes, 1774 BTRFS_ORDERED_PREALLOC); 1775 if (ret) { 1776 btrfs_drop_extent_cache(inode, cur_offset, 1777 cur_offset + num_bytes - 1, 1778 0); 1779 goto error; 1780 } 1781 } else { 1782 ret = btrfs_add_ordered_extent(inode, cur_offset, 1783 disk_bytenr, num_bytes, 1784 num_bytes, 1785 BTRFS_ORDERED_NOCOW); 1786 if (ret) 1787 goto error; 1788 } 1789 1790 if (nocow) 1791 btrfs_dec_nocow_writers(fs_info, disk_bytenr); 1792 nocow = false; 1793 1794 if (root->root_key.objectid == 1795 BTRFS_DATA_RELOC_TREE_OBJECTID) 1796 /* 1797 * Error handled later, as we must prevent 1798 * extent_clear_unlock_delalloc() in error handler 1799 * from freeing metadata of created ordered extent. 1800 */ 1801 ret = btrfs_reloc_clone_csums(inode, cur_offset, 1802 num_bytes); 1803 1804 extent_clear_unlock_delalloc(inode, cur_offset, 1805 cur_offset + num_bytes - 1, 1806 locked_page, EXTENT_LOCKED | 1807 EXTENT_DELALLOC | 1808 EXTENT_CLEAR_DATA_RESV, 1809 PAGE_UNLOCK | PAGE_SET_PRIVATE2); 1810 1811 cur_offset = extent_end; 1812 1813 /* 1814 * btrfs_reloc_clone_csums() error, now we're OK to call error 1815 * handler, as metadata for created ordered extent will only 1816 * be freed by btrfs_finish_ordered_io(). 
1817 */ 1818 if (ret) 1819 goto error; 1820 if (cur_offset > end) 1821 break; 1822 } 1823 btrfs_release_path(path); 1824 1825 if (cur_offset <= end && cow_start == (u64)-1) 1826 cow_start = cur_offset; 1827 1828 if (cow_start != (u64)-1) { 1829 cur_offset = end; 1830 ret = fallback_to_cow(inode, locked_page, cow_start, end, 1831 page_started, nr_written); 1832 if (ret) 1833 goto error; 1834 } 1835 1836 error: 1837 if (nocow) 1838 btrfs_dec_nocow_writers(fs_info, disk_bytenr); 1839 1840 if (ret && cur_offset < end) 1841 extent_clear_unlock_delalloc(inode, cur_offset, end, 1842 locked_page, EXTENT_LOCKED | 1843 EXTENT_DELALLOC | EXTENT_DEFRAG | 1844 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | 1845 PAGE_CLEAR_DIRTY | 1846 PAGE_SET_WRITEBACK | 1847 PAGE_END_WRITEBACK); 1848 btrfs_free_path(path); 1849 return ret; 1850 } 1851 1852 static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end) 1853 { 1854 1855 if (!(inode->flags & BTRFS_INODE_NODATACOW) && 1856 !(inode->flags & BTRFS_INODE_PREALLOC)) 1857 return 0; 1858 1859 /* 1860 * @defrag_bytes is a hint value, no spinlock held here, 1861 * if is not zero, it means the file is defragging. 1862 * Force cow if given extent needs to be defragged. 1863 */ 1864 if (inode->defrag_bytes && 1865 test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL)) 1866 return 1; 1867 1868 return 0; 1869 } 1870 1871 /* 1872 * Function to process delayed allocation (create CoW) for ranges which are 1873 * being touched for the first time. 1874 */ 1875 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, 1876 u64 start, u64 end, int *page_started, unsigned long *nr_written, 1877 struct writeback_control *wbc) 1878 { 1879 int ret; 1880 int force_cow = need_force_cow(inode, start, end); 1881 1882 if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) { 1883 ret = run_delalloc_nocow(inode, locked_page, start, end, 1884 page_started, 1, nr_written); 1885 } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) { 1886 ret = run_delalloc_nocow(inode, locked_page, start, end, 1887 page_started, 0, nr_written); 1888 } else if (!inode_can_compress(inode) || 1889 !inode_need_compress(inode, start, end)) { 1890 ret = cow_file_range(inode, locked_page, start, end, 1891 page_started, nr_written, 1); 1892 } else { 1893 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); 1894 ret = cow_file_range_async(inode, wbc, locked_page, start, end, 1895 page_started, nr_written); 1896 } 1897 if (ret) 1898 btrfs_cleanup_ordered_extents(inode, locked_page, start, 1899 end - start + 1); 1900 return ret; 1901 } 1902 1903 void btrfs_split_delalloc_extent(struct inode *inode, 1904 struct extent_state *orig, u64 split) 1905 { 1906 u64 size; 1907 1908 /* not delalloc, ignore it */ 1909 if (!(orig->state & EXTENT_DELALLOC)) 1910 return; 1911 1912 size = orig->end - orig->start + 1; 1913 if (size > BTRFS_MAX_EXTENT_SIZE) { 1914 u32 num_extents; 1915 u64 new_size; 1916 1917 /* 1918 * See the explanation in btrfs_merge_delalloc_extent, the same 1919 * applies here, just in reverse. 
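 * For example, with the 128M BTRFS_MAX_EXTENT_SIZE: splitting a 128M+4K
 * delalloc extent into 128M and 4K pieces goes from 2 reserved outstanding
 * extents to 1 + 1, so nothing changes, while splitting a 256M+8K extent
 * down the middle goes from 3 to 2 + 2, so one more outstanding extent is
 * reserved below.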
1920 */ 1921 new_size = orig->end - split + 1; 1922 num_extents = count_max_extents(new_size); 1923 new_size = split - orig->start; 1924 num_extents += count_max_extents(new_size); 1925 if (count_max_extents(size) >= num_extents) 1926 return; 1927 } 1928 1929 spin_lock(&BTRFS_I(inode)->lock); 1930 btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); 1931 spin_unlock(&BTRFS_I(inode)->lock); 1932 } 1933 1934 /* 1935 * Handle merged delayed allocation extents so we can keep track of new extents 1936 * that are just merged onto old extents, such as when we are doing sequential 1937 * writes, so we can properly account for the metadata space we'll need. 1938 */ 1939 void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, 1940 struct extent_state *other) 1941 { 1942 u64 new_size, old_size; 1943 u32 num_extents; 1944 1945 /* not delalloc, ignore it */ 1946 if (!(other->state & EXTENT_DELALLOC)) 1947 return; 1948 1949 if (new->start > other->start) 1950 new_size = new->end - other->start + 1; 1951 else 1952 new_size = other->end - new->start + 1; 1953 1954 /* we're not bigger than the max, unreserve the space and go */ 1955 if (new_size <= BTRFS_MAX_EXTENT_SIZE) { 1956 spin_lock(&BTRFS_I(inode)->lock); 1957 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); 1958 spin_unlock(&BTRFS_I(inode)->lock); 1959 return; 1960 } 1961 1962 /* 1963 * We have to add up either side to figure out how many extents were 1964 * accounted for before we merged into one big extent. If the number of 1965 * extents we accounted for is <= the amount we need for the new range 1966 * then we can return, otherwise drop. Think of it like this 1967 * 1968 * [ 4k][MAX_SIZE] 1969 * 1970 * So we've grown the extent by a MAX_SIZE extent, this would mean we 1971 * need 2 outstanding extents, on one side we have 1 and the other side 1972 * we have 1 so they are == and we can return. But in this case 1973 * 1974 * [MAX_SIZE+4k][MAX_SIZE+4k] 1975 * 1976 * Each range on their own accounts for 2 extents, but merged together 1977 * they are only 3 extents worth of accounting, so we need to drop in 1978 * this case. 
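 * Concretely: each side accounted for count_max_extents(MAX_SIZE + 4k) == 2,
 * 4 in total, while the merged range only needs
 * count_max_extents(2 * MAX_SIZE + 8k) == 3, so one outstanding extent is
 * dropped below.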
1979 */ 1980 old_size = other->end - other->start + 1; 1981 num_extents = count_max_extents(old_size); 1982 old_size = new->end - new->start + 1; 1983 num_extents += count_max_extents(old_size); 1984 if (count_max_extents(new_size) >= num_extents) 1985 return; 1986 1987 spin_lock(&BTRFS_I(inode)->lock); 1988 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); 1989 spin_unlock(&BTRFS_I(inode)->lock); 1990 } 1991 1992 static void btrfs_add_delalloc_inodes(struct btrfs_root *root, 1993 struct inode *inode) 1994 { 1995 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1996 1997 spin_lock(&root->delalloc_lock); 1998 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1999 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 2000 &root->delalloc_inodes); 2001 set_bit(BTRFS_INODE_IN_DELALLOC_LIST, 2002 &BTRFS_I(inode)->runtime_flags); 2003 root->nr_delalloc_inodes++; 2004 if (root->nr_delalloc_inodes == 1) { 2005 spin_lock(&fs_info->delalloc_root_lock); 2006 BUG_ON(!list_empty(&root->delalloc_root)); 2007 list_add_tail(&root->delalloc_root, 2008 &fs_info->delalloc_roots); 2009 spin_unlock(&fs_info->delalloc_root_lock); 2010 } 2011 } 2012 spin_unlock(&root->delalloc_lock); 2013 } 2014 2015 2016 void __btrfs_del_delalloc_inode(struct btrfs_root *root, 2017 struct btrfs_inode *inode) 2018 { 2019 struct btrfs_fs_info *fs_info = root->fs_info; 2020 2021 if (!list_empty(&inode->delalloc_inodes)) { 2022 list_del_init(&inode->delalloc_inodes); 2023 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 2024 &inode->runtime_flags); 2025 root->nr_delalloc_inodes--; 2026 if (!root->nr_delalloc_inodes) { 2027 ASSERT(list_empty(&root->delalloc_inodes)); 2028 spin_lock(&fs_info->delalloc_root_lock); 2029 BUG_ON(list_empty(&root->delalloc_root)); 2030 list_del_init(&root->delalloc_root); 2031 spin_unlock(&fs_info->delalloc_root_lock); 2032 } 2033 } 2034 } 2035 2036 static void btrfs_del_delalloc_inode(struct btrfs_root *root, 2037 struct btrfs_inode *inode) 2038 { 2039 spin_lock(&root->delalloc_lock); 2040 __btrfs_del_delalloc_inode(root, inode); 2041 spin_unlock(&root->delalloc_lock); 2042 } 2043 2044 /* 2045 * Properly track delayed allocation bytes in the inode and to maintain the 2046 * list of inodes that have pending delalloc work to be done. 
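 * This is called from the extent io tree when bits are set on a range of
 * this inode; it reacts to EXTENT_DELALLOC, EXTENT_DELALLOC_NEW and
 * EXTENT_DEFRAG.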
2047 */ 2048 void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, 2049 unsigned *bits) 2050 { 2051 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2052 2053 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) 2054 WARN_ON(1); 2055 /* 2056 * set_bit and clear bit hooks normally require _irqsave/restore 2057 * but in this case, we are only testing for the DELALLOC 2058 * bit, which is only set or cleared with irqs on 2059 */ 2060 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 2061 struct btrfs_root *root = BTRFS_I(inode)->root; 2062 u64 len = state->end + 1 - state->start; 2063 u32 num_extents = count_max_extents(len); 2064 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); 2065 2066 spin_lock(&BTRFS_I(inode)->lock); 2067 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); 2068 spin_unlock(&BTRFS_I(inode)->lock); 2069 2070 /* For sanity tests */ 2071 if (btrfs_is_testing(fs_info)) 2072 return; 2073 2074 percpu_counter_add_batch(&fs_info->delalloc_bytes, len, 2075 fs_info->delalloc_batch); 2076 spin_lock(&BTRFS_I(inode)->lock); 2077 BTRFS_I(inode)->delalloc_bytes += len; 2078 if (*bits & EXTENT_DEFRAG) 2079 BTRFS_I(inode)->defrag_bytes += len; 2080 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 2081 &BTRFS_I(inode)->runtime_flags)) 2082 btrfs_add_delalloc_inodes(root, inode); 2083 spin_unlock(&BTRFS_I(inode)->lock); 2084 } 2085 2086 if (!(state->state & EXTENT_DELALLOC_NEW) && 2087 (*bits & EXTENT_DELALLOC_NEW)) { 2088 spin_lock(&BTRFS_I(inode)->lock); 2089 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - 2090 state->start; 2091 spin_unlock(&BTRFS_I(inode)->lock); 2092 } 2093 } 2094 2095 /* 2096 * Once a range is no longer delalloc this function ensures that proper 2097 * accounting happens. 2098 */ 2099 void btrfs_clear_delalloc_extent(struct inode *vfs_inode, 2100 struct extent_state *state, unsigned *bits) 2101 { 2102 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 2103 struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); 2104 u64 len = state->end + 1 - state->start; 2105 u32 num_extents = count_max_extents(len); 2106 2107 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) { 2108 spin_lock(&inode->lock); 2109 inode->defrag_bytes -= len; 2110 spin_unlock(&inode->lock); 2111 } 2112 2113 /* 2114 * set_bit and clear bit hooks normally require _irqsave/restore 2115 * but in this case, we are only testing for the DELALLOC 2116 * bit, which is only set or cleared with irqs on 2117 */ 2118 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 2119 struct btrfs_root *root = inode->root; 2120 bool do_list = !btrfs_is_free_space_inode(inode); 2121 2122 spin_lock(&inode->lock); 2123 btrfs_mod_outstanding_extents(inode, -num_extents); 2124 spin_unlock(&inode->lock); 2125 2126 /* 2127 * We don't reserve metadata space for space cache inodes so we 2128 * don't need to call delalloc_release_metadata if there is an 2129 * error. 2130 */ 2131 if (*bits & EXTENT_CLEAR_META_RESV && 2132 root != fs_info->tree_root) 2133 btrfs_delalloc_release_metadata(inode, len, false); 2134 2135 /* For sanity tests. 
*/ 2136 if (btrfs_is_testing(fs_info)) 2137 return; 2138 2139 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && 2140 do_list && !(state->state & EXTENT_NORESERVE) && 2141 (*bits & EXTENT_CLEAR_DATA_RESV)) 2142 btrfs_free_reserved_data_space_noquota(fs_info, len); 2143 2144 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, 2145 fs_info->delalloc_batch); 2146 spin_lock(&inode->lock); 2147 inode->delalloc_bytes -= len; 2148 if (do_list && inode->delalloc_bytes == 0 && 2149 test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 2150 &inode->runtime_flags)) 2151 btrfs_del_delalloc_inode(root, inode); 2152 spin_unlock(&inode->lock); 2153 } 2154 2155 if ((state->state & EXTENT_DELALLOC_NEW) && 2156 (*bits & EXTENT_DELALLOC_NEW)) { 2157 spin_lock(&inode->lock); 2158 ASSERT(inode->new_delalloc_bytes >= len); 2159 inode->new_delalloc_bytes -= len; 2160 if (*bits & EXTENT_ADD_INODE_BYTES) 2161 inode_add_bytes(&inode->vfs_inode, len); 2162 spin_unlock(&inode->lock); 2163 } 2164 } 2165 2166 /* 2167 * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit 2168 * in a chunk's stripe. This function ensures that bios do not span a 2169 * stripe/chunk 2170 * 2171 * @page - The page we are about to add to the bio 2172 * @size - size we want to add to the bio 2173 * @bio - bio we want to ensure is smaller than a stripe 2174 * @bio_flags - flags of the bio 2175 * 2176 * return 1 if page cannot be added to the bio 2177 * return 0 if page can be added to the bio 2178 * return error otherwise 2179 */ 2180 int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, 2181 unsigned long bio_flags) 2182 { 2183 struct inode *inode = page->mapping->host; 2184 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2185 u64 logical = bio->bi_iter.bi_sector << 9; 2186 u64 length = 0; 2187 u64 map_length; 2188 int ret; 2189 struct btrfs_io_geometry geom; 2190 2191 if (bio_flags & EXTENT_BIO_COMPRESSED) 2192 return 0; 2193 2194 length = bio->bi_iter.bi_size; 2195 map_length = length; 2196 ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length, 2197 &geom); 2198 if (ret < 0) 2199 return ret; 2200 2201 if (geom.len < length + size) 2202 return 1; 2203 return 0; 2204 } 2205 2206 /* 2207 * in order to insert checksums into the metadata in large chunks, 2208 * we wait until bio submission time. All the pages in the bio are 2209 * checksummed and sums are attached onto the ordered extent record. 2210 * 2211 * At IO completion time the cums attached on the ordered extent record 2212 * are inserted into the btree 2213 */ 2214 static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, 2215 u64 dio_file_offset) 2216 { 2217 return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); 2218 } 2219 2220 /* 2221 * extent_io.c submission hook. This does the right thing for csum calculation 2222 * on write, or reading the csums from the tree before a read. 
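 * For writes that need checksumming, the csum work is either done inline
 * via btrfs_csum_one_bio() or handed off to a worker through
 * btrfs_wq_submit_bio(), which runs btrfs_submit_bio_start() before the
 * bio is mapped.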
2223 * 2224 * Rules about async/sync submit, 2225 * a) read: sync submit 2226 * 2227 * b) write without checksum: sync submit 2228 * 2229 * c) write with checksum: 2230 * c-1) if bio is issued by fsync: sync submit 2231 * (sync_writers != 0) 2232 * 2233 * c-2) if root is reloc root: sync submit 2234 * (only in case of buffered IO) 2235 * 2236 * c-3) otherwise: async submit 2237 */ 2238 blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, 2239 int mirror_num, unsigned long bio_flags) 2240 2241 { 2242 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2243 struct btrfs_root *root = BTRFS_I(inode)->root; 2244 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; 2245 blk_status_t ret = 0; 2246 int skip_sum; 2247 int async = !atomic_read(&BTRFS_I(inode)->sync_writers); 2248 2249 skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || 2250 !fs_info->csum_root; 2251 2252 if (btrfs_is_free_space_inode(BTRFS_I(inode))) 2253 metadata = BTRFS_WQ_ENDIO_FREE_SPACE; 2254 2255 if (bio_op(bio) != REQ_OP_WRITE) { 2256 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); 2257 if (ret) 2258 goto out; 2259 2260 if (bio_flags & EXTENT_BIO_COMPRESSED) { 2261 ret = btrfs_submit_compressed_read(inode, bio, 2262 mirror_num, 2263 bio_flags); 2264 goto out; 2265 } else { 2266 /* 2267 * Lookup bio sums does extra checks around whether we 2268 * need to csum or not, which is why we ignore skip_sum 2269 * here. 2270 */ 2271 ret = btrfs_lookup_bio_sums(inode, bio, NULL); 2272 if (ret) 2273 goto out; 2274 } 2275 goto mapit; 2276 } else if (async && !skip_sum) { 2277 /* csum items have already been cloned */ 2278 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 2279 goto mapit; 2280 /* we're doing a write, do the async checksumming */ 2281 ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, 2282 0, btrfs_submit_bio_start); 2283 goto out; 2284 } else if (!skip_sum) { 2285 ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); 2286 if (ret) 2287 goto out; 2288 } 2289 2290 mapit: 2291 ret = btrfs_map_bio(fs_info, bio, mirror_num); 2292 2293 out: 2294 if (ret) { 2295 bio->bi_status = ret; 2296 bio_endio(bio); 2297 } 2298 return ret; 2299 } 2300 2301 /* 2302 * given a list of ordered sums record them in the inode. This happens 2303 * at IO completion time based on sums calculated at bio submission time. 
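 * The sums were attached to the ordered extent by btrfs_csum_one_bio() and
 * are written into the csum tree here via btrfs_csum_file_blocks().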
2304 */ 2305 static int add_pending_csums(struct btrfs_trans_handle *trans, 2306 struct list_head *list) 2307 { 2308 struct btrfs_ordered_sum *sum; 2309 int ret; 2310 2311 list_for_each_entry(sum, list, list) { 2312 trans->adding_csums = true; 2313 ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum); 2314 trans->adding_csums = false; 2315 if (ret) 2316 return ret; 2317 } 2318 return 0; 2319 } 2320 2321 static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, 2322 const u64 start, 2323 const u64 len, 2324 struct extent_state **cached_state) 2325 { 2326 u64 search_start = start; 2327 const u64 end = start + len - 1; 2328 2329 while (search_start < end) { 2330 const u64 search_len = end - search_start + 1; 2331 struct extent_map *em; 2332 u64 em_len; 2333 int ret = 0; 2334 2335 em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); 2336 if (IS_ERR(em)) 2337 return PTR_ERR(em); 2338 2339 if (em->block_start != EXTENT_MAP_HOLE) 2340 goto next; 2341 2342 em_len = em->len; 2343 if (em->start < search_start) 2344 em_len -= search_start - em->start; 2345 if (em_len > search_len) 2346 em_len = search_len; 2347 2348 ret = set_extent_bit(&inode->io_tree, search_start, 2349 search_start + em_len - 1, 2350 EXTENT_DELALLOC_NEW, 0, NULL, cached_state, 2351 GFP_NOFS, NULL); 2352 next: 2353 search_start = extent_map_end(em); 2354 free_extent_map(em); 2355 if (ret) 2356 return ret; 2357 } 2358 return 0; 2359 } 2360 2361 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, 2362 unsigned int extra_bits, 2363 struct extent_state **cached_state) 2364 { 2365 WARN_ON(PAGE_ALIGNED(end)); 2366 2367 if (start >= i_size_read(&inode->vfs_inode) && 2368 !(inode->flags & BTRFS_INODE_PREALLOC)) { 2369 /* 2370 * There can't be any extents following eof in this case so just 2371 * set the delalloc new bit for the range directly. 2372 */ 2373 extra_bits |= EXTENT_DELALLOC_NEW; 2374 } else { 2375 int ret; 2376 2377 ret = btrfs_find_new_delalloc_bytes(inode, start, 2378 end + 1 - start, 2379 cached_state); 2380 if (ret) 2381 return ret; 2382 } 2383 2384 return set_extent_delalloc(&inode->io_tree, start, end, extra_bits, 2385 cached_state); 2386 } 2387 2388 /* see btrfs_writepage_start_hook for details on why this is required */ 2389 struct btrfs_writepage_fixup { 2390 struct page *page; 2391 struct inode *inode; 2392 struct btrfs_work work; 2393 }; 2394 2395 static void btrfs_writepage_fixup_worker(struct btrfs_work *work) 2396 { 2397 struct btrfs_writepage_fixup *fixup; 2398 struct btrfs_ordered_extent *ordered; 2399 struct extent_state *cached_state = NULL; 2400 struct extent_changeset *data_reserved = NULL; 2401 struct page *page; 2402 struct btrfs_inode *inode; 2403 u64 page_start; 2404 u64 page_end; 2405 int ret = 0; 2406 bool free_delalloc_space = true; 2407 2408 fixup = container_of(work, struct btrfs_writepage_fixup, work); 2409 page = fixup->page; 2410 inode = BTRFS_I(fixup->inode); 2411 page_start = page_offset(page); 2412 page_end = page_offset(page) + PAGE_SIZE - 1; 2413 2414 /* 2415 * This is similar to page_mkwrite, we need to reserve the space before 2416 * we take the page lock. 2417 */ 2418 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, 2419 PAGE_SIZE); 2420 again: 2421 lock_page(page); 2422 2423 /* 2424 * Before we queued this fixup, we took a reference on the page. 2425 * page->mapping may go NULL, but it shouldn't be moved to a different 2426 * address space. 
2427 */ 2428 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { 2429 /* 2430 * Unfortunately this is a little tricky, either 2431 * 2432 * 1) We got here and our page had already been dealt with and 2433 * we reserved our space, thus ret == 0, so we need to just 2434 * drop our space reservation and bail. This can happen the 2435 * first time we come into the fixup worker, or could happen 2436 * while waiting for the ordered extent. 2437 * 2) Our page was already dealt with, but we happened to get an 2438 * ENOSPC above from the btrfs_delalloc_reserve_space. In 2439 * this case we obviously don't have anything to release, but 2440 * because the page was already dealt with we don't want to 2441 * mark the page with an error, so make sure we're resetting 2442 * ret to 0. This is why we have this check _before_ the ret 2443 * check, because we do not want to have a surprise ENOSPC 2444 * when the page was already properly dealt with. 2445 */ 2446 if (!ret) { 2447 btrfs_delalloc_release_extents(inode, PAGE_SIZE); 2448 btrfs_delalloc_release_space(inode, data_reserved, 2449 page_start, PAGE_SIZE, 2450 true); 2451 } 2452 ret = 0; 2453 goto out_page; 2454 } 2455 2456 /* 2457 * We can't mess with the page state unless it is locked, so now that 2458 * it is locked bail if we failed to make our space reservation. 2459 */ 2460 if (ret) 2461 goto out_page; 2462 2463 lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); 2464 2465 /* already ordered? We're done */ 2466 if (PagePrivate2(page)) 2467 goto out_reserved; 2468 2469 ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); 2470 if (ordered) { 2471 unlock_extent_cached(&inode->io_tree, page_start, page_end, 2472 &cached_state); 2473 unlock_page(page); 2474 btrfs_start_ordered_extent(ordered, 1); 2475 btrfs_put_ordered_extent(ordered); 2476 goto again; 2477 } 2478 2479 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, 2480 &cached_state); 2481 if (ret) 2482 goto out_reserved; 2483 2484 /* 2485 * Everything went as planned, we're now the owner of a dirty page with 2486 * delayed allocation bits set and space reserved for our COW 2487 * destination. 2488 * 2489 * The page was dirty when we started, nothing should have cleaned it. 2490 */ 2491 BUG_ON(!PageDirty(page)); 2492 free_delalloc_space = false; 2493 out_reserved: 2494 btrfs_delalloc_release_extents(inode, PAGE_SIZE); 2495 if (free_delalloc_space) 2496 btrfs_delalloc_release_space(inode, data_reserved, page_start, 2497 PAGE_SIZE, true); 2498 unlock_extent_cached(&inode->io_tree, page_start, page_end, 2499 &cached_state); 2500 out_page: 2501 if (ret) { 2502 /* 2503 * We hit ENOSPC or other errors. Update the mapping and page 2504 * to reflect the errors and clean the page. 2505 */ 2506 mapping_set_error(page->mapping, ret); 2507 end_extent_writepage(page, ret, page_start, page_end); 2508 clear_page_dirty_for_io(page); 2509 SetPageError(page); 2510 } 2511 ClearPageChecked(page); 2512 unlock_page(page); 2513 put_page(page); 2514 kfree(fixup); 2515 extent_changeset_free(data_reserved); 2516 /* 2517 * As a precaution, do a delayed iput in case it would be the last iput 2518 * that could need flushing space. Recursing back to fixup worker would 2519 * deadlock. 2520 */ 2521 btrfs_add_delayed_iput(&inode->vfs_inode); 2522 } 2523 2524 /* 2525 * There are a few paths in the higher layers of the kernel that directly 2526 * set the page dirty bit without asking the filesystem if it is a 2527 * good idea. 
This causes problems because we want to make sure COW 2528 * properly happens and the data=ordered rules are followed. 2529 * 2530 * In our case any range that doesn't have the ORDERED bit set 2531 * hasn't been properly setup for IO. We kick off an async process 2532 * to fix it up. The async helper will wait for ordered extents, set 2533 * the delalloc bit and make it safe to write the page. 2534 */ 2535 int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) 2536 { 2537 struct inode *inode = page->mapping->host; 2538 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2539 struct btrfs_writepage_fixup *fixup; 2540 2541 /* this page is properly in the ordered list */ 2542 if (TestClearPagePrivate2(page)) 2543 return 0; 2544 2545 /* 2546 * PageChecked is set below when we create a fixup worker for this page, 2547 * don't try to create another one if we're already PageChecked() 2548 * 2549 * The extent_io writepage code will redirty the page if we send back 2550 * EAGAIN. 2551 */ 2552 if (PageChecked(page)) 2553 return -EAGAIN; 2554 2555 fixup = kzalloc(sizeof(*fixup), GFP_NOFS); 2556 if (!fixup) 2557 return -EAGAIN; 2558 2559 /* 2560 * We are already holding a reference to this inode from 2561 * write_cache_pages. We need to hold it because the space reservation 2562 * takes place outside of the page lock, and we can't trust 2563 * page->mapping outside of the page lock. 2564 */ 2565 ihold(inode); 2566 SetPageChecked(page); 2567 get_page(page); 2568 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); 2569 fixup->page = page; 2570 fixup->inode = inode; 2571 btrfs_queue_work(fs_info->fixup_workers, &fixup->work); 2572 2573 return -EAGAIN; 2574 } 2575 2576 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, 2577 struct btrfs_inode *inode, u64 file_pos, 2578 struct btrfs_file_extent_item *stack_fi, 2579 const bool update_inode_bytes, 2580 u64 qgroup_reserved) 2581 { 2582 struct btrfs_root *root = inode->root; 2583 const u64 sectorsize = root->fs_info->sectorsize; 2584 struct btrfs_path *path; 2585 struct extent_buffer *leaf; 2586 struct btrfs_key ins; 2587 u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); 2588 u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); 2589 u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); 2590 u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); 2591 struct btrfs_drop_extents_args drop_args = { 0 }; 2592 int ret; 2593 2594 path = btrfs_alloc_path(); 2595 if (!path) 2596 return -ENOMEM; 2597 2598 /* 2599 * we may be replacing one extent in the tree with another. 2600 * The new extent is pinned in the extent map, and we don't want 2601 * to drop it from the cache until it is completely in the btree. 2602 * 2603 * So, tell btrfs_drop_extents to leave this extent in the cache. 2604 * the caller is expected to unpin it and allow it to be merged 2605 * with the others. 
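 * (drop_args.replace_extent also asks btrfs_drop_extents() to reuse the
 * leaf space it frees for the new file extent item; when that works,
 * drop_args.extent_inserted is set and the explicit item insertion below
 * is skipped)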
2606 */ 2607 drop_args.path = path; 2608 drop_args.start = file_pos; 2609 drop_args.end = file_pos + num_bytes; 2610 drop_args.replace_extent = true; 2611 drop_args.extent_item_size = sizeof(*stack_fi); 2612 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 2613 if (ret) 2614 goto out; 2615 2616 if (!drop_args.extent_inserted) { 2617 ins.objectid = btrfs_ino(inode); 2618 ins.offset = file_pos; 2619 ins.type = BTRFS_EXTENT_DATA_KEY; 2620 2621 ret = btrfs_insert_empty_item(trans, root, path, &ins, 2622 sizeof(*stack_fi)); 2623 if (ret) 2624 goto out; 2625 } 2626 leaf = path->nodes[0]; 2627 btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); 2628 write_extent_buffer(leaf, stack_fi, 2629 btrfs_item_ptr_offset(leaf, path->slots[0]), 2630 sizeof(struct btrfs_file_extent_item)); 2631 2632 btrfs_mark_buffer_dirty(leaf); 2633 btrfs_release_path(path); 2634 2635 /* 2636 * If we dropped an inline extent here, we know the range where it is 2637 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the 2638 * number of bytes only for that range contaning the inline extent. 2639 * The remaining of the range will be processed when clearning the 2640 * EXTENT_DELALLOC_BIT bit through the ordered extent completion. 2641 */ 2642 if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { 2643 u64 inline_size = round_down(drop_args.bytes_found, sectorsize); 2644 2645 inline_size = drop_args.bytes_found - inline_size; 2646 btrfs_update_inode_bytes(inode, sectorsize, inline_size); 2647 drop_args.bytes_found -= inline_size; 2648 num_bytes -= sectorsize; 2649 } 2650 2651 if (update_inode_bytes) 2652 btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); 2653 2654 ins.objectid = disk_bytenr; 2655 ins.offset = disk_num_bytes; 2656 ins.type = BTRFS_EXTENT_ITEM_KEY; 2657 2658 ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); 2659 if (ret) 2660 goto out; 2661 2662 ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), 2663 file_pos, qgroup_reserved, &ins); 2664 out: 2665 btrfs_free_path(path); 2666 2667 return ret; 2668 } 2669 2670 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, 2671 u64 start, u64 len) 2672 { 2673 struct btrfs_block_group *cache; 2674 2675 cache = btrfs_lookup_block_group(fs_info, start); 2676 ASSERT(cache); 2677 2678 spin_lock(&cache->lock); 2679 cache->delalloc_bytes -= len; 2680 spin_unlock(&cache->lock); 2681 2682 btrfs_put_block_group(cache); 2683 } 2684 2685 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, 2686 struct btrfs_ordered_extent *oe) 2687 { 2688 struct btrfs_file_extent_item stack_fi; 2689 u64 logical_len; 2690 bool update_inode_bytes; 2691 2692 memset(&stack_fi, 0, sizeof(stack_fi)); 2693 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); 2694 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); 2695 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, 2696 oe->disk_num_bytes); 2697 if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) 2698 logical_len = oe->truncated_len; 2699 else 2700 logical_len = oe->num_bytes; 2701 btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len); 2702 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len); 2703 btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); 2704 /* Encryption and other encoding is reserved and all 0 */ 2705 2706 /* 2707 * For delalloc, when completing an ordered extent we update the inode's 2708 * bytes when clearing the range 
in the inode's io tree, so pass false 2709 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(), 2710 * except if the ordered extent was truncated. 2711 */ 2712 update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || 2713 test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); 2714 2715 return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), 2716 oe->file_offset, &stack_fi, 2717 update_inode_bytes, oe->qgroup_rsv); 2718 } 2719 2720 /* 2721 * As ordered data IO finishes, this gets called so we can finish 2722 * an ordered extent if the range of bytes in the file it covers are 2723 * fully written. 2724 */ 2725 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 2726 { 2727 struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); 2728 struct btrfs_root *root = inode->root; 2729 struct btrfs_fs_info *fs_info = root->fs_info; 2730 struct btrfs_trans_handle *trans = NULL; 2731 struct extent_io_tree *io_tree = &inode->io_tree; 2732 struct extent_state *cached_state = NULL; 2733 u64 start, end; 2734 int compress_type = 0; 2735 int ret = 0; 2736 u64 logical_len = ordered_extent->num_bytes; 2737 bool freespace_inode; 2738 bool truncated = false; 2739 bool clear_reserved_extent = true; 2740 unsigned int clear_bits = EXTENT_DEFRAG; 2741 2742 start = ordered_extent->file_offset; 2743 end = start + ordered_extent->num_bytes - 1; 2744 2745 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 2746 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && 2747 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) 2748 clear_bits |= EXTENT_DELALLOC_NEW; 2749 2750 freespace_inode = btrfs_is_free_space_inode(inode); 2751 2752 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 2753 ret = -EIO; 2754 goto out; 2755 } 2756 2757 btrfs_free_io_failure_record(inode, start, end); 2758 2759 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { 2760 truncated = true; 2761 logical_len = ordered_extent->truncated_len; 2762 /* Truncated the entire extent, don't bother adding */ 2763 if (!logical_len) 2764 goto out; 2765 } 2766 2767 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 2768 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 2769 2770 btrfs_inode_safe_disk_i_size_write(inode, 0); 2771 if (freespace_inode) 2772 trans = btrfs_join_transaction_spacecache(root); 2773 else 2774 trans = btrfs_join_transaction(root); 2775 if (IS_ERR(trans)) { 2776 ret = PTR_ERR(trans); 2777 trans = NULL; 2778 goto out; 2779 } 2780 trans->block_rsv = &inode->block_rsv; 2781 ret = btrfs_update_inode_fallback(trans, root, inode); 2782 if (ret) /* -ENOMEM or corruption */ 2783 btrfs_abort_transaction(trans, ret); 2784 goto out; 2785 } 2786 2787 clear_bits |= EXTENT_LOCKED; 2788 lock_extent_bits(io_tree, start, end, &cached_state); 2789 2790 if (freespace_inode) 2791 trans = btrfs_join_transaction_spacecache(root); 2792 else 2793 trans = btrfs_join_transaction(root); 2794 if (IS_ERR(trans)) { 2795 ret = PTR_ERR(trans); 2796 trans = NULL; 2797 goto out; 2798 } 2799 2800 trans->block_rsv = &inode->block_rsv; 2801 2802 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 2803 compress_type = ordered_extent->compress_type; 2804 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 2805 BUG_ON(compress_type); 2806 ret = btrfs_mark_extent_written(trans, inode, 2807 ordered_extent->file_offset, 2808 ordered_extent->file_offset + 2809 logical_len); 2810 } else { 2811 BUG_ON(root == fs_info->tree_root); 2812 ret 
= insert_ordered_extent_file_extent(trans, ordered_extent); 2813 if (!ret) { 2814 clear_reserved_extent = false; 2815 btrfs_release_delalloc_bytes(fs_info, 2816 ordered_extent->disk_bytenr, 2817 ordered_extent->disk_num_bytes); 2818 } 2819 } 2820 unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset, 2821 ordered_extent->num_bytes, trans->transid); 2822 if (ret < 0) { 2823 btrfs_abort_transaction(trans, ret); 2824 goto out; 2825 } 2826 2827 ret = add_pending_csums(trans, &ordered_extent->list); 2828 if (ret) { 2829 btrfs_abort_transaction(trans, ret); 2830 goto out; 2831 } 2832 2833 /* 2834 * If this is a new delalloc range, clear its new delalloc flag to 2835 * update the inode's number of bytes. This needs to be done first 2836 * before updating the inode item. 2837 */ 2838 if ((clear_bits & EXTENT_DELALLOC_NEW) && 2839 !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) 2840 clear_extent_bit(&inode->io_tree, start, end, 2841 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, 2842 0, 0, &cached_state); 2843 2844 btrfs_inode_safe_disk_i_size_write(inode, 0); 2845 ret = btrfs_update_inode_fallback(trans, root, inode); 2846 if (ret) { /* -ENOMEM or corruption */ 2847 btrfs_abort_transaction(trans, ret); 2848 goto out; 2849 } 2850 ret = 0; 2851 out: 2852 clear_extent_bit(&inode->io_tree, start, end, clear_bits, 2853 (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, 2854 &cached_state); 2855 2856 if (trans) 2857 btrfs_end_transaction(trans); 2858 2859 if (ret || truncated) { 2860 u64 unwritten_start = start; 2861 2862 if (truncated) 2863 unwritten_start += logical_len; 2864 clear_extent_uptodate(io_tree, unwritten_start, end, NULL); 2865 2866 /* Drop the cache for the part of the extent we didn't write. */ 2867 btrfs_drop_extent_cache(inode, unwritten_start, end, 0); 2868 2869 /* 2870 * If the ordered extent had an IOERR or something else went 2871 * wrong we need to return the space for this ordered extent 2872 * back to the allocator. We only free the extent in the 2873 * truncated case if we didn't write out the extent at all. 2874 * 2875 * If we made it past insert_reserved_file_extent before we 2876 * errored out then we don't need to do this as the accounting 2877 * has already been done. 2878 */ 2879 if ((ret || !logical_len) && 2880 clear_reserved_extent && 2881 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 2882 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 2883 /* 2884 * Discard the range before returning it back to the 2885 * free space pool 2886 */ 2887 if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC)) 2888 btrfs_discard_extent(fs_info, 2889 ordered_extent->disk_bytenr, 2890 ordered_extent->disk_num_bytes, 2891 NULL); 2892 btrfs_free_reserved_extent(fs_info, 2893 ordered_extent->disk_bytenr, 2894 ordered_extent->disk_num_bytes, 1); 2895 } 2896 } 2897 2898 /* 2899 * This needs to be done to make sure anybody waiting knows we are done 2900 * updating everything for this ordered extent. 
2901 */ 2902 btrfs_remove_ordered_extent(inode, ordered_extent); 2903 2904 /* once for us */ 2905 btrfs_put_ordered_extent(ordered_extent); 2906 /* once for the tree */ 2907 btrfs_put_ordered_extent(ordered_extent); 2908 2909 return ret; 2910 } 2911 2912 static void finish_ordered_fn(struct btrfs_work *work) 2913 { 2914 struct btrfs_ordered_extent *ordered_extent; 2915 ordered_extent = container_of(work, struct btrfs_ordered_extent, work); 2916 btrfs_finish_ordered_io(ordered_extent); 2917 } 2918 2919 void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, 2920 u64 end, int uptodate) 2921 { 2922 struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 2923 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2924 struct btrfs_ordered_extent *ordered_extent = NULL; 2925 struct btrfs_workqueue *wq; 2926 2927 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2928 2929 ClearPagePrivate2(page); 2930 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 2931 end - start + 1, uptodate)) 2932 return; 2933 2934 if (btrfs_is_free_space_inode(inode)) 2935 wq = fs_info->endio_freespace_worker; 2936 else 2937 wq = fs_info->endio_write_workers; 2938 2939 btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); 2940 btrfs_queue_work(wq, &ordered_extent->work); 2941 } 2942 2943 /* 2944 * check_data_csum - verify checksum of one sector of uncompressed data 2945 * @inode: inode 2946 * @io_bio: btrfs_io_bio which contains the csum 2947 * @bio_offset: offset to the beginning of the bio (in bytes) 2948 * @page: page where is the data to be verified 2949 * @pgoff: offset inside the page 2950 * 2951 * The length of such check is always one sector size. 2952 */ 2953 static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, 2954 u32 bio_offset, struct page *page, u32 pgoff) 2955 { 2956 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2957 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 2958 char *kaddr; 2959 u32 len = fs_info->sectorsize; 2960 const u32 csum_size = fs_info->csum_size; 2961 unsigned int offset_sectors; 2962 u8 *csum_expected; 2963 u8 csum[BTRFS_CSUM_SIZE]; 2964 2965 ASSERT(pgoff + len <= PAGE_SIZE); 2966 2967 offset_sectors = bio_offset >> fs_info->sectorsize_bits; 2968 csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size; 2969 2970 kaddr = kmap_atomic(page); 2971 shash->tfm = fs_info->csum_shash; 2972 2973 crypto_shash_digest(shash, kaddr + pgoff, len, csum); 2974 2975 if (memcmp(csum, csum_expected, csum_size)) 2976 goto zeroit; 2977 2978 kunmap_atomic(kaddr); 2979 return 0; 2980 zeroit: 2981 btrfs_print_data_csum_error(BTRFS_I(inode), page_offset(page) + pgoff, 2982 csum, csum_expected, io_bio->mirror_num); 2983 if (io_bio->device) 2984 btrfs_dev_stat_inc_and_print(io_bio->device, 2985 BTRFS_DEV_STAT_CORRUPTION_ERRS); 2986 memset(kaddr + pgoff, 1, len); 2987 flush_dcache_page(page); 2988 kunmap_atomic(kaddr); 2989 return -EIO; 2990 } 2991 2992 /* 2993 * When reads are done, we need to check csums to verify the data is correct. 2994 * if there's a match, we allow the bio to finish. If not, the code in 2995 * extent_io.c will try to find good copies for us. 
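 * Returns 0 if every sector in the range passes (or no verification was
 * needed), and -EIO if any sector's checksum does not match.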
2996 * 2997 * @bio_offset: offset to the beginning of the bio (in bytes) 2998 * @start: file offset of the range start 2999 * @end: file offset of the range end (inclusive) 3000 * @mirror: mirror number 3001 */ 3002 int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, 3003 struct page *page, u64 start, u64 end, int mirror) 3004 { 3005 struct inode *inode = page->mapping->host; 3006 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3007 struct btrfs_root *root = BTRFS_I(inode)->root; 3008 const u32 sectorsize = root->fs_info->sectorsize; 3009 u32 pg_off; 3010 3011 if (PageChecked(page)) { 3012 ClearPageChecked(page); 3013 return 0; 3014 } 3015 3016 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 3017 return 0; 3018 3019 if (!root->fs_info->csum_root) 3020 return 0; 3021 3022 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 3023 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 3024 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); 3025 return 0; 3026 } 3027 3028 ASSERT(page_offset(page) <= start && 3029 end <= page_offset(page) + PAGE_SIZE - 1); 3030 for (pg_off = offset_in_page(start); 3031 pg_off < offset_in_page(end); 3032 pg_off += sectorsize, bio_offset += sectorsize) { 3033 int ret; 3034 3035 ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off); 3036 if (ret < 0) 3037 return -EIO; 3038 } 3039 return 0; 3040 } 3041 3042 /* 3043 * btrfs_add_delayed_iput - perform a delayed iput on @inode 3044 * 3045 * @inode: The inode we want to perform iput on 3046 * 3047 * This function uses the generic vfs_inode::i_count to track whether we should 3048 * just decrement it (in case it's > 1) or if this is the last iput then link 3049 * the inode to the delayed iput machinery. Delayed iputs are processed at 3050 * transaction commit time/superblock commit/cleaner kthread. 
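 * Callers use this instead of a plain iput() in contexts (endio completion,
 * flushing, the writepage fixup worker above) where running the final iput
 * directly could deadlock or recurse into space flushing.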
3051 */ 3052 void btrfs_add_delayed_iput(struct inode *inode) 3053 { 3054 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3055 struct btrfs_inode *binode = BTRFS_I(inode); 3056 3057 if (atomic_add_unless(&inode->i_count, -1, 1)) 3058 return; 3059 3060 atomic_inc(&fs_info->nr_delayed_iputs); 3061 spin_lock(&fs_info->delayed_iput_lock); 3062 ASSERT(list_empty(&binode->delayed_iput)); 3063 list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); 3064 spin_unlock(&fs_info->delayed_iput_lock); 3065 if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) 3066 wake_up_process(fs_info->cleaner_kthread); 3067 } 3068 3069 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, 3070 struct btrfs_inode *inode) 3071 { 3072 list_del_init(&inode->delayed_iput); 3073 spin_unlock(&fs_info->delayed_iput_lock); 3074 iput(&inode->vfs_inode); 3075 if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) 3076 wake_up(&fs_info->delayed_iputs_wait); 3077 spin_lock(&fs_info->delayed_iput_lock); 3078 } 3079 3080 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, 3081 struct btrfs_inode *inode) 3082 { 3083 if (!list_empty(&inode->delayed_iput)) { 3084 spin_lock(&fs_info->delayed_iput_lock); 3085 if (!list_empty(&inode->delayed_iput)) 3086 run_delayed_iput_locked(fs_info, inode); 3087 spin_unlock(&fs_info->delayed_iput_lock); 3088 } 3089 } 3090 3091 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) 3092 { 3093 3094 spin_lock(&fs_info->delayed_iput_lock); 3095 while (!list_empty(&fs_info->delayed_iputs)) { 3096 struct btrfs_inode *inode; 3097 3098 inode = list_first_entry(&fs_info->delayed_iputs, 3099 struct btrfs_inode, delayed_iput); 3100 run_delayed_iput_locked(fs_info, inode); 3101 } 3102 spin_unlock(&fs_info->delayed_iput_lock); 3103 } 3104 3105 /** 3106 * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running 3107 * @fs_info - the fs_info for this fs 3108 * @return - EINTR if we were killed, 0 if nothing's pending 3109 * 3110 * This will wait on any delayed iputs that are currently running with KILLABLE 3111 * set. Once they are all done running we will return, unless we are killed in 3112 * which case we return EINTR. This helps in user operations like fallocate etc 3113 * that might get blocked on the iputs. 3114 */ 3115 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) 3116 { 3117 int ret = wait_event_killable(fs_info->delayed_iputs_wait, 3118 atomic_read(&fs_info->nr_delayed_iputs) == 0); 3119 if (ret) 3120 return -EINTR; 3121 return 0; 3122 } 3123 3124 /* 3125 * This creates an orphan entry for the given inode in case something goes wrong 3126 * in the middle of an unlink. 3127 */ 3128 int btrfs_orphan_add(struct btrfs_trans_handle *trans, 3129 struct btrfs_inode *inode) 3130 { 3131 int ret; 3132 3133 ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); 3134 if (ret && ret != -EEXIST) { 3135 btrfs_abort_transaction(trans, ret); 3136 return ret; 3137 } 3138 3139 return 0; 3140 } 3141 3142 /* 3143 * We have done the delete so we can go ahead and remove the orphan item for 3144 * this particular inode. 3145 */ 3146 static int btrfs_orphan_del(struct btrfs_trans_handle *trans, 3147 struct btrfs_inode *inode) 3148 { 3149 return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode)); 3150 } 3151 3152 /* 3153 * this cleans up any orphans that may be left on the list from the last use 3154 * of this root. 
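 * Orphan items live in this root's tree with objectid BTRFS_ORPHAN_OBJECTID,
 * type BTRFS_ORPHAN_ITEM_KEY and the orphaned inode's number stored in the
 * key offset, which is how the scan below finds the inodes left to delete.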
3155 */ 3156 int btrfs_orphan_cleanup(struct btrfs_root *root) 3157 { 3158 struct btrfs_fs_info *fs_info = root->fs_info; 3159 struct btrfs_path *path; 3160 struct extent_buffer *leaf; 3161 struct btrfs_key key, found_key; 3162 struct btrfs_trans_handle *trans; 3163 struct inode *inode; 3164 u64 last_objectid = 0; 3165 int ret = 0, nr_unlink = 0; 3166 3167 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 3168 return 0; 3169 3170 path = btrfs_alloc_path(); 3171 if (!path) { 3172 ret = -ENOMEM; 3173 goto out; 3174 } 3175 path->reada = READA_BACK; 3176 3177 key.objectid = BTRFS_ORPHAN_OBJECTID; 3178 key.type = BTRFS_ORPHAN_ITEM_KEY; 3179 key.offset = (u64)-1; 3180 3181 while (1) { 3182 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3183 if (ret < 0) 3184 goto out; 3185 3186 /* 3187 * if ret == 0 means we found what we were searching for, which 3188 * is weird, but possible, so only screw with path if we didn't 3189 * find the key and see if we have stuff that matches 3190 */ 3191 if (ret > 0) { 3192 ret = 0; 3193 if (path->slots[0] == 0) 3194 break; 3195 path->slots[0]--; 3196 } 3197 3198 /* pull out the item */ 3199 leaf = path->nodes[0]; 3200 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3201 3202 /* make sure the item matches what we want */ 3203 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 3204 break; 3205 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) 3206 break; 3207 3208 /* release the path since we're done with it */ 3209 btrfs_release_path(path); 3210 3211 /* 3212 * this is where we are basically btrfs_lookup, without the 3213 * crossing root thing. we store the inode number in the 3214 * offset of the orphan item. 3215 */ 3216 3217 if (found_key.offset == last_objectid) { 3218 btrfs_err(fs_info, 3219 "Error removing orphan entry, stopping orphan cleanup"); 3220 ret = -EINVAL; 3221 goto out; 3222 } 3223 3224 last_objectid = found_key.offset; 3225 3226 found_key.objectid = found_key.offset; 3227 found_key.type = BTRFS_INODE_ITEM_KEY; 3228 found_key.offset = 0; 3229 inode = btrfs_iget(fs_info->sb, last_objectid, root); 3230 ret = PTR_ERR_OR_ZERO(inode); 3231 if (ret && ret != -ENOENT) 3232 goto out; 3233 3234 if (ret == -ENOENT && root == fs_info->tree_root) { 3235 struct btrfs_root *dead_root; 3236 int is_dead_root = 0; 3237 3238 /* 3239 * this is an orphan in the tree root. Currently these 3240 * could come from 2 sources: 3241 * a) a snapshot deletion in progress 3242 * b) a free space cache inode 3243 * We need to distinguish those two, as the snapshot 3244 * orphan must not get deleted. 3245 * find_dead_roots already ran before us, so if this 3246 * is a snapshot deletion, we should find the root 3247 * in the fs_roots radix tree. 3248 */ 3249 3250 spin_lock(&fs_info->fs_roots_radix_lock); 3251 dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, 3252 (unsigned long)found_key.objectid); 3253 if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) 3254 is_dead_root = 1; 3255 spin_unlock(&fs_info->fs_roots_radix_lock); 3256 3257 if (is_dead_root) { 3258 /* prevent this orphan from being found again */ 3259 key.offset = found_key.objectid - 1; 3260 continue; 3261 } 3262 3263 } 3264 3265 /* 3266 * If we have an inode with links, there are a couple of 3267 * possibilities. Old kernels (before v3.12) used to create an 3268 * orphan item for truncate indicating that there were possibly 3269 * extent items past i_size that needed to be deleted. 
In v3.12, 3270 * truncate was changed to update i_size in sync with the extent 3271 * items, but the (useless) orphan item was still created. Since 3272 * v4.18, we don't create the orphan item for truncate at all. 3273 * 3274 * So, this item could mean that we need to do a truncate, but 3275 * only if this filesystem was last used on a pre-v3.12 kernel 3276 * and was not cleanly unmounted. The odds of that are quite 3277 * slim, and it's a pain to do the truncate now, so just delete 3278 * the orphan item. 3279 * 3280 * It's also possible that this orphan item was supposed to be 3281 * deleted but wasn't. The inode number may have been reused, 3282 * but either way, we can delete the orphan item. 3283 */ 3284 if (ret == -ENOENT || inode->i_nlink) { 3285 if (!ret) 3286 iput(inode); 3287 trans = btrfs_start_transaction(root, 1); 3288 if (IS_ERR(trans)) { 3289 ret = PTR_ERR(trans); 3290 goto out; 3291 } 3292 btrfs_debug(fs_info, "auto deleting %Lu", 3293 found_key.objectid); 3294 ret = btrfs_del_orphan_item(trans, root, 3295 found_key.objectid); 3296 btrfs_end_transaction(trans); 3297 if (ret) 3298 goto out; 3299 continue; 3300 } 3301 3302 nr_unlink++; 3303 3304 /* this will do delete_inode and everything for us */ 3305 iput(inode); 3306 } 3307 /* release the path since we're done with it */ 3308 btrfs_release_path(path); 3309 3310 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 3311 3312 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { 3313 trans = btrfs_join_transaction(root); 3314 if (!IS_ERR(trans)) 3315 btrfs_end_transaction(trans); 3316 } 3317 3318 if (nr_unlink) 3319 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink); 3320 3321 out: 3322 if (ret) 3323 btrfs_err(fs_info, "could not do orphan cleanup %d", ret); 3324 btrfs_free_path(path); 3325 return ret; 3326 } 3327 3328 /* 3329 * very simple check to peek ahead in the leaf looking for xattrs. If we 3330 * don't find any xattrs, we know there can't be any acls. 3331 * 3332 * slot is the slot the inode is in, objectid is the objectid of the inode 3333 */ 3334 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 3335 int slot, u64 objectid, 3336 int *first_xattr_slot) 3337 { 3338 u32 nritems = btrfs_header_nritems(leaf); 3339 struct btrfs_key found_key; 3340 static u64 xattr_access = 0; 3341 static u64 xattr_default = 0; 3342 int scanned = 0; 3343 3344 if (!xattr_access) { 3345 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, 3346 strlen(XATTR_NAME_POSIX_ACL_ACCESS)); 3347 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, 3348 strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); 3349 } 3350 3351 slot++; 3352 *first_xattr_slot = -1; 3353 while (slot < nritems) { 3354 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3355 3356 /* we found a different objectid, there must not be acls */ 3357 if (found_key.objectid != objectid) 3358 return 0; 3359 3360 /* we found an xattr, assume we've got an acl */ 3361 if (found_key.type == BTRFS_XATTR_ITEM_KEY) { 3362 if (*first_xattr_slot == -1) 3363 *first_xattr_slot = slot; 3364 if (found_key.offset == xattr_access || 3365 found_key.offset == xattr_default) 3366 return 1; 3367 } 3368 3369 /* 3370 * we found a key greater than an xattr key, there can't 3371 * be any acls later on 3372 */ 3373 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 3374 return 0; 3375 3376 slot++; 3377 scanned++; 3378 3379 /* 3380 * it goes inode, inode backrefs, xattrs, extents, 3381 * so if there are a ton of hard links to an inode there can 3382 * be a lot of backrefs. 
Don't waste time searching too hard, 3383 * this is just an optimization 3384 */ 3385 if (scanned >= 8) 3386 break; 3387 } 3388 /* we hit the end of the leaf before we found an xattr or 3389 * something larger than an xattr. We have to assume the inode 3390 * has acls 3391 */ 3392 if (*first_xattr_slot == -1) 3393 *first_xattr_slot = slot; 3394 return 1; 3395 } 3396 3397 /* 3398 * read an inode from the btree into the in-memory inode 3399 */ 3400 static int btrfs_read_locked_inode(struct inode *inode, 3401 struct btrfs_path *in_path) 3402 { 3403 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3404 struct btrfs_path *path = in_path; 3405 struct extent_buffer *leaf; 3406 struct btrfs_inode_item *inode_item; 3407 struct btrfs_root *root = BTRFS_I(inode)->root; 3408 struct btrfs_key location; 3409 unsigned long ptr; 3410 int maybe_acls; 3411 u32 rdev; 3412 int ret; 3413 bool filled = false; 3414 int first_xattr_slot; 3415 3416 ret = btrfs_fill_inode(inode, &rdev); 3417 if (!ret) 3418 filled = true; 3419 3420 if (!path) { 3421 path = btrfs_alloc_path(); 3422 if (!path) 3423 return -ENOMEM; 3424 } 3425 3426 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 3427 3428 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 3429 if (ret) { 3430 if (path != in_path) 3431 btrfs_free_path(path); 3432 return ret; 3433 } 3434 3435 leaf = path->nodes[0]; 3436 3437 if (filled) 3438 goto cache_index; 3439 3440 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3441 struct btrfs_inode_item); 3442 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 3443 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 3444 i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); 3445 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); 3446 btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); 3447 btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, 3448 round_up(i_size_read(inode), fs_info->sectorsize)); 3449 3450 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); 3451 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); 3452 3453 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); 3454 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); 3455 3456 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); 3457 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); 3458 3459 BTRFS_I(inode)->i_otime.tv_sec = 3460 btrfs_timespec_sec(leaf, &inode_item->otime); 3461 BTRFS_I(inode)->i_otime.tv_nsec = 3462 btrfs_timespec_nsec(leaf, &inode_item->otime); 3463 3464 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 3465 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 3466 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); 3467 3468 inode_set_iversion_queried(inode, 3469 btrfs_inode_sequence(leaf, inode_item)); 3470 inode->i_generation = BTRFS_I(inode)->generation; 3471 inode->i_rdev = 0; 3472 rdev = btrfs_inode_rdev(leaf, inode_item); 3473 3474 BTRFS_I(inode)->index_cnt = (u64)-1; 3475 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 3476 3477 cache_index: 3478 /* 3479 * If we were modified in the current generation and evicted from memory 3480 * and then re-read we need to do a full sync since we don't have any 3481 * idea about which extents were modified before we were evicted from 3482 * cache. 3483 * 3484 * This is required for both inode re-read from disk and delayed inode 3485 * in delayed_nodes_tree. 
3486 */ 3487 if (BTRFS_I(inode)->last_trans == fs_info->generation) 3488 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3489 &BTRFS_I(inode)->runtime_flags); 3490 3491 /* 3492 * We don't persist the id of the transaction where an unlink operation 3493 * against the inode was last made. So here we assume the inode might 3494 * have been evicted, and therefore the exact value of last_unlink_trans 3495 * lost, and set it to last_trans to avoid metadata inconsistencies 3496 * between the inode and its parent if the inode is fsync'ed and the log 3497 * replayed. For example, in the scenario: 3498 * 3499 * touch mydir/foo 3500 * ln mydir/foo mydir/bar 3501 * sync 3502 * unlink mydir/bar 3503 * echo 2 > /proc/sys/vm/drop_caches # evicts inode 3504 * xfs_io -c fsync mydir/foo 3505 * <power failure> 3506 * mount fs, triggers fsync log replay 3507 * 3508 * We must make sure that when we fsync our inode foo we also log its 3509 * parent inode, otherwise after log replay the parent still has the 3510 * dentry with the "bar" name but our inode foo has a link count of 1 3511 * and doesn't have an inode ref with the name "bar" anymore. 3512 * 3513 * Setting last_unlink_trans to last_trans is a pessimistic approach, 3514 * but it guarantees correctness at the expense of occasional full 3515 * transaction commits on fsync if our inode is a directory, or if our 3516 * inode is not a directory, logging its parent unnecessarily. 3517 */ 3518 BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; 3519 3520 /* 3521 * Same logic as for last_unlink_trans. We don't persist the generation 3522 * of the last transaction where this inode was used for a reflink 3523 * operation, so after eviction and reloading the inode we must be 3524 * pessimistic and assume the last transaction that modified the inode. 
3525 */ 3526 BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; 3527 3528 path->slots[0]++; 3529 if (inode->i_nlink != 1 || 3530 path->slots[0] >= btrfs_header_nritems(leaf)) 3531 goto cache_acl; 3532 3533 btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); 3534 if (location.objectid != btrfs_ino(BTRFS_I(inode))) 3535 goto cache_acl; 3536 3537 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 3538 if (location.type == BTRFS_INODE_REF_KEY) { 3539 struct btrfs_inode_ref *ref; 3540 3541 ref = (struct btrfs_inode_ref *)ptr; 3542 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); 3543 } else if (location.type == BTRFS_INODE_EXTREF_KEY) { 3544 struct btrfs_inode_extref *extref; 3545 3546 extref = (struct btrfs_inode_extref *)ptr; 3547 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, 3548 extref); 3549 } 3550 cache_acl: 3551 /* 3552 * try to precache a NULL acl entry for files that don't have 3553 * any xattrs or acls 3554 */ 3555 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 3556 btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); 3557 if (first_xattr_slot != -1) { 3558 path->slots[0] = first_xattr_slot; 3559 ret = btrfs_load_inode_props(inode, path); 3560 if (ret) 3561 btrfs_err(fs_info, 3562 "error loading props for ino %llu (root %llu): %d", 3563 btrfs_ino(BTRFS_I(inode)), 3564 root->root_key.objectid, ret); 3565 } 3566 if (path != in_path) 3567 btrfs_free_path(path); 3568 3569 if (!maybe_acls) 3570 cache_no_acl(inode); 3571 3572 switch (inode->i_mode & S_IFMT) { 3573 case S_IFREG: 3574 inode->i_mapping->a_ops = &btrfs_aops; 3575 inode->i_fop = &btrfs_file_operations; 3576 inode->i_op = &btrfs_file_inode_operations; 3577 break; 3578 case S_IFDIR: 3579 inode->i_fop = &btrfs_dir_file_operations; 3580 inode->i_op = &btrfs_dir_inode_operations; 3581 break; 3582 case S_IFLNK: 3583 inode->i_op = &btrfs_symlink_inode_operations; 3584 inode_nohighmem(inode); 3585 inode->i_mapping->a_ops = &btrfs_aops; 3586 break; 3587 default: 3588 inode->i_op = &btrfs_special_inode_operations; 3589 init_special_inode(inode, inode->i_mode, rdev); 3590 break; 3591 } 3592 3593 btrfs_sync_inode_flags_to_i_flags(inode); 3594 return 0; 3595 } 3596 3597 /* 3598 * given a leaf and an inode, copy the inode fields into the leaf 3599 */ 3600 static void fill_inode_item(struct btrfs_trans_handle *trans, 3601 struct extent_buffer *leaf, 3602 struct btrfs_inode_item *item, 3603 struct inode *inode) 3604 { 3605 struct btrfs_map_token token; 3606 3607 btrfs_init_map_token(&token, leaf); 3608 3609 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); 3610 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); 3611 btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size); 3612 btrfs_set_token_inode_mode(&token, item, inode->i_mode); 3613 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); 3614 3615 btrfs_set_token_timespec_sec(&token, &item->atime, 3616 inode->i_atime.tv_sec); 3617 btrfs_set_token_timespec_nsec(&token, &item->atime, 3618 inode->i_atime.tv_nsec); 3619 3620 btrfs_set_token_timespec_sec(&token, &item->mtime, 3621 inode->i_mtime.tv_sec); 3622 btrfs_set_token_timespec_nsec(&token, &item->mtime, 3623 inode->i_mtime.tv_nsec); 3624 3625 btrfs_set_token_timespec_sec(&token, &item->ctime, 3626 inode->i_ctime.tv_sec); 3627 btrfs_set_token_timespec_nsec(&token, &item->ctime, 3628 inode->i_ctime.tv_nsec); 3629 3630 btrfs_set_token_timespec_sec(&token, &item->otime, 3631 BTRFS_I(inode)->i_otime.tv_sec); 3632 btrfs_set_token_timespec_nsec(&token, 
&item->otime, 3633 BTRFS_I(inode)->i_otime.tv_nsec); 3634 3635 btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); 3636 btrfs_set_token_inode_generation(&token, item, 3637 BTRFS_I(inode)->generation); 3638 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 3639 btrfs_set_token_inode_transid(&token, item, trans->transid); 3640 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 3641 btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); 3642 btrfs_set_token_inode_block_group(&token, item, 0); 3643 } 3644 3645 /* 3646 * copy everything in the in-memory inode into the btree. 3647 */ 3648 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, 3649 struct btrfs_root *root, 3650 struct btrfs_inode *inode) 3651 { 3652 struct btrfs_inode_item *inode_item; 3653 struct btrfs_path *path; 3654 struct extent_buffer *leaf; 3655 int ret; 3656 3657 path = btrfs_alloc_path(); 3658 if (!path) 3659 return -ENOMEM; 3660 3661 ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1); 3662 if (ret) { 3663 if (ret > 0) 3664 ret = -ENOENT; 3665 goto failed; 3666 } 3667 3668 leaf = path->nodes[0]; 3669 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3670 struct btrfs_inode_item); 3671 3672 fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); 3673 btrfs_mark_buffer_dirty(leaf); 3674 btrfs_set_inode_last_trans(trans, inode); 3675 ret = 0; 3676 failed: 3677 btrfs_free_path(path); 3678 return ret; 3679 } 3680 3681 /* 3682 * copy everything in the in-memory inode into the btree. 3683 */ 3684 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 3685 struct btrfs_root *root, 3686 struct btrfs_inode *inode) 3687 { 3688 struct btrfs_fs_info *fs_info = root->fs_info; 3689 int ret; 3690 3691 /* 3692 * If the inode is a free space inode, we can deadlock during commit 3693 * if we put it into the delayed code. 3694 * 3695 * The data relocation inode should also be directly updated 3696 * without delay 3697 */ 3698 if (!btrfs_is_free_space_inode(inode) 3699 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 3700 && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { 3701 btrfs_update_root_times(trans, root); 3702 3703 ret = btrfs_delayed_update_inode(trans, root, inode); 3704 if (!ret) 3705 btrfs_set_inode_last_trans(trans, inode); 3706 return ret; 3707 } 3708 3709 return btrfs_update_inode_item(trans, root, inode); 3710 } 3711 3712 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 3713 struct btrfs_root *root, struct btrfs_inode *inode) 3714 { 3715 int ret; 3716 3717 ret = btrfs_update_inode(trans, root, inode); 3718 if (ret == -ENOSPC) 3719 return btrfs_update_inode_item(trans, root, inode); 3720 return ret; 3721 } 3722 3723 /* 3724 * unlink helper that gets used here in inode.c and in the tree logging 3725 * recovery code. 
It removes a link in a directory with a given name, and 3726 * also drops the back refs in the inode to the directory 3727 */ 3728 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 3729 struct btrfs_root *root, 3730 struct btrfs_inode *dir, 3731 struct btrfs_inode *inode, 3732 const char *name, int name_len) 3733 { 3734 struct btrfs_fs_info *fs_info = root->fs_info; 3735 struct btrfs_path *path; 3736 int ret = 0; 3737 struct btrfs_dir_item *di; 3738 u64 index; 3739 u64 ino = btrfs_ino(inode); 3740 u64 dir_ino = btrfs_ino(dir); 3741 3742 path = btrfs_alloc_path(); 3743 if (!path) { 3744 ret = -ENOMEM; 3745 goto out; 3746 } 3747 3748 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 3749 name, name_len, -1); 3750 if (IS_ERR_OR_NULL(di)) { 3751 ret = di ? PTR_ERR(di) : -ENOENT; 3752 goto err; 3753 } 3754 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3755 if (ret) 3756 goto err; 3757 btrfs_release_path(path); 3758 3759 /* 3760 * If we don't have a dir index, we have to look it up through the 3761 * inode ref; since we then have the inode ref in hand, we remove it 3762 * directly and there is no need for a delayed deletion. 3763 * 3764 * But if we do have a dir index, there is no need to search for the 3765 * inode ref. Since the inode ref is close to the inode item, it is 3766 * better to delay its deletion and do it together with the inode 3767 * item update. 3768 */ 3769 if (inode->dir_index) { 3770 ret = btrfs_delayed_delete_inode_ref(inode); 3771 if (!ret) { 3772 index = inode->dir_index; 3773 goto skip_backref; 3774 } 3775 } 3776 3777 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, 3778 dir_ino, &index); 3779 if (ret) { 3780 btrfs_info(fs_info, 3781 "failed to delete reference to %.*s, inode %llu parent %llu", 3782 name_len, name, ino, dir_ino); 3783 btrfs_abort_transaction(trans, ret); 3784 goto err; 3785 } 3786 skip_backref: 3787 ret = btrfs_delete_delayed_dir_index(trans, dir, index); 3788 if (ret) { 3789 btrfs_abort_transaction(trans, ret); 3790 goto err; 3791 } 3792 3793 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, 3794 dir_ino); 3795 if (ret != 0 && ret != -ENOENT) { 3796 btrfs_abort_transaction(trans, ret); 3797 goto err; 3798 } 3799 3800 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, 3801 index); 3802 if (ret == -ENOENT) 3803 ret = 0; 3804 else if (ret) 3805 btrfs_abort_transaction(trans, ret); 3806 3807 /* 3808 * If we have a pending delayed iput we could end up with the final iput 3809 * being run in btrfs-cleaner context. If we have enough of these built 3810 * up we can end up burning a lot of time in btrfs-cleaner without any 3811 * way to throttle the unlinks. Since we're currently holding a ref on 3812 * the inode we can run the delayed iput here without any issues as the 3813 * final iput won't be done until after we drop the ref we're currently 3814 * holding.
3815 */ 3816 btrfs_run_delayed_iput(fs_info, inode); 3817 err: 3818 btrfs_free_path(path); 3819 if (ret) 3820 goto out; 3821 3822 btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); 3823 inode_inc_iversion(&inode->vfs_inode); 3824 inode_inc_iversion(&dir->vfs_inode); 3825 inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = 3826 dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); 3827 ret = btrfs_update_inode(trans, root, dir); 3828 out: 3829 return ret; 3830 } 3831 3832 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 3833 struct btrfs_root *root, 3834 struct btrfs_inode *dir, struct btrfs_inode *inode, 3835 const char *name, int name_len) 3836 { 3837 int ret; 3838 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 3839 if (!ret) { 3840 drop_nlink(&inode->vfs_inode); 3841 ret = btrfs_update_inode(trans, root, inode); 3842 } 3843 return ret; 3844 } 3845 3846 /* 3847 * helper to start transaction for unlink and rmdir. 3848 * 3849 * unlink and rmdir are special in btrfs, they do not always free space, so 3850 * if we cannot make our reservations the normal way try and see if there is 3851 * plenty of slack room in the global reserve to migrate, otherwise we cannot 3852 * allow the unlink to occur. 3853 */ 3854 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) 3855 { 3856 struct btrfs_root *root = BTRFS_I(dir)->root; 3857 3858 /* 3859 * 1 for the possible orphan item 3860 * 1 for the dir item 3861 * 1 for the dir index 3862 * 1 for the inode ref 3863 * 1 for the inode 3864 */ 3865 return btrfs_start_transaction_fallback_global_rsv(root, 5); 3866 } 3867 3868 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 3869 { 3870 struct btrfs_root *root = BTRFS_I(dir)->root; 3871 struct btrfs_trans_handle *trans; 3872 struct inode *inode = d_inode(dentry); 3873 int ret; 3874 3875 trans = __unlink_start_trans(dir); 3876 if (IS_ERR(trans)) 3877 return PTR_ERR(trans); 3878 3879 btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 3880 0); 3881 3882 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 3883 BTRFS_I(d_inode(dentry)), dentry->d_name.name, 3884 dentry->d_name.len); 3885 if (ret) 3886 goto out; 3887 3888 if (inode->i_nlink == 0) { 3889 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 3890 if (ret) 3891 goto out; 3892 } 3893 3894 out: 3895 btrfs_end_transaction(trans); 3896 btrfs_btree_balance_dirty(root->fs_info); 3897 return ret; 3898 } 3899 3900 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 3901 struct inode *dir, struct dentry *dentry) 3902 { 3903 struct btrfs_root *root = BTRFS_I(dir)->root; 3904 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); 3905 struct btrfs_path *path; 3906 struct extent_buffer *leaf; 3907 struct btrfs_dir_item *di; 3908 struct btrfs_key key; 3909 const char *name = dentry->d_name.name; 3910 int name_len = dentry->d_name.len; 3911 u64 index; 3912 int ret; 3913 u64 objectid; 3914 u64 dir_ino = btrfs_ino(BTRFS_I(dir)); 3915 3916 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { 3917 objectid = inode->root->root_key.objectid; 3918 } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { 3919 objectid = inode->location.objectid; 3920 } else { 3921 WARN_ON(1); 3922 return -EINVAL; 3923 } 3924 3925 path = btrfs_alloc_path(); 3926 if (!path) 3927 return -ENOMEM; 3928 3929 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 3930 name, name_len, -1); 3931 if (IS_ERR_OR_NULL(di)) { 3932 ret = di ? 
PTR_ERR(di) : -ENOENT; 3933 goto out; 3934 } 3935 3936 leaf = path->nodes[0]; 3937 btrfs_dir_item_key_to_cpu(leaf, di, &key); 3938 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 3939 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3940 if (ret) { 3941 btrfs_abort_transaction(trans, ret); 3942 goto out; 3943 } 3944 btrfs_release_path(path); 3945 3946 /* 3947 * This is a placeholder inode for a subvolume we didn't have a 3948 * reference to at the time of the snapshot creation. In the meantime 3949 * we could have renamed the real subvol link into our snapshot, so 3950 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. 3951 * Instead, simply look up the dir_index_item for this entry so we can 3952 * remove it. Otherwise we know we have a ref to the root and we can 3953 * call btrfs_del_root_ref, and it _shouldn't_ fail. 3954 */ 3955 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { 3956 di = btrfs_search_dir_index_item(root, path, dir_ino, 3957 name, name_len); 3958 if (IS_ERR_OR_NULL(di)) { 3959 if (!di) 3960 ret = -ENOENT; 3961 else 3962 ret = PTR_ERR(di); 3963 btrfs_abort_transaction(trans, ret); 3964 goto out; 3965 } 3966 3967 leaf = path->nodes[0]; 3968 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3969 index = key.offset; 3970 btrfs_release_path(path); 3971 } else { 3972 ret = btrfs_del_root_ref(trans, objectid, 3973 root->root_key.objectid, dir_ino, 3974 &index, name, name_len); 3975 if (ret) { 3976 btrfs_abort_transaction(trans, ret); 3977 goto out; 3978 } 3979 } 3980 3981 ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); 3982 if (ret) { 3983 btrfs_abort_transaction(trans, ret); 3984 goto out; 3985 } 3986 3987 btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); 3988 inode_inc_iversion(dir); 3989 dir->i_mtime = dir->i_ctime = current_time(dir); 3990 ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); 3991 if (ret) 3992 btrfs_abort_transaction(trans, ret); 3993 out: 3994 btrfs_free_path(path); 3995 return ret; 3996 } 3997 3998 /* 3999 * Helper to check if the subvolume references other subvolumes or if it's 4000 * the default subvolume.
4001 */ 4002 static noinline int may_destroy_subvol(struct btrfs_root *root) 4003 { 4004 struct btrfs_fs_info *fs_info = root->fs_info; 4005 struct btrfs_path *path; 4006 struct btrfs_dir_item *di; 4007 struct btrfs_key key; 4008 u64 dir_id; 4009 int ret; 4010 4011 path = btrfs_alloc_path(); 4012 if (!path) 4013 return -ENOMEM; 4014 4015 /* Make sure this root isn't set as the default subvol */ 4016 dir_id = btrfs_super_root_dir(fs_info->super_copy); 4017 di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, 4018 dir_id, "default", 7, 0); 4019 if (di && !IS_ERR(di)) { 4020 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 4021 if (key.objectid == root->root_key.objectid) { 4022 ret = -EPERM; 4023 btrfs_err(fs_info, 4024 "deleting default subvolume %llu is not allowed", 4025 key.objectid); 4026 goto out; 4027 } 4028 btrfs_release_path(path); 4029 } 4030 4031 key.objectid = root->root_key.objectid; 4032 key.type = BTRFS_ROOT_REF_KEY; 4033 key.offset = (u64)-1; 4034 4035 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4036 if (ret < 0) 4037 goto out; 4038 BUG_ON(ret == 0); 4039 4040 ret = 0; 4041 if (path->slots[0] > 0) { 4042 path->slots[0]--; 4043 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 4044 if (key.objectid == root->root_key.objectid && 4045 key.type == BTRFS_ROOT_REF_KEY) 4046 ret = -ENOTEMPTY; 4047 } 4048 out: 4049 btrfs_free_path(path); 4050 return ret; 4051 } 4052 4053 /* Delete all dentries for inodes belonging to the root */ 4054 static void btrfs_prune_dentries(struct btrfs_root *root) 4055 { 4056 struct btrfs_fs_info *fs_info = root->fs_info; 4057 struct rb_node *node; 4058 struct rb_node *prev; 4059 struct btrfs_inode *entry; 4060 struct inode *inode; 4061 u64 objectid = 0; 4062 4063 if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 4064 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4065 4066 spin_lock(&root->inode_lock); 4067 again: 4068 node = root->inode_tree.rb_node; 4069 prev = NULL; 4070 while (node) { 4071 prev = node; 4072 entry = rb_entry(node, struct btrfs_inode, rb_node); 4073 4074 if (objectid < btrfs_ino(entry)) 4075 node = node->rb_left; 4076 else if (objectid > btrfs_ino(entry)) 4077 node = node->rb_right; 4078 else 4079 break; 4080 } 4081 if (!node) { 4082 while (prev) { 4083 entry = rb_entry(prev, struct btrfs_inode, rb_node); 4084 if (objectid <= btrfs_ino(entry)) { 4085 node = prev; 4086 break; 4087 } 4088 prev = rb_next(prev); 4089 } 4090 } 4091 while (node) { 4092 entry = rb_entry(node, struct btrfs_inode, rb_node); 4093 objectid = btrfs_ino(entry) + 1; 4094 inode = igrab(&entry->vfs_inode); 4095 if (inode) { 4096 spin_unlock(&root->inode_lock); 4097 if (atomic_read(&inode->i_count) > 1) 4098 d_prune_aliases(inode); 4099 /* 4100 * btrfs_drop_inode will have it removed from the inode 4101 * cache when its usage count hits zero. 
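 * We only bother with d_prune_aliases() when i_count > 1, i.e. when
 * something else (typically cached dentries) still pins the inode;
 * pruning the unused aliases drops those references so the count can
 * actually reach zero.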
4102 */ 4103 iput(inode); 4104 cond_resched(); 4105 spin_lock(&root->inode_lock); 4106 goto again; 4107 } 4108 4109 if (cond_resched_lock(&root->inode_lock)) 4110 goto again; 4111 4112 node = rb_next(node); 4113 } 4114 spin_unlock(&root->inode_lock); 4115 } 4116 4117 int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) 4118 { 4119 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); 4120 struct btrfs_root *root = BTRFS_I(dir)->root; 4121 struct inode *inode = d_inode(dentry); 4122 struct btrfs_root *dest = BTRFS_I(inode)->root; 4123 struct btrfs_trans_handle *trans; 4124 struct btrfs_block_rsv block_rsv; 4125 u64 root_flags; 4126 int ret; 4127 4128 /* 4129 * Don't allow to delete a subvolume with send in progress. This is 4130 * inside the inode lock so the error handling that has to drop the bit 4131 * again is not run concurrently. 4132 */ 4133 spin_lock(&dest->root_item_lock); 4134 if (dest->send_in_progress) { 4135 spin_unlock(&dest->root_item_lock); 4136 btrfs_warn(fs_info, 4137 "attempt to delete subvolume %llu during send", 4138 dest->root_key.objectid); 4139 return -EPERM; 4140 } 4141 root_flags = btrfs_root_flags(&dest->root_item); 4142 btrfs_set_root_flags(&dest->root_item, 4143 root_flags | BTRFS_ROOT_SUBVOL_DEAD); 4144 spin_unlock(&dest->root_item_lock); 4145 4146 down_write(&fs_info->subvol_sem); 4147 4148 ret = may_destroy_subvol(dest); 4149 if (ret) 4150 goto out_up_write; 4151 4152 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 4153 /* 4154 * One for dir inode, 4155 * two for dir entries, 4156 * two for root ref/backref. 4157 */ 4158 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); 4159 if (ret) 4160 goto out_up_write; 4161 4162 trans = btrfs_start_transaction(root, 0); 4163 if (IS_ERR(trans)) { 4164 ret = PTR_ERR(trans); 4165 goto out_release; 4166 } 4167 trans->block_rsv = &block_rsv; 4168 trans->bytes_reserved = block_rsv.size; 4169 4170 btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); 4171 4172 ret = btrfs_unlink_subvol(trans, dir, dentry); 4173 if (ret) { 4174 btrfs_abort_transaction(trans, ret); 4175 goto out_end_trans; 4176 } 4177 4178 btrfs_record_root_in_trans(trans, dest); 4179 4180 memset(&dest->root_item.drop_progress, 0, 4181 sizeof(dest->root_item.drop_progress)); 4182 btrfs_set_root_drop_level(&dest->root_item, 0); 4183 btrfs_set_root_refs(&dest->root_item, 0); 4184 4185 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { 4186 ret = btrfs_insert_orphan_item(trans, 4187 fs_info->tree_root, 4188 dest->root_key.objectid); 4189 if (ret) { 4190 btrfs_abort_transaction(trans, ret); 4191 goto out_end_trans; 4192 } 4193 } 4194 4195 ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, 4196 BTRFS_UUID_KEY_SUBVOL, 4197 dest->root_key.objectid); 4198 if (ret && ret != -ENOENT) { 4199 btrfs_abort_transaction(trans, ret); 4200 goto out_end_trans; 4201 } 4202 if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { 4203 ret = btrfs_uuid_tree_remove(trans, 4204 dest->root_item.received_uuid, 4205 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4206 dest->root_key.objectid); 4207 if (ret && ret != -ENOENT) { 4208 btrfs_abort_transaction(trans, ret); 4209 goto out_end_trans; 4210 } 4211 } 4212 4213 free_anon_bdev(dest->anon_dev); 4214 dest->anon_dev = 0; 4215 out_end_trans: 4216 trans->block_rsv = NULL; 4217 trans->bytes_reserved = 0; 4218 ret = btrfs_end_transaction(trans); 4219 inode->i_flags |= S_DEAD; 4220 out_release: 4221 btrfs_subvolume_release_metadata(root, &block_rsv); 4222 out_up_write: 4223 
up_write(&fs_info->subvol_sem); 4224 if (ret) { 4225 spin_lock(&dest->root_item_lock); 4226 root_flags = btrfs_root_flags(&dest->root_item); 4227 btrfs_set_root_flags(&dest->root_item, 4228 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); 4229 spin_unlock(&dest->root_item_lock); 4230 } else { 4231 d_invalidate(dentry); 4232 btrfs_prune_dentries(dest); 4233 ASSERT(dest->send_in_progress == 0); 4234 } 4235 4236 return ret; 4237 } 4238 4239 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 4240 { 4241 struct inode *inode = d_inode(dentry); 4242 int err = 0; 4243 struct btrfs_root *root = BTRFS_I(dir)->root; 4244 struct btrfs_trans_handle *trans; 4245 u64 last_unlink_trans; 4246 4247 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 4248 return -ENOTEMPTY; 4249 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) 4250 return btrfs_delete_subvolume(dir, dentry); 4251 4252 trans = __unlink_start_trans(dir); 4253 if (IS_ERR(trans)) 4254 return PTR_ERR(trans); 4255 4256 if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 4257 err = btrfs_unlink_subvol(trans, dir, dentry); 4258 goto out; 4259 } 4260 4261 err = btrfs_orphan_add(trans, BTRFS_I(inode)); 4262 if (err) 4263 goto out; 4264 4265 last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; 4266 4267 /* now the directory is empty */ 4268 err = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 4269 BTRFS_I(d_inode(dentry)), dentry->d_name.name, 4270 dentry->d_name.len); 4271 if (!err) { 4272 btrfs_i_size_write(BTRFS_I(inode), 0); 4273 /* 4274 * Propagate the last_unlink_trans value of the deleted dir to 4275 * its parent directory. This is to prevent an unrecoverable 4276 * log tree in the case we do something like this: 4277 * 1) create dir foo 4278 * 2) create snapshot under dir foo 4279 * 3) delete the snapshot 4280 * 4) rmdir foo 4281 * 5) mkdir foo 4282 * 6) fsync foo or some file inside foo 4283 */ 4284 if (last_unlink_trans >= trans->transid) 4285 BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; 4286 } 4287 out: 4288 btrfs_end_transaction(trans); 4289 btrfs_btree_balance_dirty(root->fs_info); 4290 4291 return err; 4292 } 4293 4294 /* 4295 * Return this if we need to call truncate_block for the last bit of the 4296 * truncate. 4297 */ 4298 #define NEED_TRUNCATE_BLOCK 1 4299 4300 /* 4301 * this can truncate away extent items, csum items and directory items. 4302 * It starts at a high offset and removes keys until it can't find 4303 * any higher than new_size 4304 * 4305 * csum items that cross the new i_size are truncated to the new size 4306 * as well. 4307 * 4308 * min_type is the minimum key type to truncate down to. If set to 0, this 4309 * will kill all the items on this inode, including the INODE_ITEM_KEY. 
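 * For example, inode eviction calls this with new_size == 0 and
 * min_type == 0 to drop every item the inode owns, while a regular
 * truncate passes BTRFS_EXTENT_DATA_KEY so the inode item and xattrs
 * are preserved.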
4310 */ 4311 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 4312 struct btrfs_root *root, 4313 struct btrfs_inode *inode, 4314 u64 new_size, u32 min_type) 4315 { 4316 struct btrfs_fs_info *fs_info = root->fs_info; 4317 struct btrfs_path *path; 4318 struct extent_buffer *leaf; 4319 struct btrfs_file_extent_item *fi; 4320 struct btrfs_key key; 4321 struct btrfs_key found_key; 4322 u64 extent_start = 0; 4323 u64 extent_num_bytes = 0; 4324 u64 extent_offset = 0; 4325 u64 item_end = 0; 4326 u64 last_size = new_size; 4327 u32 found_type = (u8)-1; 4328 int found_extent; 4329 int del_item; 4330 int pending_del_nr = 0; 4331 int pending_del_slot = 0; 4332 int extent_type = -1; 4333 int ret; 4334 u64 ino = btrfs_ino(inode); 4335 u64 bytes_deleted = 0; 4336 bool be_nice = false; 4337 bool should_throttle = false; 4338 const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); 4339 struct extent_state *cached_state = NULL; 4340 4341 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 4342 4343 /* 4344 * For non-free space inodes and non-shareable roots, we want to back 4345 * off from time to time. This means all inodes in subvolume roots, 4346 * reloc roots, and data reloc roots. 4347 */ 4348 if (!btrfs_is_free_space_inode(inode) && 4349 test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 4350 be_nice = true; 4351 4352 path = btrfs_alloc_path(); 4353 if (!path) 4354 return -ENOMEM; 4355 path->reada = READA_BACK; 4356 4357 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4358 lock_extent_bits(&inode->io_tree, lock_start, (u64)-1, 4359 &cached_state); 4360 4361 /* 4362 * We want to drop from the next block forward in case this 4363 * new size is not block aligned since we will be keeping the 4364 * last block of the extent just the way it is. 4365 */ 4366 btrfs_drop_extent_cache(inode, ALIGN(new_size, 4367 fs_info->sectorsize), 4368 (u64)-1, 0); 4369 } 4370 4371 /* 4372 * This function is also used to drop the items in the log tree before 4373 * we relog the inode, so if root != BTRFS_I(inode)->root, it means 4374 * it is used to drop the logged items. So we shouldn't kill the delayed 4375 * items. 4376 */ 4377 if (min_type == 0 && root == inode->root) 4378 btrfs_kill_delayed_inode_items(inode); 4379 4380 key.objectid = ino; 4381 key.offset = (u64)-1; 4382 key.type = (u8)-1; 4383 4384 search_again: 4385 /* 4386 * with a 16K leaf size and 128MB extents, you can actually queue 4387 * up a huge file in a single leaf. 
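 * (Very roughly: a 16K leaf can hold on the order of a couple hundred
 * regular file extent items, so with 128MB extents a single leaf may
 * reference tens of gigabytes of file data.)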
Most of the time that 4388 * bytes_deleted is > 0, it will be huge by the time we get here 4389 */ 4390 if (be_nice && bytes_deleted > SZ_32M && 4391 btrfs_should_end_transaction(trans)) { 4392 ret = -EAGAIN; 4393 goto out; 4394 } 4395 4396 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 4397 if (ret < 0) 4398 goto out; 4399 4400 if (ret > 0) { 4401 ret = 0; 4402 /* there are no items in the tree for us to truncate, we're 4403 * done 4404 */ 4405 if (path->slots[0] == 0) 4406 goto out; 4407 path->slots[0]--; 4408 } 4409 4410 while (1) { 4411 u64 clear_start = 0, clear_len = 0; 4412 4413 fi = NULL; 4414 leaf = path->nodes[0]; 4415 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4416 found_type = found_key.type; 4417 4418 if (found_key.objectid != ino) 4419 break; 4420 4421 if (found_type < min_type) 4422 break; 4423 4424 item_end = found_key.offset; 4425 if (found_type == BTRFS_EXTENT_DATA_KEY) { 4426 fi = btrfs_item_ptr(leaf, path->slots[0], 4427 struct btrfs_file_extent_item); 4428 extent_type = btrfs_file_extent_type(leaf, fi); 4429 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4430 item_end += 4431 btrfs_file_extent_num_bytes(leaf, fi); 4432 4433 trace_btrfs_truncate_show_fi_regular( 4434 inode, leaf, fi, found_key.offset); 4435 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 4436 item_end += btrfs_file_extent_ram_bytes(leaf, 4437 fi); 4438 4439 trace_btrfs_truncate_show_fi_inline( 4440 inode, leaf, fi, path->slots[0], 4441 found_key.offset); 4442 } 4443 item_end--; 4444 } 4445 if (found_type > min_type) { 4446 del_item = 1; 4447 } else { 4448 if (item_end < new_size) 4449 break; 4450 if (found_key.offset >= new_size) 4451 del_item = 1; 4452 else 4453 del_item = 0; 4454 } 4455 found_extent = 0; 4456 /* FIXME, shrink the extent if the ref count is only 1 */ 4457 if (found_type != BTRFS_EXTENT_DATA_KEY) 4458 goto delete; 4459 4460 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4461 u64 num_dec; 4462 4463 clear_start = found_key.offset; 4464 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 4465 if (!del_item) { 4466 u64 orig_num_bytes = 4467 btrfs_file_extent_num_bytes(leaf, fi); 4468 extent_num_bytes = ALIGN(new_size - 4469 found_key.offset, 4470 fs_info->sectorsize); 4471 clear_start = ALIGN(new_size, fs_info->sectorsize); 4472 btrfs_set_file_extent_num_bytes(leaf, fi, 4473 extent_num_bytes); 4474 num_dec = (orig_num_bytes - 4475 extent_num_bytes); 4476 if (test_bit(BTRFS_ROOT_SHAREABLE, 4477 &root->state) && 4478 extent_start != 0) 4479 inode_sub_bytes(&inode->vfs_inode, 4480 num_dec); 4481 btrfs_mark_buffer_dirty(leaf); 4482 } else { 4483 extent_num_bytes = 4484 btrfs_file_extent_disk_num_bytes(leaf, 4485 fi); 4486 extent_offset = found_key.offset - 4487 btrfs_file_extent_offset(leaf, fi); 4488 4489 /* FIXME blocksize != 4096 */ 4490 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 4491 if (extent_start != 0) { 4492 found_extent = 1; 4493 if (test_bit(BTRFS_ROOT_SHAREABLE, 4494 &root->state)) 4495 inode_sub_bytes(&inode->vfs_inode, 4496 num_dec); 4497 } 4498 } 4499 clear_len = num_dec; 4500 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 4501 /* 4502 * we can't truncate inline items that have had 4503 * special encodings 4504 */ 4505 if (!del_item && 4506 btrfs_file_extent_encryption(leaf, fi) == 0 && 4507 btrfs_file_extent_other_encoding(leaf, fi) == 0 && 4508 btrfs_file_extent_compression(leaf, fi) == 0) { 4509 u32 size = (u32)(new_size - found_key.offset); 4510 4511 btrfs_set_file_extent_ram_bytes(leaf, fi, size); 4512 size = 
btrfs_file_extent_calc_inline_size(size); 4513 btrfs_truncate_item(path, size, 1); 4514 } else if (!del_item) { 4515 /* 4516 * We have to bail so the last_size is set to 4517 * just before this extent. 4518 */ 4519 ret = NEED_TRUNCATE_BLOCK; 4520 break; 4521 } else { 4522 /* 4523 * Inline extents are special, we just treat 4524 * them as a full sector worth in the file 4525 * extent tree just for simplicity's sake. 4526 */ 4527 clear_len = fs_info->sectorsize; 4528 } 4529 4530 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 4531 inode_sub_bytes(&inode->vfs_inode, 4532 item_end + 1 - new_size); 4533 } 4534 delete: 4535 /* 4536 * We use btrfs_truncate_inode_items() to clean up log trees for 4537 * multiple fsyncs, and in this case we don't want to clear the 4538 * file extent range because it's just the log. 4539 */ 4540 if (root == inode->root) { 4541 ret = btrfs_inode_clear_file_extent_range(inode, 4542 clear_start, clear_len); 4543 if (ret) { 4544 btrfs_abort_transaction(trans, ret); 4545 break; 4546 } 4547 } 4548 4549 if (del_item) 4550 last_size = found_key.offset; 4551 else 4552 last_size = new_size; 4553 if (del_item) { 4554 if (!pending_del_nr) { 4555 /* no pending yet, add ourselves */ 4556 pending_del_slot = path->slots[0]; 4557 pending_del_nr = 1; 4558 } else if (pending_del_nr && 4559 path->slots[0] + 1 == pending_del_slot) { 4560 /* hop on the pending chunk */ 4561 pending_del_nr++; 4562 pending_del_slot = path->slots[0]; 4563 } else { 4564 BUG(); 4565 } 4566 } else { 4567 break; 4568 } 4569 should_throttle = false; 4570 4571 if (found_extent && 4572 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4573 struct btrfs_ref ref = { 0 }; 4574 4575 bytes_deleted += extent_num_bytes; 4576 4577 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, 4578 extent_start, extent_num_bytes, 0); 4579 ref.real_root = root->root_key.objectid; 4580 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), 4581 ino, extent_offset); 4582 ret = btrfs_free_extent(trans, &ref); 4583 if (ret) { 4584 btrfs_abort_transaction(trans, ret); 4585 break; 4586 } 4587 if (be_nice) { 4588 if (btrfs_should_throttle_delayed_refs(trans)) 4589 should_throttle = true; 4590 } 4591 } 4592 4593 if (found_type == BTRFS_INODE_ITEM_KEY) 4594 break; 4595 4596 if (path->slots[0] == 0 || 4597 path->slots[0] != pending_del_slot || 4598 should_throttle) { 4599 if (pending_del_nr) { 4600 ret = btrfs_del_items(trans, root, path, 4601 pending_del_slot, 4602 pending_del_nr); 4603 if (ret) { 4604 btrfs_abort_transaction(trans, ret); 4605 break; 4606 } 4607 pending_del_nr = 0; 4608 } 4609 btrfs_release_path(path); 4610 4611 /* 4612 * We can generate a lot of delayed refs, so we need to 4613 * throttle every once in a while and make sure we're 4614 * adding enough space to keep up with the work we are 4615 * generating. Since we hold a transaction here we 4616 * can't flush, and we don't want to FLUSH_LIMIT because 4617 * we could have generated too many delayed refs to 4618 * actually allocate, so just bail if we're short and 4619 * let the normal reservation dance happen higher up.
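 * If the refill below fails we bail out with -EAGAIN; callers such as
 * inode eviction then end the transaction and retry the truncate with a
 * fresh one.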
4620 */ 4621 if (should_throttle) { 4622 ret = btrfs_delayed_refs_rsv_refill(fs_info, 4623 BTRFS_RESERVE_NO_FLUSH); 4624 if (ret) { 4625 ret = -EAGAIN; 4626 break; 4627 } 4628 } 4629 goto search_again; 4630 } else { 4631 path->slots[0]--; 4632 } 4633 } 4634 out: 4635 if (ret >= 0 && pending_del_nr) { 4636 int err; 4637 4638 err = btrfs_del_items(trans, root, path, pending_del_slot, 4639 pending_del_nr); 4640 if (err) { 4641 btrfs_abort_transaction(trans, err); 4642 ret = err; 4643 } 4644 } 4645 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4646 ASSERT(last_size >= new_size); 4647 if (!ret && last_size > new_size) 4648 last_size = new_size; 4649 btrfs_inode_safe_disk_i_size_write(inode, last_size); 4650 unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1, 4651 &cached_state); 4652 } 4653 4654 btrfs_free_path(path); 4655 return ret; 4656 } 4657 4658 /* 4659 * btrfs_truncate_block - read, zero a chunk and write a block 4660 * @inode - inode that we're zeroing 4661 * @from - the offset to start zeroing 4662 * @len - the length to zero, 0 to zero the entire range respective to the 4663 * offset 4664 * @front - zero up to the offset instead of from the offset on 4665 * 4666 * This will find the block for the "from" offset and cow the block and zero the 4667 * part we want to zero. This is used with truncate and hole punching. 4668 */ 4669 int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, 4670 int front) 4671 { 4672 struct btrfs_fs_info *fs_info = inode->root->fs_info; 4673 struct address_space *mapping = inode->vfs_inode.i_mapping; 4674 struct extent_io_tree *io_tree = &inode->io_tree; 4675 struct btrfs_ordered_extent *ordered; 4676 struct extent_state *cached_state = NULL; 4677 struct extent_changeset *data_reserved = NULL; 4678 char *kaddr; 4679 bool only_release_metadata = false; 4680 u32 blocksize = fs_info->sectorsize; 4681 pgoff_t index = from >> PAGE_SHIFT; 4682 unsigned offset = from & (blocksize - 1); 4683 struct page *page; 4684 gfp_t mask = btrfs_alloc_write_mask(mapping); 4685 size_t write_bytes = blocksize; 4686 int ret = 0; 4687 u64 block_start; 4688 u64 block_end; 4689 4690 if (IS_ALIGNED(offset, blocksize) && 4691 (!len || IS_ALIGNED(len, blocksize))) 4692 goto out; 4693 4694 block_start = round_down(from, blocksize); 4695 block_end = block_start + blocksize - 1; 4696 4697 ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, 4698 blocksize); 4699 if (ret < 0) { 4700 if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) { 4701 /* For nocow case, no need to reserve data space */ 4702 only_release_metadata = true; 4703 } else { 4704 goto out; 4705 } 4706 } 4707 ret = btrfs_delalloc_reserve_metadata(inode, blocksize); 4708 if (ret < 0) { 4709 if (!only_release_metadata) 4710 btrfs_free_reserved_data_space(inode, data_reserved, 4711 block_start, blocksize); 4712 goto out; 4713 } 4714 again: 4715 page = find_or_create_page(mapping, index, mask); 4716 if (!page) { 4717 btrfs_delalloc_release_space(inode, data_reserved, block_start, 4718 blocksize, true); 4719 btrfs_delalloc_release_extents(inode, blocksize); 4720 ret = -ENOMEM; 4721 goto out; 4722 } 4723 4724 if (!PageUptodate(page)) { 4725 ret = btrfs_readpage(NULL, page); 4726 lock_page(page); 4727 if (page->mapping != mapping) { 4728 unlock_page(page); 4729 put_page(page); 4730 goto again; 4731 } 4732 if (!PageUptodate(page)) { 4733 ret = -EIO; 4734 goto out_unlock; 4735 } 4736 } 4737 wait_on_page_writeback(page); 4738 4739 lock_extent_bits(io_tree, 
block_start, block_end, &cached_state); 4740 set_page_extent_mapped(page); 4741 4742 ordered = btrfs_lookup_ordered_extent(inode, block_start); 4743 if (ordered) { 4744 unlock_extent_cached(io_tree, block_start, block_end, 4745 &cached_state); 4746 unlock_page(page); 4747 put_page(page); 4748 btrfs_start_ordered_extent(ordered, 1); 4749 btrfs_put_ordered_extent(ordered); 4750 goto again; 4751 } 4752 4753 clear_extent_bit(&inode->io_tree, block_start, block_end, 4754 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 4755 0, 0, &cached_state); 4756 4757 ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, 4758 &cached_state); 4759 if (ret) { 4760 unlock_extent_cached(io_tree, block_start, block_end, 4761 &cached_state); 4762 goto out_unlock; 4763 } 4764 4765 if (offset != blocksize) { 4766 if (!len) 4767 len = blocksize - offset; 4768 kaddr = kmap(page); 4769 if (front) 4770 memset(kaddr + (block_start - page_offset(page)), 4771 0, offset); 4772 else 4773 memset(kaddr + (block_start - page_offset(page)) + offset, 4774 0, len); 4775 flush_dcache_page(page); 4776 kunmap(page); 4777 } 4778 ClearPageChecked(page); 4779 set_page_dirty(page); 4780 unlock_extent_cached(io_tree, block_start, block_end, &cached_state); 4781 4782 if (only_release_metadata) 4783 set_extent_bit(&inode->io_tree, block_start, block_end, 4784 EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL); 4785 4786 out_unlock: 4787 if (ret) { 4788 if (only_release_metadata) 4789 btrfs_delalloc_release_metadata(inode, blocksize, true); 4790 else 4791 btrfs_delalloc_release_space(inode, data_reserved, 4792 block_start, blocksize, true); 4793 } 4794 btrfs_delalloc_release_extents(inode, blocksize); 4795 unlock_page(page); 4796 put_page(page); 4797 out: 4798 if (only_release_metadata) 4799 btrfs_check_nocow_unlock(inode); 4800 extent_changeset_free(data_reserved); 4801 return ret; 4802 } 4803 4804 static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, 4805 u64 offset, u64 len) 4806 { 4807 struct btrfs_fs_info *fs_info = root->fs_info; 4808 struct btrfs_trans_handle *trans; 4809 struct btrfs_drop_extents_args drop_args = { 0 }; 4810 int ret; 4811 4812 /* 4813 * Still need to make sure the inode looks like it's been updated so 4814 * that any holes get logged if we fsync. 4815 */ 4816 if (btrfs_fs_incompat(fs_info, NO_HOLES)) { 4817 inode->last_trans = fs_info->generation; 4818 inode->last_sub_trans = root->log_transid; 4819 inode->last_log_commit = root->last_log_commit; 4820 return 0; 4821 } 4822 4823 /* 4824 * 1 - for the one we're dropping 4825 * 1 - for the one we're adding 4826 * 1 - for updating the inode. 4827 */ 4828 trans = btrfs_start_transaction(root, 3); 4829 if (IS_ERR(trans)) 4830 return PTR_ERR(trans); 4831 4832 drop_args.start = offset; 4833 drop_args.end = offset + len; 4834 drop_args.drop_cache = true; 4835 4836 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 4837 if (ret) { 4838 btrfs_abort_transaction(trans, ret); 4839 btrfs_end_transaction(trans); 4840 return ret; 4841 } 4842 4843 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), 4844 offset, 0, 0, len, 0, len, 0, 0, 0); 4845 if (ret) { 4846 btrfs_abort_transaction(trans, ret); 4847 } else { 4848 btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); 4849 btrfs_update_inode(trans, root, inode); 4850 } 4851 btrfs_end_transaction(trans); 4852 return ret; 4853 } 4854 4855 /* 4856 * This function puts in dummy file extents for the area we're creating a hole 4857 * for. 
So if we are truncating this file to a larger size we need to insert 4858 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for 4859 * the range between oldsize and size 4860 */ 4861 int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) 4862 { 4863 struct btrfs_root *root = inode->root; 4864 struct btrfs_fs_info *fs_info = root->fs_info; 4865 struct extent_io_tree *io_tree = &inode->io_tree; 4866 struct extent_map *em = NULL; 4867 struct extent_state *cached_state = NULL; 4868 struct extent_map_tree *em_tree = &inode->extent_tree; 4869 u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); 4870 u64 block_end = ALIGN(size, fs_info->sectorsize); 4871 u64 last_byte; 4872 u64 cur_offset; 4873 u64 hole_size; 4874 int err = 0; 4875 4876 /* 4877 * If our size started in the middle of a block we need to zero out the 4878 * rest of the block before we expand the i_size, otherwise we could 4879 * expose stale data. 4880 */ 4881 err = btrfs_truncate_block(inode, oldsize, 0, 0); 4882 if (err) 4883 return err; 4884 4885 if (size <= hole_start) 4886 return 0; 4887 4888 btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1, 4889 &cached_state); 4890 cur_offset = hole_start; 4891 while (1) { 4892 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 4893 block_end - cur_offset); 4894 if (IS_ERR(em)) { 4895 err = PTR_ERR(em); 4896 em = NULL; 4897 break; 4898 } 4899 last_byte = min(extent_map_end(em), block_end); 4900 last_byte = ALIGN(last_byte, fs_info->sectorsize); 4901 hole_size = last_byte - cur_offset; 4902 4903 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 4904 struct extent_map *hole_em; 4905 4906 err = maybe_insert_hole(root, inode, cur_offset, 4907 hole_size); 4908 if (err) 4909 break; 4910 4911 err = btrfs_inode_set_file_extent_range(inode, 4912 cur_offset, hole_size); 4913 if (err) 4914 break; 4915 4916 btrfs_drop_extent_cache(inode, cur_offset, 4917 cur_offset + hole_size - 1, 0); 4918 hole_em = alloc_extent_map(); 4919 if (!hole_em) { 4920 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4921 &inode->runtime_flags); 4922 goto next; 4923 } 4924 hole_em->start = cur_offset; 4925 hole_em->len = hole_size; 4926 hole_em->orig_start = cur_offset; 4927 4928 hole_em->block_start = EXTENT_MAP_HOLE; 4929 hole_em->block_len = 0; 4930 hole_em->orig_block_len = 0; 4931 hole_em->ram_bytes = hole_size; 4932 hole_em->compress_type = BTRFS_COMPRESS_NONE; 4933 hole_em->generation = fs_info->generation; 4934 4935 while (1) { 4936 write_lock(&em_tree->lock); 4937 err = add_extent_mapping(em_tree, hole_em, 1); 4938 write_unlock(&em_tree->lock); 4939 if (err != -EEXIST) 4940 break; 4941 btrfs_drop_extent_cache(inode, cur_offset, 4942 cur_offset + 4943 hole_size - 1, 0); 4944 } 4945 free_extent_map(hole_em); 4946 } else { 4947 err = btrfs_inode_set_file_extent_range(inode, 4948 cur_offset, hole_size); 4949 if (err) 4950 break; 4951 } 4952 next: 4953 free_extent_map(em); 4954 em = NULL; 4955 cur_offset = last_byte; 4956 if (cur_offset >= block_end) 4957 break; 4958 } 4959 free_extent_map(em); 4960 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state); 4961 return err; 4962 } 4963 4964 static int btrfs_setsize(struct inode *inode, struct iattr *attr) 4965 { 4966 struct btrfs_root *root = BTRFS_I(inode)->root; 4967 struct btrfs_trans_handle *trans; 4968 loff_t oldsize = i_size_read(inode); 4969 loff_t newsize = attr->ia_size; 4970 int mask = attr->ia_valid; 4971 int ret; 4972 4973 /* 4974 * The regular truncate() case without ATTR_CTIME and 
ATTR_MTIME is a 4975 * special case where we need to update the times despite not having 4976 * these flags set. For all other operations the VFS set these flags 4977 * explicitly if it wants a timestamp update. 4978 */ 4979 if (newsize != oldsize) { 4980 inode_inc_iversion(inode); 4981 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) 4982 inode->i_ctime = inode->i_mtime = 4983 current_time(inode); 4984 } 4985 4986 if (newsize > oldsize) { 4987 /* 4988 * Don't do an expanding truncate while snapshotting is ongoing. 4989 * This is to ensure the snapshot captures a fully consistent 4990 * state of this file - if the snapshot captures this expanding 4991 * truncation, it must capture all writes that happened before 4992 * this truncation. 4993 */ 4994 btrfs_drew_write_lock(&root->snapshot_lock); 4995 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize); 4996 if (ret) { 4997 btrfs_drew_write_unlock(&root->snapshot_lock); 4998 return ret; 4999 } 5000 5001 trans = btrfs_start_transaction(root, 1); 5002 if (IS_ERR(trans)) { 5003 btrfs_drew_write_unlock(&root->snapshot_lock); 5004 return PTR_ERR(trans); 5005 } 5006 5007 i_size_write(inode, newsize); 5008 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 5009 pagecache_isize_extended(inode, oldsize, newsize); 5010 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 5011 btrfs_drew_write_unlock(&root->snapshot_lock); 5012 btrfs_end_transaction(trans); 5013 } else { 5014 5015 /* 5016 * We're truncating a file that used to have good data down to 5017 * zero. Make sure any new writes to the file get on disk 5018 * on close. 5019 */ 5020 if (newsize == 0) 5021 set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, 5022 &BTRFS_I(inode)->runtime_flags); 5023 5024 truncate_setsize(inode, newsize); 5025 5026 inode_dio_wait(inode); 5027 5028 ret = btrfs_truncate(inode, newsize == oldsize); 5029 if (ret && inode->i_nlink) { 5030 int err; 5031 5032 /* 5033 * Truncate failed, so fix up the in-memory size. We 5034 * adjusted disk_i_size down as we removed extents, so 5035 * wait for disk_i_size to be stable and then update the 5036 * in-memory size to match. 5037 */ 5038 err = btrfs_wait_ordered_range(inode, 0, (u64)-1); 5039 if (err) 5040 return err; 5041 i_size_write(inode, BTRFS_I(inode)->disk_i_size); 5042 } 5043 } 5044 5045 return ret; 5046 } 5047 5048 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 5049 { 5050 struct inode *inode = d_inode(dentry); 5051 struct btrfs_root *root = BTRFS_I(inode)->root; 5052 int err; 5053 5054 if (btrfs_root_readonly(root)) 5055 return -EROFS; 5056 5057 err = setattr_prepare(dentry, attr); 5058 if (err) 5059 return err; 5060 5061 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 5062 err = btrfs_setsize(inode, attr); 5063 if (err) 5064 return err; 5065 } 5066 5067 if (attr->ia_valid) { 5068 setattr_copy(inode, attr); 5069 inode_inc_iversion(inode); 5070 err = btrfs_dirty_inode(inode); 5071 5072 if (!err && attr->ia_valid & ATTR_MODE) 5073 err = posix_acl_chmod(inode, inode->i_mode); 5074 } 5075 5076 return err; 5077 } 5078 5079 /* 5080 * While truncating the inode pages during eviction, we get the VFS calling 5081 * btrfs_invalidatepage() against each page of the inode. This is slow because 5082 * the calls to btrfs_invalidatepage() result in a huge amount of calls to 5083 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting 5084 * extent_state structures over and over, wasting lots of time. 
5085 * 5086 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all 5087 * those expensive operations on a per page basis and do only the ordered io 5088 * finishing, while we release here the extent_map and extent_state structures, 5089 * without the excessive merging and splitting. 5090 */ 5091 static void evict_inode_truncate_pages(struct inode *inode) 5092 { 5093 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5094 struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; 5095 struct rb_node *node; 5096 5097 ASSERT(inode->i_state & I_FREEING); 5098 truncate_inode_pages_final(&inode->i_data); 5099 5100 write_lock(&map_tree->lock); 5101 while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) { 5102 struct extent_map *em; 5103 5104 node = rb_first_cached(&map_tree->map); 5105 em = rb_entry(node, struct extent_map, rb_node); 5106 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 5107 clear_bit(EXTENT_FLAG_LOGGING, &em->flags); 5108 remove_extent_mapping(map_tree, em); 5109 free_extent_map(em); 5110 if (need_resched()) { 5111 write_unlock(&map_tree->lock); 5112 cond_resched(); 5113 write_lock(&map_tree->lock); 5114 } 5115 } 5116 write_unlock(&map_tree->lock); 5117 5118 /* 5119 * Keep looping until we have no more ranges in the io tree. 5120 * We can have ongoing bios started by readahead whose endio 5121 * callback (extent_io.c:end_bio_extent_readpage) is still in 5122 * progress (it has unlocked the pages in the bio but did not yet 5123 * unlock the ranges in the io tree). This means some ranges can 5124 * still be locked while eviction has started, because before 5125 * submitting those bios, which are executed by a separate task (work 5126 * queue kthread), no inode reference (inode->i_count) was taken 5127 * (one would otherwise be dropped in the end io callback of each bio). 5128 * Therefore here we effectively end up waiting for those bios and 5129 * anyone else holding locked ranges without having bumped the inode's 5130 * reference count - if we don't wait, when they access the inode's 5131 * io_tree to unlock a range it may be too late, leading to a 5132 * use-after-free issue. 5133 */ 5134 spin_lock(&io_tree->lock); 5135 while (!RB_EMPTY_ROOT(&io_tree->state)) { 5136 struct extent_state *state; 5137 struct extent_state *cached_state = NULL; 5138 u64 start; 5139 u64 end; 5140 unsigned state_flags; 5141 5142 node = rb_first(&io_tree->state); 5143 state = rb_entry(node, struct extent_state, rb_node); 5144 start = state->start; 5145 end = state->end; 5146 state_flags = state->state; 5147 spin_unlock(&io_tree->lock); 5148 5149 lock_extent_bits(io_tree, start, end, &cached_state); 5150 5151 /* 5152 * If the range still has the DELALLOC flag, the extent never 5153 * reached disk and its reserved space won't be freed by a 5154 * delayed ref. So we need to free its reserved space here. 5155 * (Refer to the comment in btrfs_invalidatepage, case 2.) 5156 * 5157 * Note: end is the bytenr of the last byte, so we need + 1 here.
5158 */ 5159 if (state_flags & EXTENT_DELALLOC) 5160 btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, 5161 end - start + 1); 5162 5163 clear_extent_bit(io_tree, start, end, 5164 EXTENT_LOCKED | EXTENT_DELALLOC | 5165 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, 5166 &cached_state); 5167 5168 cond_resched(); 5169 spin_lock(&io_tree->lock); 5170 } 5171 spin_unlock(&io_tree->lock); 5172 } 5173 5174 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, 5175 struct btrfs_block_rsv *rsv) 5176 { 5177 struct btrfs_fs_info *fs_info = root->fs_info; 5178 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5179 struct btrfs_trans_handle *trans; 5180 u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); 5181 int ret; 5182 5183 /* 5184 * Eviction should be taking place at some place safe because of our 5185 * delayed iputs. However the normal flushing code will run delayed 5186 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock. 5187 * 5188 * We reserve the delayed_refs_extra here again because we can't use 5189 * btrfs_start_transaction(root, 0) for the same deadlocky reason as 5190 * above. We reserve our extra bit here because we generate a ton of 5191 * delayed refs activity by truncating. 5192 * 5193 * If we cannot make our reservation we'll attempt to steal from the 5194 * global reserve, because we really want to be able to free up space. 5195 */ 5196 ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra, 5197 BTRFS_RESERVE_FLUSH_EVICT); 5198 if (ret) { 5199 /* 5200 * Try to steal from the global reserve if there is space for 5201 * it. 5202 */ 5203 if (btrfs_check_space_for_delayed_refs(fs_info) || 5204 btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) { 5205 btrfs_warn(fs_info, 5206 "could not allocate space for delete; will truncate on mount"); 5207 return ERR_PTR(-ENOSPC); 5208 } 5209 delayed_refs_extra = 0; 5210 } 5211 5212 trans = btrfs_join_transaction(root); 5213 if (IS_ERR(trans)) 5214 return trans; 5215 5216 if (delayed_refs_extra) { 5217 trans->block_rsv = &fs_info->trans_block_rsv; 5218 trans->bytes_reserved = delayed_refs_extra; 5219 btrfs_block_rsv_migrate(rsv, trans->block_rsv, 5220 delayed_refs_extra, 1); 5221 } 5222 return trans; 5223 } 5224 5225 void btrfs_evict_inode(struct inode *inode) 5226 { 5227 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5228 struct btrfs_trans_handle *trans; 5229 struct btrfs_root *root = BTRFS_I(inode)->root; 5230 struct btrfs_block_rsv *rsv; 5231 int ret; 5232 5233 trace_btrfs_inode_evict(inode); 5234 5235 if (!root) { 5236 clear_inode(inode); 5237 return; 5238 } 5239 5240 evict_inode_truncate_pages(inode); 5241 5242 if (inode->i_nlink && 5243 ((btrfs_root_refs(&root->root_item) != 0 && 5244 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || 5245 btrfs_is_free_space_inode(BTRFS_I(inode)))) 5246 goto no_delete; 5247 5248 if (is_bad_inode(inode)) 5249 goto no_delete; 5250 5251 btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); 5252 5253 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) 5254 goto no_delete; 5255 5256 if (inode->i_nlink > 0) { 5257 BUG_ON(btrfs_root_refs(&root->root_item) != 0 && 5258 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); 5259 goto no_delete; 5260 } 5261 5262 ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); 5263 if (ret) 5264 goto no_delete; 5265 5266 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); 5267 if (!rsv) 5268 goto no_delete; 5269 rsv->size = btrfs_calc_metadata_size(fs_info, 1); 
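	/*
	 * Room for one metadata item's worth of changes per pass of the
	 * truncate loop below; evict_refill_and_join() tops this up (or
	 * steals from the global reserve) before each transaction.
	 */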
5270 rsv->failfast = 1; 5271 5272 btrfs_i_size_write(BTRFS_I(inode), 0); 5273 5274 while (1) { 5275 trans = evict_refill_and_join(root, rsv); 5276 if (IS_ERR(trans)) 5277 goto free_rsv; 5278 5279 trans->block_rsv = rsv; 5280 5281 ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), 5282 0, 0); 5283 trans->block_rsv = &fs_info->trans_block_rsv; 5284 btrfs_end_transaction(trans); 5285 btrfs_btree_balance_dirty(fs_info); 5286 if (ret && ret != -ENOSPC && ret != -EAGAIN) 5287 goto free_rsv; 5288 else if (!ret) 5289 break; 5290 } 5291 5292 /* 5293 * Errors here aren't a big deal, it just means we leave orphan items in 5294 * the tree. They will be cleaned up on the next mount. If the inode 5295 * number gets reused, cleanup deletes the orphan item without doing 5296 * anything, and unlink reuses the existing orphan item. 5297 * 5298 * If it turns out that we are dropping too many of these, we might want 5299 * to add a mechanism for retrying these after a commit. 5300 */ 5301 trans = evict_refill_and_join(root, rsv); 5302 if (!IS_ERR(trans)) { 5303 trans->block_rsv = rsv; 5304 btrfs_orphan_del(trans, BTRFS_I(inode)); 5305 trans->block_rsv = &fs_info->trans_block_rsv; 5306 btrfs_end_transaction(trans); 5307 } 5308 5309 free_rsv: 5310 btrfs_free_block_rsv(fs_info, rsv); 5311 no_delete: 5312 /* 5313 * If we didn't successfully delete, the orphan item will still be in 5314 * the tree and we'll retry on the next mount. Again, we might also want 5315 * to retry these periodically in the future. 5316 */ 5317 btrfs_remove_delayed_node(BTRFS_I(inode)); 5318 clear_inode(inode); 5319 } 5320 5321 /* 5322 * Return the key found in the dir entry in the location pointer, fill @type 5323 * with BTRFS_FT_*, and return 0. 5324 * 5325 * If no dir entries were found, returns -ENOENT. 5326 * If found a corrupted location in dir entry, returns -EUCLEAN. 5327 */ 5328 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 5329 struct btrfs_key *location, u8 *type) 5330 { 5331 const char *name = dentry->d_name.name; 5332 int namelen = dentry->d_name.len; 5333 struct btrfs_dir_item *di; 5334 struct btrfs_path *path; 5335 struct btrfs_root *root = BTRFS_I(dir)->root; 5336 int ret = 0; 5337 5338 path = btrfs_alloc_path(); 5339 if (!path) 5340 return -ENOMEM; 5341 5342 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), 5343 name, namelen, 0); 5344 if (IS_ERR_OR_NULL(di)) { 5345 ret = di ? PTR_ERR(di) : -ENOENT; 5346 goto out; 5347 } 5348 5349 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 5350 if (location->type != BTRFS_INODE_ITEM_KEY && 5351 location->type != BTRFS_ROOT_ITEM_KEY) { 5352 ret = -EUCLEAN; 5353 btrfs_warn(root->fs_info, 5354 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", 5355 __func__, name, btrfs_ino(BTRFS_I(dir)), 5356 location->objectid, location->type, location->offset); 5357 } 5358 if (!ret) 5359 *type = btrfs_dir_type(path->nodes[0], di); 5360 out: 5361 btrfs_free_path(path); 5362 return ret; 5363 } 5364 5365 /* 5366 * when we hit a tree root in a directory, the btrfs part of the inode 5367 * needs to be changed to reflect the root directory of the tree root. This 5368 * is kind of like crossing a mount point. 
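 * For example, when a directory entry points at a subvolume, the dir
 * item stores a BTRFS_ROOT_ITEM_KEY location; here we look up the
 * subvolume's own root and rewrite the location to that root's top
 * directory (a BTRFS_INODE_ITEM_KEY), so the caller can iget it like
 * any other inode.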
5369 */ 5370 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, 5371 struct inode *dir, 5372 struct dentry *dentry, 5373 struct btrfs_key *location, 5374 struct btrfs_root **sub_root) 5375 { 5376 struct btrfs_path *path; 5377 struct btrfs_root *new_root; 5378 struct btrfs_root_ref *ref; 5379 struct extent_buffer *leaf; 5380 struct btrfs_key key; 5381 int ret; 5382 int err = 0; 5383 5384 path = btrfs_alloc_path(); 5385 if (!path) { 5386 err = -ENOMEM; 5387 goto out; 5388 } 5389 5390 err = -ENOENT; 5391 key.objectid = BTRFS_I(dir)->root->root_key.objectid; 5392 key.type = BTRFS_ROOT_REF_KEY; 5393 key.offset = location->objectid; 5394 5395 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 5396 if (ret) { 5397 if (ret < 0) 5398 err = ret; 5399 goto out; 5400 } 5401 5402 leaf = path->nodes[0]; 5403 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 5404 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || 5405 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 5406 goto out; 5407 5408 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 5409 (unsigned long)(ref + 1), 5410 dentry->d_name.len); 5411 if (ret) 5412 goto out; 5413 5414 btrfs_release_path(path); 5415 5416 new_root = btrfs_get_fs_root(fs_info, location->objectid, true); 5417 if (IS_ERR(new_root)) { 5418 err = PTR_ERR(new_root); 5419 goto out; 5420 } 5421 5422 *sub_root = new_root; 5423 location->objectid = btrfs_root_dirid(&new_root->root_item); 5424 location->type = BTRFS_INODE_ITEM_KEY; 5425 location->offset = 0; 5426 err = 0; 5427 out: 5428 btrfs_free_path(path); 5429 return err; 5430 } 5431 5432 static void inode_tree_add(struct inode *inode) 5433 { 5434 struct btrfs_root *root = BTRFS_I(inode)->root; 5435 struct btrfs_inode *entry; 5436 struct rb_node **p; 5437 struct rb_node *parent; 5438 struct rb_node *new = &BTRFS_I(inode)->rb_node; 5439 u64 ino = btrfs_ino(BTRFS_I(inode)); 5440 5441 if (inode_unhashed(inode)) 5442 return; 5443 parent = NULL; 5444 spin_lock(&root->inode_lock); 5445 p = &root->inode_tree.rb_node; 5446 while (*p) { 5447 parent = *p; 5448 entry = rb_entry(parent, struct btrfs_inode, rb_node); 5449 5450 if (ino < btrfs_ino(entry)) 5451 p = &parent->rb_left; 5452 else if (ino > btrfs_ino(entry)) 5453 p = &parent->rb_right; 5454 else { 5455 WARN_ON(!(entry->vfs_inode.i_state & 5456 (I_WILL_FREE | I_FREEING))); 5457 rb_replace_node(parent, new, &root->inode_tree); 5458 RB_CLEAR_NODE(parent); 5459 spin_unlock(&root->inode_lock); 5460 return; 5461 } 5462 } 5463 rb_link_node(new, parent, p); 5464 rb_insert_color(new, &root->inode_tree); 5465 spin_unlock(&root->inode_lock); 5466 } 5467 5468 static void inode_tree_del(struct btrfs_inode *inode) 5469 { 5470 struct btrfs_root *root = inode->root; 5471 int empty = 0; 5472 5473 spin_lock(&root->inode_lock); 5474 if (!RB_EMPTY_NODE(&inode->rb_node)) { 5475 rb_erase(&inode->rb_node, &root->inode_tree); 5476 RB_CLEAR_NODE(&inode->rb_node); 5477 empty = RB_EMPTY_ROOT(&root->inode_tree); 5478 } 5479 spin_unlock(&root->inode_lock); 5480 5481 if (empty && btrfs_root_refs(&root->root_item) == 0) { 5482 spin_lock(&root->inode_lock); 5483 empty = RB_EMPTY_ROOT(&root->inode_tree); 5484 spin_unlock(&root->inode_lock); 5485 if (empty) 5486 btrfs_add_dead_root(root); 5487 } 5488 } 5489 5490 5491 static int btrfs_init_locked_inode(struct inode *inode, void *p) 5492 { 5493 struct btrfs_iget_args *args = p; 5494 5495 inode->i_ino = args->ino; 5496 BTRFS_I(inode)->location.objectid = args->ino; 5497 
BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; 5498 BTRFS_I(inode)->location.offset = 0; 5499 BTRFS_I(inode)->root = btrfs_grab_root(args->root); 5500 BUG_ON(args->root && !BTRFS_I(inode)->root); 5501 return 0; 5502 } 5503 5504 static int btrfs_find_actor(struct inode *inode, void *opaque) 5505 { 5506 struct btrfs_iget_args *args = opaque; 5507 5508 return args->ino == BTRFS_I(inode)->location.objectid && 5509 args->root == BTRFS_I(inode)->root; 5510 } 5511 5512 static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, 5513 struct btrfs_root *root) 5514 { 5515 struct inode *inode; 5516 struct btrfs_iget_args args; 5517 unsigned long hashval = btrfs_inode_hash(ino, root); 5518 5519 args.ino = ino; 5520 args.root = root; 5521 5522 inode = iget5_locked(s, hashval, btrfs_find_actor, 5523 btrfs_init_locked_inode, 5524 (void *)&args); 5525 return inode; 5526 } 5527 5528 /* 5529 * Get an inode object given its inode number and corresponding root. 5530 * Path can be preallocated to prevent recursing back to iget through 5531 * allocator. NULL is also valid but may require an additional allocation 5532 * later. 5533 */ 5534 struct inode *btrfs_iget_path(struct super_block *s, u64 ino, 5535 struct btrfs_root *root, struct btrfs_path *path) 5536 { 5537 struct inode *inode; 5538 5539 inode = btrfs_iget_locked(s, ino, root); 5540 if (!inode) 5541 return ERR_PTR(-ENOMEM); 5542 5543 if (inode->i_state & I_NEW) { 5544 int ret; 5545 5546 ret = btrfs_read_locked_inode(inode, path); 5547 if (!ret) { 5548 inode_tree_add(inode); 5549 unlock_new_inode(inode); 5550 } else { 5551 iget_failed(inode); 5552 /* 5553 * ret > 0 can come from btrfs_search_slot called by 5554 * btrfs_read_locked_inode, this means the inode item 5555 * was not found. 5556 */ 5557 if (ret > 0) 5558 ret = -ENOENT; 5559 inode = ERR_PTR(ret); 5560 } 5561 } 5562 5563 return inode; 5564 } 5565 5566 struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root) 5567 { 5568 return btrfs_iget_path(s, ino, root, NULL); 5569 } 5570 5571 static struct inode *new_simple_dir(struct super_block *s, 5572 struct btrfs_key *key, 5573 struct btrfs_root *root) 5574 { 5575 struct inode *inode = new_inode(s); 5576 5577 if (!inode) 5578 return ERR_PTR(-ENOMEM); 5579 5580 BTRFS_I(inode)->root = btrfs_grab_root(root); 5581 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 5582 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); 5583 5584 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 5585 /* 5586 * We only need lookup, the rest is read-only and there's no inode 5587 * associated with the dentry 5588 */ 5589 inode->i_op = &simple_dir_inode_operations; 5590 inode->i_opflags &= ~IOP_XATTR; 5591 inode->i_fop = &simple_dir_operations; 5592 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 5593 inode->i_mtime = current_time(inode); 5594 inode->i_atime = inode->i_mtime; 5595 inode->i_ctime = inode->i_mtime; 5596 BTRFS_I(inode)->i_otime = inode->i_mtime; 5597 5598 return inode; 5599 } 5600 5601 static inline u8 btrfs_inode_type(struct inode *inode) 5602 { 5603 /* 5604 * Compile-time asserts that generic FT_* types still match 5605 * BTRFS_FT_* types 5606 */ 5607 BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN); 5608 BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE); 5609 BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR); 5610 BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV); 5611 BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV); 5612 BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO); 5613 BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK); 5614 
BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK); 5615 5616 return fs_umode_to_ftype(inode->i_mode); 5617 } 5618 5619 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 5620 { 5621 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 5622 struct inode *inode; 5623 struct btrfs_root *root = BTRFS_I(dir)->root; 5624 struct btrfs_root *sub_root = root; 5625 struct btrfs_key location; 5626 u8 di_type = 0; 5627 int ret = 0; 5628 5629 if (dentry->d_name.len > BTRFS_NAME_LEN) 5630 return ERR_PTR(-ENAMETOOLONG); 5631 5632 ret = btrfs_inode_by_name(dir, dentry, &location, &di_type); 5633 if (ret < 0) 5634 return ERR_PTR(ret); 5635 5636 if (location.type == BTRFS_INODE_ITEM_KEY) { 5637 inode = btrfs_iget(dir->i_sb, location.objectid, root); 5638 if (IS_ERR(inode)) 5639 return inode; 5640 5641 /* Do extra check against inode mode with di_type */ 5642 if (btrfs_inode_type(inode) != di_type) { 5643 btrfs_crit(fs_info, 5644 "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", 5645 inode->i_mode, btrfs_inode_type(inode), 5646 di_type); 5647 iput(inode); 5648 return ERR_PTR(-EUCLEAN); 5649 } 5650 return inode; 5651 } 5652 5653 ret = fixup_tree_root_location(fs_info, dir, dentry, 5654 &location, &sub_root); 5655 if (ret < 0) { 5656 if (ret != -ENOENT) 5657 inode = ERR_PTR(ret); 5658 else 5659 inode = new_simple_dir(dir->i_sb, &location, sub_root); 5660 } else { 5661 inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); 5662 } 5663 if (root != sub_root) 5664 btrfs_put_root(sub_root); 5665 5666 if (!IS_ERR(inode) && root != sub_root) { 5667 down_read(&fs_info->cleanup_work_sem); 5668 if (!sb_rdonly(inode->i_sb)) 5669 ret = btrfs_orphan_cleanup(sub_root); 5670 up_read(&fs_info->cleanup_work_sem); 5671 if (ret) { 5672 iput(inode); 5673 inode = ERR_PTR(ret); 5674 } 5675 } 5676 5677 return inode; 5678 } 5679 5680 static int btrfs_dentry_delete(const struct dentry *dentry) 5681 { 5682 struct btrfs_root *root; 5683 struct inode *inode = d_inode(dentry); 5684 5685 if (!inode && !IS_ROOT(dentry)) 5686 inode = d_inode(dentry->d_parent); 5687 5688 if (inode) { 5689 root = BTRFS_I(inode)->root; 5690 if (btrfs_root_refs(&root->root_item) == 0) 5691 return 1; 5692 5693 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 5694 return 1; 5695 } 5696 return 0; 5697 } 5698 5699 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 5700 unsigned int flags) 5701 { 5702 struct inode *inode = btrfs_lookup_dentry(dir, dentry); 5703 5704 if (inode == ERR_PTR(-ENOENT)) 5705 inode = NULL; 5706 return d_splice_alias(inode, dentry); 5707 } 5708 5709 /* 5710 * All this infrastructure exists because dir_emit can fault, and we are holding 5711 * the tree lock when doing readdir. For now just allocate a buffer and copy 5712 * our information into that, and then dir_emit from the buffer. This is 5713 * similar to what NFS does, only we don't keep the buffer around in pagecache 5714 * because I'm afraid I'll mess that up. Long term we need to make filldir do 5715 * copy_to_user_inatomic so we don't have to worry about page faulting under the 5716 * tree lock. 
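 *
 * The buffer is a single page allocated in btrfs_opendir(). btrfs_real_readdir()
 * packs struct dir_entry records (header plus name) into it while walking the
 * leaves, and btrfs_filldir() later replays those records into dir_emit()
 * after the path has been released.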
5717 */ 5718 static int btrfs_opendir(struct inode *inode, struct file *file) 5719 { 5720 struct btrfs_file_private *private; 5721 5722 private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); 5723 if (!private) 5724 return -ENOMEM; 5725 private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); 5726 if (!private->filldir_buf) { 5727 kfree(private); 5728 return -ENOMEM; 5729 } 5730 file->private_data = private; 5731 return 0; 5732 } 5733 5734 struct dir_entry { 5735 u64 ino; 5736 u64 offset; 5737 unsigned type; 5738 int name_len; 5739 }; 5740 5741 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) 5742 { 5743 while (entries--) { 5744 struct dir_entry *entry = addr; 5745 char *name = (char *)(entry + 1); 5746 5747 ctx->pos = get_unaligned(&entry->offset); 5748 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len), 5749 get_unaligned(&entry->ino), 5750 get_unaligned(&entry->type))) 5751 return 1; 5752 addr += sizeof(struct dir_entry) + 5753 get_unaligned(&entry->name_len); 5754 ctx->pos++; 5755 } 5756 return 0; 5757 } 5758 5759 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) 5760 { 5761 struct inode *inode = file_inode(file); 5762 struct btrfs_root *root = BTRFS_I(inode)->root; 5763 struct btrfs_file_private *private = file->private_data; 5764 struct btrfs_dir_item *di; 5765 struct btrfs_key key; 5766 struct btrfs_key found_key; 5767 struct btrfs_path *path; 5768 void *addr; 5769 struct list_head ins_list; 5770 struct list_head del_list; 5771 int ret; 5772 struct extent_buffer *leaf; 5773 int slot; 5774 char *name_ptr; 5775 int name_len; 5776 int entries = 0; 5777 int total_len = 0; 5778 bool put = false; 5779 struct btrfs_key location; 5780 5781 if (!dir_emit_dots(file, ctx)) 5782 return 0; 5783 5784 path = btrfs_alloc_path(); 5785 if (!path) 5786 return -ENOMEM; 5787 5788 addr = private->filldir_buf; 5789 path->reada = READA_FORWARD; 5790 5791 INIT_LIST_HEAD(&ins_list); 5792 INIT_LIST_HEAD(&del_list); 5793 put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list); 5794 5795 again: 5796 key.type = BTRFS_DIR_INDEX_KEY; 5797 key.offset = ctx->pos; 5798 key.objectid = btrfs_ino(BTRFS_I(inode)); 5799 5800 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5801 if (ret < 0) 5802 goto err; 5803 5804 while (1) { 5805 struct dir_entry *entry; 5806 5807 leaf = path->nodes[0]; 5808 slot = path->slots[0]; 5809 if (slot >= btrfs_header_nritems(leaf)) { 5810 ret = btrfs_next_leaf(root, path); 5811 if (ret < 0) 5812 goto err; 5813 else if (ret > 0) 5814 break; 5815 continue; 5816 } 5817 5818 btrfs_item_key_to_cpu(leaf, &found_key, slot); 5819 5820 if (found_key.objectid != key.objectid) 5821 break; 5822 if (found_key.type != BTRFS_DIR_INDEX_KEY) 5823 break; 5824 if (found_key.offset < ctx->pos) 5825 goto next; 5826 if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) 5827 goto next; 5828 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 5829 name_len = btrfs_dir_name_len(leaf, di); 5830 if ((total_len + sizeof(struct dir_entry) + name_len) >= 5831 PAGE_SIZE) { 5832 btrfs_release_path(path); 5833 ret = btrfs_filldir(private->filldir_buf, entries, ctx); 5834 if (ret) 5835 goto nopos; 5836 addr = private->filldir_buf; 5837 entries = 0; 5838 total_len = 0; 5839 goto again; 5840 } 5841 5842 entry = addr; 5843 put_unaligned(name_len, &entry->name_len); 5844 name_ptr = (char *)(entry + 1); 5845 read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), 5846 name_len); 5847 
put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)), 5848 &entry->type); 5849 btrfs_dir_item_key_to_cpu(leaf, di, &location); 5850 put_unaligned(location.objectid, &entry->ino); 5851 put_unaligned(found_key.offset, &entry->offset); 5852 entries++; 5853 addr += sizeof(struct dir_entry) + name_len; 5854 total_len += sizeof(struct dir_entry) + name_len; 5855 next: 5856 path->slots[0]++; 5857 } 5858 btrfs_release_path(path); 5859 5860 ret = btrfs_filldir(private->filldir_buf, entries, ctx); 5861 if (ret) 5862 goto nopos; 5863 5864 ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); 5865 if (ret) 5866 goto nopos; 5867 5868 /* 5869 * Stop new entries from being returned after we return the last 5870 * entry. 5871 * 5872 * New directory entries are assigned a strictly increasing 5873 * offset. This means that new entries created during readdir 5874 * are *guaranteed* to be seen in the future by that readdir. 5875 * This has broken buggy programs which operate on names as 5876 * they're returned by readdir. Until we re-use freed offsets 5877 * we have this hack to stop new entries from being returned 5878 * under the assumption that they'll never reach this huge 5879 * offset. 5880 * 5881 * This is being careful not to overflow 32bit loff_t unless the 5882 * last entry requires it because doing so has broken 32bit apps 5883 * in the past. 5884 */ 5885 if (ctx->pos >= INT_MAX) 5886 ctx->pos = LLONG_MAX; 5887 else 5888 ctx->pos = INT_MAX; 5889 nopos: 5890 ret = 0; 5891 err: 5892 if (put) 5893 btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); 5894 btrfs_free_path(path); 5895 return ret; 5896 } 5897 5898 /* 5899 * This is somewhat expensive, updating the tree every time the 5900 * inode changes. But, it is most likely to find the inode in cache. 5901 * FIXME, needs more benchmarking...there are no reasons other than performance 5902 * to keep or drop this code. 5903 */ 5904 static int btrfs_dirty_inode(struct inode *inode) 5905 { 5906 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5907 struct btrfs_root *root = BTRFS_I(inode)->root; 5908 struct btrfs_trans_handle *trans; 5909 int ret; 5910 5911 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 5912 return 0; 5913 5914 trans = btrfs_join_transaction(root); 5915 if (IS_ERR(trans)) 5916 return PTR_ERR(trans); 5917 5918 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 5919 if (ret && ret == -ENOSPC) { 5920 /* whoops, lets try again with the full transaction */ 5921 btrfs_end_transaction(trans); 5922 trans = btrfs_start_transaction(root, 1); 5923 if (IS_ERR(trans)) 5924 return PTR_ERR(trans); 5925 5926 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 5927 } 5928 btrfs_end_transaction(trans); 5929 if (BTRFS_I(inode)->delayed_node) 5930 btrfs_balance_delayed_items(fs_info); 5931 5932 return ret; 5933 } 5934 5935 /* 5936 * This is a copy of file_update_time. We need this so we can return error on 5937 * ENOSPC for updating the inode in the case of file write and mmap writes. 5938 */ 5939 static int btrfs_update_time(struct inode *inode, struct timespec64 *now, 5940 int flags) 5941 { 5942 struct btrfs_root *root = BTRFS_I(inode)->root; 5943 bool dirty = flags & ~S_VERSION; 5944 5945 if (btrfs_root_readonly(root)) 5946 return -EROFS; 5947 5948 if (flags & S_VERSION) 5949 dirty |= inode_maybe_inc_iversion(inode, dirty); 5950 if (flags & S_CTIME) 5951 inode->i_ctime = *now; 5952 if (flags & S_MTIME) 5953 inode->i_mtime = *now; 5954 if (flags & S_ATIME) 5955 inode->i_atime = *now; 5956 return dirty ? 
btrfs_dirty_inode(inode) : 0; 5957 } 5958 5959 /* 5960 * find the highest existing sequence number in a directory 5961 * and then set the in-memory index_cnt variable to reflect 5962 * free sequence numbers 5963 */ 5964 static int btrfs_set_inode_index_count(struct btrfs_inode *inode) 5965 { 5966 struct btrfs_root *root = inode->root; 5967 struct btrfs_key key, found_key; 5968 struct btrfs_path *path; 5969 struct extent_buffer *leaf; 5970 int ret; 5971 5972 key.objectid = btrfs_ino(inode); 5973 key.type = BTRFS_DIR_INDEX_KEY; 5974 key.offset = (u64)-1; 5975 5976 path = btrfs_alloc_path(); 5977 if (!path) 5978 return -ENOMEM; 5979 5980 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5981 if (ret < 0) 5982 goto out; 5983 /* FIXME: we should be able to handle this */ 5984 if (ret == 0) 5985 goto out; 5986 ret = 0; 5987 5988 /* 5989 * MAGIC NUMBER EXPLANATION: 5990 * since we search a directory based on f_pos we have to start at 2 5991 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody 5992 * else has to start at 2 5993 */ 5994 if (path->slots[0] == 0) { 5995 inode->index_cnt = 2; 5996 goto out; 5997 } 5998 5999 path->slots[0]--; 6000 6001 leaf = path->nodes[0]; 6002 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6003 6004 if (found_key.objectid != btrfs_ino(inode) || 6005 found_key.type != BTRFS_DIR_INDEX_KEY) { 6006 inode->index_cnt = 2; 6007 goto out; 6008 } 6009 6010 inode->index_cnt = found_key.offset + 1; 6011 out: 6012 btrfs_free_path(path); 6013 return ret; 6014 } 6015 6016 /* 6017 * helper to find a free sequence number in a given directory. This current 6018 * code is very simple, later versions will do smarter things in the btree 6019 */ 6020 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index) 6021 { 6022 int ret = 0; 6023 6024 if (dir->index_cnt == (u64)-1) { 6025 ret = btrfs_inode_delayed_dir_index_count(dir); 6026 if (ret) { 6027 ret = btrfs_set_inode_index_count(dir); 6028 if (ret) 6029 return ret; 6030 } 6031 } 6032 6033 *index = dir->index_cnt; 6034 dir->index_cnt++; 6035 6036 return ret; 6037 } 6038 6039 static int btrfs_insert_inode_locked(struct inode *inode) 6040 { 6041 struct btrfs_iget_args args; 6042 6043 args.ino = BTRFS_I(inode)->location.objectid; 6044 args.root = BTRFS_I(inode)->root; 6045 6046 return insert_inode_locked4(inode, 6047 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), 6048 btrfs_find_actor, &args); 6049 } 6050 6051 /* 6052 * Inherit flags from the parent inode. 6053 * 6054 * Currently only the compression flags and the cow flags are inherited. 
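 *
 * NOCOMPRESS and COMPRESS are mutually exclusive, so whichever of the two the
 * parent carries wins. Inheriting NODATACOW also sets NODATASUM on regular
 * files, since nodatacow data does not carry checksums.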
6055 */ 6056 static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 6057 { 6058 unsigned int flags; 6059 6060 if (!dir) 6061 return; 6062 6063 flags = BTRFS_I(dir)->flags; 6064 6065 if (flags & BTRFS_INODE_NOCOMPRESS) { 6066 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; 6067 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 6068 } else if (flags & BTRFS_INODE_COMPRESS) { 6069 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; 6070 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; 6071 } 6072 6073 if (flags & BTRFS_INODE_NODATACOW) { 6074 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 6075 if (S_ISREG(inode->i_mode)) 6076 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 6077 } 6078 6079 btrfs_sync_inode_flags_to_i_flags(inode); 6080 } 6081 6082 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 6083 struct btrfs_root *root, 6084 struct inode *dir, 6085 const char *name, int name_len, 6086 u64 ref_objectid, u64 objectid, 6087 umode_t mode, u64 *index) 6088 { 6089 struct btrfs_fs_info *fs_info = root->fs_info; 6090 struct inode *inode; 6091 struct btrfs_inode_item *inode_item; 6092 struct btrfs_key *location; 6093 struct btrfs_path *path; 6094 struct btrfs_inode_ref *ref; 6095 struct btrfs_key key[2]; 6096 u32 sizes[2]; 6097 int nitems = name ? 2 : 1; 6098 unsigned long ptr; 6099 unsigned int nofs_flag; 6100 int ret; 6101 6102 path = btrfs_alloc_path(); 6103 if (!path) 6104 return ERR_PTR(-ENOMEM); 6105 6106 nofs_flag = memalloc_nofs_save(); 6107 inode = new_inode(fs_info->sb); 6108 memalloc_nofs_restore(nofs_flag); 6109 if (!inode) { 6110 btrfs_free_path(path); 6111 return ERR_PTR(-ENOMEM); 6112 } 6113 6114 /* 6115 * O_TMPFILE, set link count to 0, so that after this point, 6116 * we fill in an inode item with the correct link count. 6117 */ 6118 if (!name) 6119 set_nlink(inode, 0); 6120 6121 /* 6122 * we have to initialize this early, so we can reclaim the inode 6123 * number if we fail afterwards in this function. 6124 */ 6125 inode->i_ino = objectid; 6126 6127 if (dir && name) { 6128 trace_btrfs_inode_request(dir); 6129 6130 ret = btrfs_set_inode_index(BTRFS_I(dir), index); 6131 if (ret) { 6132 btrfs_free_path(path); 6133 iput(inode); 6134 return ERR_PTR(ret); 6135 } 6136 } else if (dir) { 6137 *index = 0; 6138 } 6139 /* 6140 * index_cnt is ignored for everything but a dir, 6141 * btrfs_set_inode_index_count has an explanation for the magic 6142 * number 6143 */ 6144 BTRFS_I(inode)->index_cnt = 2; 6145 BTRFS_I(inode)->dir_index = *index; 6146 BTRFS_I(inode)->root = btrfs_grab_root(root); 6147 BTRFS_I(inode)->generation = trans->transid; 6148 inode->i_generation = BTRFS_I(inode)->generation; 6149 6150 /* 6151 * We could have gotten an inode number from somebody who was fsynced 6152 * and then removed in this same transaction, so let's just set full 6153 * sync since it will be a full sync anyway and this will blow away the 6154 * old info in the log. 6155 */ 6156 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 6157 6158 key[0].objectid = objectid; 6159 key[0].type = BTRFS_INODE_ITEM_KEY; 6160 key[0].offset = 0; 6161 6162 sizes[0] = sizeof(struct btrfs_inode_item); 6163 6164 if (name) { 6165 /* 6166 * Start new inodes with an inode_ref. This is slightly more 6167 * efficient for small numbers of hard links since they will 6168 * be packed into one item. Extended refs will kick in if we 6169 * add more hard links than can fit in the ref item. 
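		 *
		 * The ref item is keyed (objectid, INODE_REF, ref_objectid) and
		 * is created right next to the inode item, which is why a
		 * single btrfs_insert_empty_items() call below can insert both
		 * of them.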
		 */
		key[1].objectid = objectid;
		key[1].type = BTRFS_INODE_REF_KEY;
		key[1].offset = ref_objectid;

		sizes[1] = name_len + sizeof(*ref);
	}

	location = &BTRFS_I(inode)->location;
	location->objectid = objectid;
	location->offset = 0;
	location->type = BTRFS_INODE_ITEM_KEY;

	ret = btrfs_insert_inode_locked(inode);
	if (ret < 0) {
		iput(inode);
		goto fail;
	}

	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
	if (ret != 0)
		goto fail_unlock;

	inode_init_owner(inode, dir, mode);
	inode_set_bytes(inode, 0);

	inode->i_mtime = current_time(inode);
	inode->i_atime = inode->i_mtime;
	inode->i_ctime = inode->i_mtime;
	BTRFS_I(inode)->i_otime = inode->i_mtime;

	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_inode_item);
	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
			      sizeof(*inode_item));
	fill_inode_item(trans, path->nodes[0], inode_item, inode);

	if (name) {
		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
				     struct btrfs_inode_ref);
		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
		btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
		ptr = (unsigned long)(ref + 1);
		write_extent_buffer(path->nodes[0], name, ptr, name_len);
	}

	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	btrfs_inherit_iflags(inode, dir);

	if (S_ISREG(mode)) {
		if (btrfs_test_opt(fs_info, NODATASUM))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
		if (btrfs_test_opt(fs_info, NODATACOW))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
				BTRFS_INODE_NODATASUM;
	}

	inode_tree_add(inode);

	trace_btrfs_inode_new(inode);
	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));

	btrfs_update_root_times(trans, root);

	ret = btrfs_inode_inherit_props(trans, inode, dir);
	if (ret)
		btrfs_err(fs_info,
			  "error inheriting props for ino %llu (root %llu): %d",
			  btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);

	return inode;

fail_unlock:
	discard_new_inode(inode);
fail:
	if (dir && name)
		BTRFS_I(dir)->index_cnt--;
	btrfs_free_path(path);
	return ERR_PTR(ret);
}

/*
 * Utility function to add 'inode' into 'parent_inode' with a given name and
 * a given sequence number.
 * If 'add_backref' is true, also insert a backref from the
 * inode to the parent directory.
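 *
 * If the inode being linked is itself a subvolume root (its ino is
 * BTRFS_FIRST_FREE_OBJECTID), a root ref is inserted in the tree of tree
 * roots instead of an inode ref, and the dir item points at the subvolume's
 * root key.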
6258 */ 6259 int btrfs_add_link(struct btrfs_trans_handle *trans, 6260 struct btrfs_inode *parent_inode, struct btrfs_inode *inode, 6261 const char *name, int name_len, int add_backref, u64 index) 6262 { 6263 int ret = 0; 6264 struct btrfs_key key; 6265 struct btrfs_root *root = parent_inode->root; 6266 u64 ino = btrfs_ino(inode); 6267 u64 parent_ino = btrfs_ino(parent_inode); 6268 6269 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6270 memcpy(&key, &inode->root->root_key, sizeof(key)); 6271 } else { 6272 key.objectid = ino; 6273 key.type = BTRFS_INODE_ITEM_KEY; 6274 key.offset = 0; 6275 } 6276 6277 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6278 ret = btrfs_add_root_ref(trans, key.objectid, 6279 root->root_key.objectid, parent_ino, 6280 index, name, name_len); 6281 } else if (add_backref) { 6282 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, 6283 parent_ino, index); 6284 } 6285 6286 /* Nothing to clean up yet */ 6287 if (ret) 6288 return ret; 6289 6290 ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key, 6291 btrfs_inode_type(&inode->vfs_inode), index); 6292 if (ret == -EEXIST || ret == -EOVERFLOW) 6293 goto fail_dir_item; 6294 else if (ret) { 6295 btrfs_abort_transaction(trans, ret); 6296 return ret; 6297 } 6298 6299 btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + 6300 name_len * 2); 6301 inode_inc_iversion(&parent_inode->vfs_inode); 6302 /* 6303 * If we are replaying a log tree, we do not want to update the mtime 6304 * and ctime of the parent directory with the current time, since the 6305 * log replay procedure is responsible for setting them to their correct 6306 * values (the ones it had when the fsync was done). 6307 */ 6308 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) { 6309 struct timespec64 now = current_time(&parent_inode->vfs_inode); 6310 6311 parent_inode->vfs_inode.i_mtime = now; 6312 parent_inode->vfs_inode.i_ctime = now; 6313 } 6314 ret = btrfs_update_inode(trans, root, parent_inode); 6315 if (ret) 6316 btrfs_abort_transaction(trans, ret); 6317 return ret; 6318 6319 fail_dir_item: 6320 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6321 u64 local_index; 6322 int err; 6323 err = btrfs_del_root_ref(trans, key.objectid, 6324 root->root_key.objectid, parent_ino, 6325 &local_index, name, name_len); 6326 if (err) 6327 btrfs_abort_transaction(trans, err); 6328 } else if (add_backref) { 6329 u64 local_index; 6330 int err; 6331 6332 err = btrfs_del_inode_ref(trans, root, name, name_len, 6333 ino, parent_ino, &local_index); 6334 if (err) 6335 btrfs_abort_transaction(trans, err); 6336 } 6337 6338 /* Return the original error code */ 6339 return ret; 6340 } 6341 6342 static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 6343 struct btrfs_inode *dir, struct dentry *dentry, 6344 struct btrfs_inode *inode, int backref, u64 index) 6345 { 6346 int err = btrfs_add_link(trans, dir, inode, 6347 dentry->d_name.name, dentry->d_name.len, 6348 backref, index); 6349 if (err > 0) 6350 err = -EEXIST; 6351 return err; 6352 } 6353 6354 static int btrfs_mknod(struct inode *dir, struct dentry *dentry, 6355 umode_t mode, dev_t rdev) 6356 { 6357 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6358 struct btrfs_trans_handle *trans; 6359 struct btrfs_root *root = BTRFS_I(dir)->root; 6360 struct inode *inode = NULL; 6361 int err; 6362 u64 objectid; 6363 u64 index = 0; 6364 6365 /* 6366 * 2 for inode item and ref 6367 * 2 for dir items 6368 * 1 for xattr if selinux is on 6369 */ 6370 trans = 
btrfs_start_transaction(root, 5); 6371 if (IS_ERR(trans)) 6372 return PTR_ERR(trans); 6373 6374 err = btrfs_find_free_objectid(root, &objectid); 6375 if (err) 6376 goto out_unlock; 6377 6378 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6379 dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, 6380 mode, &index); 6381 if (IS_ERR(inode)) { 6382 err = PTR_ERR(inode); 6383 inode = NULL; 6384 goto out_unlock; 6385 } 6386 6387 /* 6388 * If the active LSM wants to access the inode during 6389 * d_instantiate it needs these. Smack checks to see 6390 * if the filesystem supports xattrs by looking at the 6391 * ops vector. 6392 */ 6393 inode->i_op = &btrfs_special_inode_operations; 6394 init_special_inode(inode, inode->i_mode, rdev); 6395 6396 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6397 if (err) 6398 goto out_unlock; 6399 6400 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6401 0, index); 6402 if (err) 6403 goto out_unlock; 6404 6405 btrfs_update_inode(trans, root, BTRFS_I(inode)); 6406 d_instantiate_new(dentry, inode); 6407 6408 out_unlock: 6409 btrfs_end_transaction(trans); 6410 btrfs_btree_balance_dirty(fs_info); 6411 if (err && inode) { 6412 inode_dec_link_count(inode); 6413 discard_new_inode(inode); 6414 } 6415 return err; 6416 } 6417 6418 static int btrfs_create(struct inode *dir, struct dentry *dentry, 6419 umode_t mode, bool excl) 6420 { 6421 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6422 struct btrfs_trans_handle *trans; 6423 struct btrfs_root *root = BTRFS_I(dir)->root; 6424 struct inode *inode = NULL; 6425 int err; 6426 u64 objectid; 6427 u64 index = 0; 6428 6429 /* 6430 * 2 for inode item and ref 6431 * 2 for dir items 6432 * 1 for xattr if selinux is on 6433 */ 6434 trans = btrfs_start_transaction(root, 5); 6435 if (IS_ERR(trans)) 6436 return PTR_ERR(trans); 6437 6438 err = btrfs_find_free_objectid(root, &objectid); 6439 if (err) 6440 goto out_unlock; 6441 6442 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6443 dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, 6444 mode, &index); 6445 if (IS_ERR(inode)) { 6446 err = PTR_ERR(inode); 6447 inode = NULL; 6448 goto out_unlock; 6449 } 6450 /* 6451 * If the active LSM wants to access the inode during 6452 * d_instantiate it needs these. Smack checks to see 6453 * if the filesystem supports xattrs by looking at the 6454 * ops vector. 
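	 *
	 * That is why i_op, i_fop and the address space ops are assigned here,
	 * before btrfs_init_inode_security() and d_instantiate_new() run below.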
6455 */ 6456 inode->i_fop = &btrfs_file_operations; 6457 inode->i_op = &btrfs_file_inode_operations; 6458 inode->i_mapping->a_ops = &btrfs_aops; 6459 6460 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6461 if (err) 6462 goto out_unlock; 6463 6464 err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 6465 if (err) 6466 goto out_unlock; 6467 6468 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6469 0, index); 6470 if (err) 6471 goto out_unlock; 6472 6473 d_instantiate_new(dentry, inode); 6474 6475 out_unlock: 6476 btrfs_end_transaction(trans); 6477 if (err && inode) { 6478 inode_dec_link_count(inode); 6479 discard_new_inode(inode); 6480 } 6481 btrfs_btree_balance_dirty(fs_info); 6482 return err; 6483 } 6484 6485 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 6486 struct dentry *dentry) 6487 { 6488 struct btrfs_trans_handle *trans = NULL; 6489 struct btrfs_root *root = BTRFS_I(dir)->root; 6490 struct inode *inode = d_inode(old_dentry); 6491 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 6492 u64 index; 6493 int err; 6494 int drop_inode = 0; 6495 6496 /* do not allow sys_link's with other subvols of the same device */ 6497 if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) 6498 return -EXDEV; 6499 6500 if (inode->i_nlink >= BTRFS_LINK_MAX) 6501 return -EMLINK; 6502 6503 err = btrfs_set_inode_index(BTRFS_I(dir), &index); 6504 if (err) 6505 goto fail; 6506 6507 /* 6508 * 2 items for inode and inode ref 6509 * 2 items for dir items 6510 * 1 item for parent inode 6511 * 1 item for orphan item deletion if O_TMPFILE 6512 */ 6513 trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); 6514 if (IS_ERR(trans)) { 6515 err = PTR_ERR(trans); 6516 trans = NULL; 6517 goto fail; 6518 } 6519 6520 /* There are several dir indexes for this inode, clear the cache. */ 6521 BTRFS_I(inode)->dir_index = 0ULL; 6522 inc_nlink(inode); 6523 inode_inc_iversion(inode); 6524 inode->i_ctime = current_time(inode); 6525 ihold(inode); 6526 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 6527 6528 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6529 1, index); 6530 6531 if (err) { 6532 drop_inode = 1; 6533 } else { 6534 struct dentry *parent = dentry->d_parent; 6535 6536 err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 6537 if (err) 6538 goto fail; 6539 if (inode->i_nlink == 1) { 6540 /* 6541 * If new hard link count is 1, it's a file created 6542 * with open(2) O_TMPFILE flag. 
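			 *
			 * The orphan item added when the O_TMPFILE inode was
			 * created is no longer needed now that the inode has a
			 * name, so delete it.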
6543 */ 6544 err = btrfs_orphan_del(trans, BTRFS_I(inode)); 6545 if (err) 6546 goto fail; 6547 } 6548 d_instantiate(dentry, inode); 6549 btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); 6550 } 6551 6552 fail: 6553 if (trans) 6554 btrfs_end_transaction(trans); 6555 if (drop_inode) { 6556 inode_dec_link_count(inode); 6557 iput(inode); 6558 } 6559 btrfs_btree_balance_dirty(fs_info); 6560 return err; 6561 } 6562 6563 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 6564 { 6565 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6566 struct inode *inode = NULL; 6567 struct btrfs_trans_handle *trans; 6568 struct btrfs_root *root = BTRFS_I(dir)->root; 6569 int err = 0; 6570 u64 objectid = 0; 6571 u64 index = 0; 6572 6573 /* 6574 * 2 items for inode and ref 6575 * 2 items for dir items 6576 * 1 for xattr if selinux is on 6577 */ 6578 trans = btrfs_start_transaction(root, 5); 6579 if (IS_ERR(trans)) 6580 return PTR_ERR(trans); 6581 6582 err = btrfs_find_free_objectid(root, &objectid); 6583 if (err) 6584 goto out_fail; 6585 6586 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6587 dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, 6588 S_IFDIR | mode, &index); 6589 if (IS_ERR(inode)) { 6590 err = PTR_ERR(inode); 6591 inode = NULL; 6592 goto out_fail; 6593 } 6594 6595 /* these must be set before we unlock the inode */ 6596 inode->i_op = &btrfs_dir_inode_operations; 6597 inode->i_fop = &btrfs_dir_file_operations; 6598 6599 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6600 if (err) 6601 goto out_fail; 6602 6603 btrfs_i_size_write(BTRFS_I(inode), 0); 6604 err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 6605 if (err) 6606 goto out_fail; 6607 6608 err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 6609 dentry->d_name.name, 6610 dentry->d_name.len, 0, index); 6611 if (err) 6612 goto out_fail; 6613 6614 d_instantiate_new(dentry, inode); 6615 6616 out_fail: 6617 btrfs_end_transaction(trans); 6618 if (err && inode) { 6619 inode_dec_link_count(inode); 6620 discard_new_inode(inode); 6621 } 6622 btrfs_btree_balance_dirty(fs_info); 6623 return err; 6624 } 6625 6626 static noinline int uncompress_inline(struct btrfs_path *path, 6627 struct page *page, 6628 size_t pg_offset, u64 extent_offset, 6629 struct btrfs_file_extent_item *item) 6630 { 6631 int ret; 6632 struct extent_buffer *leaf = path->nodes[0]; 6633 char *tmp; 6634 size_t max_size; 6635 unsigned long inline_size; 6636 unsigned long ptr; 6637 int compress_type; 6638 6639 WARN_ON(pg_offset != 0); 6640 compress_type = btrfs_file_extent_compression(leaf, item); 6641 max_size = btrfs_file_extent_ram_bytes(leaf, item); 6642 inline_size = btrfs_file_extent_inline_item_len(leaf, 6643 btrfs_item_nr(path->slots[0])); 6644 tmp = kmalloc(inline_size, GFP_NOFS); 6645 if (!tmp) 6646 return -ENOMEM; 6647 ptr = btrfs_file_extent_inline_start(item); 6648 6649 read_extent_buffer(leaf, tmp, ptr, inline_size); 6650 6651 max_size = min_t(unsigned long, PAGE_SIZE, max_size); 6652 ret = btrfs_decompress(compress_type, tmp, page, 6653 extent_offset, inline_size, max_size); 6654 6655 /* 6656 * decompression code contains a memset to fill in any space between the end 6657 * of the uncompressed data and the end of max_size in case the decompressed 6658 * data ends up shorter than ram_bytes. That doesn't cover the hole between 6659 * the end of an inline extent and the beginning of the next block, so we 6660 * cover that region here. 
6661 */ 6662 6663 if (max_size + pg_offset < PAGE_SIZE) { 6664 char *map = kmap(page); 6665 memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset); 6666 kunmap(page); 6667 } 6668 kfree(tmp); 6669 return ret; 6670 } 6671 6672 /** 6673 * btrfs_get_extent - Lookup the first extent overlapping a range in a file. 6674 * @inode: file to search in 6675 * @page: page to read extent data into if the extent is inline 6676 * @pg_offset: offset into @page to copy to 6677 * @start: file offset 6678 * @len: length of range starting at @start 6679 * 6680 * This returns the first &struct extent_map which overlaps with the given 6681 * range, reading it from the B-tree and caching it if necessary. Note that 6682 * there may be more extents which overlap the given range after the returned 6683 * extent_map. 6684 * 6685 * If @page is not NULL and the extent is inline, this also reads the extent 6686 * data directly into the page and marks the extent up to date in the io_tree. 6687 * 6688 * Return: ERR_PTR on error, non-NULL extent_map on success. 6689 */ 6690 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, 6691 struct page *page, size_t pg_offset, 6692 u64 start, u64 len) 6693 { 6694 struct btrfs_fs_info *fs_info = inode->root->fs_info; 6695 int ret = 0; 6696 u64 extent_start = 0; 6697 u64 extent_end = 0; 6698 u64 objectid = btrfs_ino(inode); 6699 int extent_type = -1; 6700 struct btrfs_path *path = NULL; 6701 struct btrfs_root *root = inode->root; 6702 struct btrfs_file_extent_item *item; 6703 struct extent_buffer *leaf; 6704 struct btrfs_key found_key; 6705 struct extent_map *em = NULL; 6706 struct extent_map_tree *em_tree = &inode->extent_tree; 6707 struct extent_io_tree *io_tree = &inode->io_tree; 6708 6709 read_lock(&em_tree->lock); 6710 em = lookup_extent_mapping(em_tree, start, len); 6711 read_unlock(&em_tree->lock); 6712 6713 if (em) { 6714 if (em->start > start || em->start + em->len <= start) 6715 free_extent_map(em); 6716 else if (em->block_start == EXTENT_MAP_INLINE && page) 6717 free_extent_map(em); 6718 else 6719 goto out; 6720 } 6721 em = alloc_extent_map(); 6722 if (!em) { 6723 ret = -ENOMEM; 6724 goto out; 6725 } 6726 em->start = EXTENT_MAP_HOLE; 6727 em->orig_start = EXTENT_MAP_HOLE; 6728 em->len = (u64)-1; 6729 em->block_len = (u64)-1; 6730 6731 path = btrfs_alloc_path(); 6732 if (!path) { 6733 ret = -ENOMEM; 6734 goto out; 6735 } 6736 6737 /* Chances are we'll be called again, so go ahead and do readahead */ 6738 path->reada = READA_FORWARD; 6739 6740 /* 6741 * The same explanation in load_free_space_cache applies here as well, 6742 * we only read when we're loading the free space cache, and at that 6743 * point the commit_root has everything we need. 
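	 *
	 * That is why the search below uses the commit root and skips locking
	 * when this is the free space inode.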
6744 */ 6745 if (btrfs_is_free_space_inode(inode)) { 6746 path->search_commit_root = 1; 6747 path->skip_locking = 1; 6748 } 6749 6750 ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); 6751 if (ret < 0) { 6752 goto out; 6753 } else if (ret > 0) { 6754 if (path->slots[0] == 0) 6755 goto not_found; 6756 path->slots[0]--; 6757 ret = 0; 6758 } 6759 6760 leaf = path->nodes[0]; 6761 item = btrfs_item_ptr(leaf, path->slots[0], 6762 struct btrfs_file_extent_item); 6763 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6764 if (found_key.objectid != objectid || 6765 found_key.type != BTRFS_EXTENT_DATA_KEY) { 6766 /* 6767 * If we backup past the first extent we want to move forward 6768 * and see if there is an extent in front of us, otherwise we'll 6769 * say there is a hole for our whole search range which can 6770 * cause problems. 6771 */ 6772 extent_end = start; 6773 goto next; 6774 } 6775 6776 extent_type = btrfs_file_extent_type(leaf, item); 6777 extent_start = found_key.offset; 6778 extent_end = btrfs_file_extent_end(path); 6779 if (extent_type == BTRFS_FILE_EXTENT_REG || 6780 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 6781 /* Only regular file could have regular/prealloc extent */ 6782 if (!S_ISREG(inode->vfs_inode.i_mode)) { 6783 ret = -EUCLEAN; 6784 btrfs_crit(fs_info, 6785 "regular/prealloc extent found for non-regular inode %llu", 6786 btrfs_ino(inode)); 6787 goto out; 6788 } 6789 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, 6790 extent_start); 6791 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 6792 trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, 6793 path->slots[0], 6794 extent_start); 6795 } 6796 next: 6797 if (start >= extent_end) { 6798 path->slots[0]++; 6799 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 6800 ret = btrfs_next_leaf(root, path); 6801 if (ret < 0) 6802 goto out; 6803 else if (ret > 0) 6804 goto not_found; 6805 6806 leaf = path->nodes[0]; 6807 } 6808 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6809 if (found_key.objectid != objectid || 6810 found_key.type != BTRFS_EXTENT_DATA_KEY) 6811 goto not_found; 6812 if (start + len <= found_key.offset) 6813 goto not_found; 6814 if (start > found_key.offset) 6815 goto next; 6816 6817 /* New extent overlaps with existing one */ 6818 em->start = start; 6819 em->orig_start = start; 6820 em->len = found_key.offset - start; 6821 em->block_start = EXTENT_MAP_HOLE; 6822 goto insert; 6823 } 6824 6825 btrfs_extent_item_to_extent_map(inode, path, item, !page, em); 6826 6827 if (extent_type == BTRFS_FILE_EXTENT_REG || 6828 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 6829 goto insert; 6830 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 6831 unsigned long ptr; 6832 char *map; 6833 size_t size; 6834 size_t extent_offset; 6835 size_t copy_size; 6836 6837 if (!page) 6838 goto out; 6839 6840 size = btrfs_file_extent_ram_bytes(leaf, item); 6841 extent_offset = page_offset(page) + pg_offset - extent_start; 6842 copy_size = min_t(u64, PAGE_SIZE - pg_offset, 6843 size - extent_offset); 6844 em->start = extent_start + extent_offset; 6845 em->len = ALIGN(copy_size, fs_info->sectorsize); 6846 em->orig_block_len = em->len; 6847 em->orig_start = em->start; 6848 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 6849 6850 if (!PageUptodate(page)) { 6851 if (btrfs_file_extent_compression(leaf, item) != 6852 BTRFS_COMPRESS_NONE) { 6853 ret = uncompress_inline(path, page, pg_offset, 6854 extent_offset, item); 6855 if (ret) 6856 goto out; 6857 } else { 6858 map = 
kmap(page); 6859 read_extent_buffer(leaf, map + pg_offset, ptr, 6860 copy_size); 6861 if (pg_offset + copy_size < PAGE_SIZE) { 6862 memset(map + pg_offset + copy_size, 0, 6863 PAGE_SIZE - pg_offset - 6864 copy_size); 6865 } 6866 kunmap(page); 6867 } 6868 flush_dcache_page(page); 6869 } 6870 set_extent_uptodate(io_tree, em->start, 6871 extent_map_end(em) - 1, NULL, GFP_NOFS); 6872 goto insert; 6873 } 6874 not_found: 6875 em->start = start; 6876 em->orig_start = start; 6877 em->len = len; 6878 em->block_start = EXTENT_MAP_HOLE; 6879 insert: 6880 ret = 0; 6881 btrfs_release_path(path); 6882 if (em->start > start || extent_map_end(em) <= start) { 6883 btrfs_err(fs_info, 6884 "bad extent! em: [%llu %llu] passed [%llu %llu]", 6885 em->start, em->len, start, len); 6886 ret = -EIO; 6887 goto out; 6888 } 6889 6890 write_lock(&em_tree->lock); 6891 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); 6892 write_unlock(&em_tree->lock); 6893 out: 6894 btrfs_free_path(path); 6895 6896 trace_btrfs_get_extent(root, inode, em); 6897 6898 if (ret) { 6899 free_extent_map(em); 6900 return ERR_PTR(ret); 6901 } 6902 return em; 6903 } 6904 6905 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, 6906 u64 start, u64 len) 6907 { 6908 struct extent_map *em; 6909 struct extent_map *hole_em = NULL; 6910 u64 delalloc_start = start; 6911 u64 end; 6912 u64 delalloc_len; 6913 u64 delalloc_end; 6914 int err = 0; 6915 6916 em = btrfs_get_extent(inode, NULL, 0, start, len); 6917 if (IS_ERR(em)) 6918 return em; 6919 /* 6920 * If our em maps to: 6921 * - a hole or 6922 * - a pre-alloc extent, 6923 * there might actually be delalloc bytes behind it. 6924 */ 6925 if (em->block_start != EXTENT_MAP_HOLE && 6926 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 6927 return em; 6928 else 6929 hole_em = em; 6930 6931 /* check to see if we've wrapped (len == -1 or similar) */ 6932 end = start + len; 6933 if (end < start) 6934 end = (u64)-1; 6935 else 6936 end -= 1; 6937 6938 em = NULL; 6939 6940 /* ok, we didn't find anything, lets look for delalloc */ 6941 delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start, 6942 end, len, EXTENT_DELALLOC, 1); 6943 delalloc_end = delalloc_start + delalloc_len; 6944 if (delalloc_end < delalloc_start) 6945 delalloc_end = (u64)-1; 6946 6947 /* 6948 * We didn't find anything useful, return the original results from 6949 * get_extent() 6950 */ 6951 if (delalloc_start > end || delalloc_end <= start) { 6952 em = hole_em; 6953 hole_em = NULL; 6954 goto out; 6955 } 6956 6957 /* 6958 * Adjust the delalloc_start to make sure it doesn't go backwards from 6959 * the start they passed in 6960 */ 6961 delalloc_start = max(start, delalloc_start); 6962 delalloc_len = delalloc_end - delalloc_start; 6963 6964 if (delalloc_len > 0) { 6965 u64 hole_start; 6966 u64 hole_len; 6967 const u64 hole_end = extent_map_end(hole_em); 6968 6969 em = alloc_extent_map(); 6970 if (!em) { 6971 err = -ENOMEM; 6972 goto out; 6973 } 6974 6975 ASSERT(hole_em); 6976 /* 6977 * When btrfs_get_extent can't find anything it returns one 6978 * huge hole 6979 * 6980 * Make sure what it found really fits our range, and adjust to 6981 * make sure it is based on the start from the caller 6982 */ 6983 if (hole_end <= start || hole_em->start > end) { 6984 free_extent_map(hole_em); 6985 hole_em = NULL; 6986 } else { 6987 hole_start = max(hole_em->start, start); 6988 hole_len = hole_end - hole_start; 6989 } 6990 6991 if (hole_em && delalloc_start > hole_start) { 6992 /* 6993 * Our hole starts before our delalloc, 
so we have to 6994 * return just the parts of the hole that go until the 6995 * delalloc starts 6996 */ 6997 em->len = min(hole_len, delalloc_start - hole_start); 6998 em->start = hole_start; 6999 em->orig_start = hole_start; 7000 /* 7001 * Don't adjust block start at all, it is fixed at 7002 * EXTENT_MAP_HOLE 7003 */ 7004 em->block_start = hole_em->block_start; 7005 em->block_len = hole_len; 7006 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) 7007 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 7008 } else { 7009 /* 7010 * Hole is out of passed range or it starts after 7011 * delalloc range 7012 */ 7013 em->start = delalloc_start; 7014 em->len = delalloc_len; 7015 em->orig_start = delalloc_start; 7016 em->block_start = EXTENT_MAP_DELALLOC; 7017 em->block_len = delalloc_len; 7018 } 7019 } else { 7020 return hole_em; 7021 } 7022 out: 7023 7024 free_extent_map(hole_em); 7025 if (err) { 7026 free_extent_map(em); 7027 return ERR_PTR(err); 7028 } 7029 return em; 7030 } 7031 7032 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, 7033 const u64 start, 7034 const u64 len, 7035 const u64 orig_start, 7036 const u64 block_start, 7037 const u64 block_len, 7038 const u64 orig_block_len, 7039 const u64 ram_bytes, 7040 const int type) 7041 { 7042 struct extent_map *em = NULL; 7043 int ret; 7044 7045 if (type != BTRFS_ORDERED_NOCOW) { 7046 em = create_io_em(inode, start, len, orig_start, block_start, 7047 block_len, orig_block_len, ram_bytes, 7048 BTRFS_COMPRESS_NONE, /* compress_type */ 7049 type); 7050 if (IS_ERR(em)) 7051 goto out; 7052 } 7053 ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, 7054 block_len, type); 7055 if (ret) { 7056 if (em) { 7057 free_extent_map(em); 7058 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 7059 } 7060 em = ERR_PTR(ret); 7061 } 7062 out: 7063 7064 return em; 7065 } 7066 7067 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, 7068 u64 start, u64 len) 7069 { 7070 struct btrfs_root *root = inode->root; 7071 struct btrfs_fs_info *fs_info = root->fs_info; 7072 struct extent_map *em; 7073 struct btrfs_key ins; 7074 u64 alloc_hint; 7075 int ret; 7076 7077 alloc_hint = get_extent_allocation_hint(inode, start, len); 7078 ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 7079 0, alloc_hint, &ins, 1, 1); 7080 if (ret) 7081 return ERR_PTR(ret); 7082 7083 em = btrfs_create_dio_extent(inode, start, ins.offset, start, 7084 ins.objectid, ins.offset, ins.offset, 7085 ins.offset, BTRFS_ORDERED_REGULAR); 7086 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 7087 if (IS_ERR(em)) 7088 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 7089 1); 7090 7091 return em; 7092 } 7093 7094 /* 7095 * Check if we can do nocow write into the range [@offset, @offset + @len) 7096 * 7097 * @offset: File offset 7098 * @len: The length to write, will be updated to the nocow writeable 7099 * range 7100 * @orig_start: (optional) Return the original file offset of the file extent 7101 * @orig_len: (optional) Return the original on-disk length of the file extent 7102 * @ram_bytes: (optional) Return the ram_bytes of the file extent 7103 * @strict: if true, omit optimizations that might force us into unnecessary 7104 * cow. e.g., don't trust generation number. 7105 * 7106 * This function will flush ordered extents in the range to ensure proper 7107 * nocow checks for (nowait == false) case. 
7108 * 7109 * Return: 7110 * >0 and update @len if we can do nocow write 7111 * 0 if we can't do nocow write 7112 * <0 if error happened 7113 * 7114 * NOTE: This only checks the file extents, caller is responsible to wait for 7115 * any ordered extents. 7116 */ 7117 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, 7118 u64 *orig_start, u64 *orig_block_len, 7119 u64 *ram_bytes, bool strict) 7120 { 7121 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7122 struct btrfs_path *path; 7123 int ret; 7124 struct extent_buffer *leaf; 7125 struct btrfs_root *root = BTRFS_I(inode)->root; 7126 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 7127 struct btrfs_file_extent_item *fi; 7128 struct btrfs_key key; 7129 u64 disk_bytenr; 7130 u64 backref_offset; 7131 u64 extent_end; 7132 u64 num_bytes; 7133 int slot; 7134 int found_type; 7135 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); 7136 7137 path = btrfs_alloc_path(); 7138 if (!path) 7139 return -ENOMEM; 7140 7141 ret = btrfs_lookup_file_extent(NULL, root, path, 7142 btrfs_ino(BTRFS_I(inode)), offset, 0); 7143 if (ret < 0) 7144 goto out; 7145 7146 slot = path->slots[0]; 7147 if (ret == 1) { 7148 if (slot == 0) { 7149 /* can't find the item, must cow */ 7150 ret = 0; 7151 goto out; 7152 } 7153 slot--; 7154 } 7155 ret = 0; 7156 leaf = path->nodes[0]; 7157 btrfs_item_key_to_cpu(leaf, &key, slot); 7158 if (key.objectid != btrfs_ino(BTRFS_I(inode)) || 7159 key.type != BTRFS_EXTENT_DATA_KEY) { 7160 /* not our file or wrong item type, must cow */ 7161 goto out; 7162 } 7163 7164 if (key.offset > offset) { 7165 /* Wrong offset, must cow */ 7166 goto out; 7167 } 7168 7169 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 7170 found_type = btrfs_file_extent_type(leaf, fi); 7171 if (found_type != BTRFS_FILE_EXTENT_REG && 7172 found_type != BTRFS_FILE_EXTENT_PREALLOC) { 7173 /* not a regular extent, must cow */ 7174 goto out; 7175 } 7176 7177 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) 7178 goto out; 7179 7180 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 7181 if (extent_end <= offset) 7182 goto out; 7183 7184 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 7185 if (disk_bytenr == 0) 7186 goto out; 7187 7188 if (btrfs_file_extent_compression(leaf, fi) || 7189 btrfs_file_extent_encryption(leaf, fi) || 7190 btrfs_file_extent_other_encoding(leaf, fi)) 7191 goto out; 7192 7193 /* 7194 * Do the same check as in btrfs_cross_ref_exist but without the 7195 * unnecessary search. 
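	 *
	 * If the extent is not newer than the last snapshot it may be shared
	 * with that snapshot, so bail out and force COW. With @strict set this
	 * generation shortcut is skipped and we rely solely on the full check
	 * in btrfs_cross_ref_exist() below.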
7196 */ 7197 if (!strict && 7198 (btrfs_file_extent_generation(leaf, fi) <= 7199 btrfs_root_last_snapshot(&root->root_item))) 7200 goto out; 7201 7202 backref_offset = btrfs_file_extent_offset(leaf, fi); 7203 7204 if (orig_start) { 7205 *orig_start = key.offset - backref_offset; 7206 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); 7207 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 7208 } 7209 7210 if (btrfs_extent_readonly(fs_info, disk_bytenr)) 7211 goto out; 7212 7213 num_bytes = min(offset + *len, extent_end) - offset; 7214 if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { 7215 u64 range_end; 7216 7217 range_end = round_up(offset + num_bytes, 7218 root->fs_info->sectorsize) - 1; 7219 ret = test_range_bit(io_tree, offset, range_end, 7220 EXTENT_DELALLOC, 0, NULL); 7221 if (ret) { 7222 ret = -EAGAIN; 7223 goto out; 7224 } 7225 } 7226 7227 btrfs_release_path(path); 7228 7229 /* 7230 * look for other files referencing this extent, if we 7231 * find any we must cow 7232 */ 7233 7234 ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), 7235 key.offset - backref_offset, disk_bytenr, 7236 strict); 7237 if (ret) { 7238 ret = 0; 7239 goto out; 7240 } 7241 7242 /* 7243 * adjust disk_bytenr and num_bytes to cover just the bytes 7244 * in this extent we are about to write. If there 7245 * are any csums in that range we have to cow in order 7246 * to keep the csums correct 7247 */ 7248 disk_bytenr += backref_offset; 7249 disk_bytenr += offset - key.offset; 7250 if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes)) 7251 goto out; 7252 /* 7253 * all of the above have passed, it is safe to overwrite this extent 7254 * without cow 7255 */ 7256 *len = num_bytes; 7257 ret = 1; 7258 out: 7259 btrfs_free_path(path); 7260 return ret; 7261 } 7262 7263 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, 7264 struct extent_state **cached_state, bool writing) 7265 { 7266 struct btrfs_ordered_extent *ordered; 7267 int ret = 0; 7268 7269 while (1) { 7270 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7271 cached_state); 7272 /* 7273 * We're concerned with the entire range that we're going to be 7274 * doing DIO to, so we need to make sure there's no ordered 7275 * extents in this range. 7276 */ 7277 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, 7278 lockend - lockstart + 1); 7279 7280 /* 7281 * We need to make sure there are no buffered pages in this 7282 * range either, we could have raced between the invalidate in 7283 * generic_file_direct_write and locking the extent. The 7284 * invalidate needs to happen so that reads after a write do not 7285 * get stale data. 7286 */ 7287 if (!ordered && 7288 (!writing || !filemap_range_has_page(inode->i_mapping, 7289 lockstart, lockend))) 7290 break; 7291 7292 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7293 cached_state); 7294 7295 if (ordered) { 7296 /* 7297 * If we are doing a DIO read and the ordered extent we 7298 * found is for a buffered write, we can not wait for it 7299 * to complete and retry, because if we do so we can 7300 * deadlock with concurrent buffered writes on page 7301 * locks. 
This happens only if our DIO read covers more 7302 * than one extent map, if at this point has already 7303 * created an ordered extent for a previous extent map 7304 * and locked its range in the inode's io tree, and a 7305 * concurrent write against that previous extent map's 7306 * range and this range started (we unlock the ranges 7307 * in the io tree only when the bios complete and 7308 * buffered writes always lock pages before attempting 7309 * to lock range in the io tree). 7310 */ 7311 if (writing || 7312 test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) 7313 btrfs_start_ordered_extent(ordered, 1); 7314 else 7315 ret = -ENOTBLK; 7316 btrfs_put_ordered_extent(ordered); 7317 } else { 7318 /* 7319 * We could trigger writeback for this range (and wait 7320 * for it to complete) and then invalidate the pages for 7321 * this range (through invalidate_inode_pages2_range()), 7322 * but that can lead us to a deadlock with a concurrent 7323 * call to readahead (a buffered read or a defrag call 7324 * triggered a readahead) on a page lock due to an 7325 * ordered dio extent we created before but did not have 7326 * yet a corresponding bio submitted (whence it can not 7327 * complete), which makes readahead wait for that 7328 * ordered extent to complete while holding a lock on 7329 * that page. 7330 */ 7331 ret = -ENOTBLK; 7332 } 7333 7334 if (ret) 7335 break; 7336 7337 cond_resched(); 7338 } 7339 7340 return ret; 7341 } 7342 7343 /* The callers of this must take lock_extent() */ 7344 static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, 7345 u64 len, u64 orig_start, u64 block_start, 7346 u64 block_len, u64 orig_block_len, 7347 u64 ram_bytes, int compress_type, 7348 int type) 7349 { 7350 struct extent_map_tree *em_tree; 7351 struct extent_map *em; 7352 int ret; 7353 7354 ASSERT(type == BTRFS_ORDERED_PREALLOC || 7355 type == BTRFS_ORDERED_COMPRESSED || 7356 type == BTRFS_ORDERED_NOCOW || 7357 type == BTRFS_ORDERED_REGULAR); 7358 7359 em_tree = &inode->extent_tree; 7360 em = alloc_extent_map(); 7361 if (!em) 7362 return ERR_PTR(-ENOMEM); 7363 7364 em->start = start; 7365 em->orig_start = orig_start; 7366 em->len = len; 7367 em->block_len = block_len; 7368 em->block_start = block_start; 7369 em->orig_block_len = orig_block_len; 7370 em->ram_bytes = ram_bytes; 7371 em->generation = -1; 7372 set_bit(EXTENT_FLAG_PINNED, &em->flags); 7373 if (type == BTRFS_ORDERED_PREALLOC) { 7374 set_bit(EXTENT_FLAG_FILLING, &em->flags); 7375 } else if (type == BTRFS_ORDERED_COMPRESSED) { 7376 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 7377 em->compress_type = compress_type; 7378 } 7379 7380 do { 7381 btrfs_drop_extent_cache(inode, em->start, 7382 em->start + em->len - 1, 0); 7383 write_lock(&em_tree->lock); 7384 ret = add_extent_mapping(em_tree, em, 1); 7385 write_unlock(&em_tree->lock); 7386 /* 7387 * The caller has taken lock_extent(), who could race with us 7388 * to add em? 7389 */ 7390 } while (ret == -EEXIST); 7391 7392 if (ret) { 7393 free_extent_map(em); 7394 return ERR_PTR(ret); 7395 } 7396 7397 /* em got 2 refs now, callers needs to do free_extent_map once. 
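	 * One reference is held by the extent map tree the em was just added
	 * to, the other belongs to the caller.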
*/ 7398 return em; 7399 } 7400 7401 7402 static int btrfs_get_blocks_direct_write(struct extent_map **map, 7403 struct inode *inode, 7404 struct btrfs_dio_data *dio_data, 7405 u64 start, u64 len) 7406 { 7407 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7408 struct extent_map *em = *map; 7409 int ret = 0; 7410 7411 /* 7412 * We don't allocate a new extent in the following cases 7413 * 7414 * 1) The inode is marked as NODATACOW. In this case we'll just use the 7415 * existing extent. 7416 * 2) The extent is marked as PREALLOC. We're good to go here and can 7417 * just use the extent. 7418 * 7419 */ 7420 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 7421 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 7422 em->block_start != EXTENT_MAP_HOLE)) { 7423 int type; 7424 u64 block_start, orig_start, orig_block_len, ram_bytes; 7425 7426 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 7427 type = BTRFS_ORDERED_PREALLOC; 7428 else 7429 type = BTRFS_ORDERED_NOCOW; 7430 len = min(len, em->len - (start - em->start)); 7431 block_start = em->block_start + (start - em->start); 7432 7433 if (can_nocow_extent(inode, start, &len, &orig_start, 7434 &orig_block_len, &ram_bytes, false) == 1 && 7435 btrfs_inc_nocow_writers(fs_info, block_start)) { 7436 struct extent_map *em2; 7437 7438 em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, 7439 orig_start, block_start, 7440 len, orig_block_len, 7441 ram_bytes, type); 7442 btrfs_dec_nocow_writers(fs_info, block_start); 7443 if (type == BTRFS_ORDERED_PREALLOC) { 7444 free_extent_map(em); 7445 *map = em = em2; 7446 } 7447 7448 if (em2 && IS_ERR(em2)) { 7449 ret = PTR_ERR(em2); 7450 goto out; 7451 } 7452 /* 7453 * For inode marked NODATACOW or extent marked PREALLOC, 7454 * use the existing or preallocated extent, so does not 7455 * need to adjust btrfs_space_info's bytes_may_use. 7456 */ 7457 btrfs_free_reserved_data_space_noquota(fs_info, len); 7458 goto skip_cow; 7459 } 7460 } 7461 7462 /* this will cow the extent */ 7463 free_extent_map(em); 7464 *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); 7465 if (IS_ERR(em)) { 7466 ret = PTR_ERR(em); 7467 goto out; 7468 } 7469 7470 len = min(len, em->len - (start - em->start)); 7471 7472 skip_cow: 7473 /* 7474 * Need to update the i_size under the extent lock so buffered 7475 * readers will get the updated i_size when we unlock. 7476 */ 7477 if (start + len > i_size_read(inode)) 7478 i_size_write(inode, start + len); 7479 7480 dio_data->reserve -= len; 7481 out: 7482 return ret; 7483 } 7484 7485 static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, 7486 loff_t length, unsigned int flags, struct iomap *iomap, 7487 struct iomap *srcmap) 7488 { 7489 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7490 struct extent_map *em; 7491 struct extent_state *cached_state = NULL; 7492 struct btrfs_dio_data *dio_data = NULL; 7493 u64 lockstart, lockend; 7494 const bool write = !!(flags & IOMAP_WRITE); 7495 int ret = 0; 7496 u64 len = length; 7497 bool unlock_extents = false; 7498 7499 if (!write) 7500 len = min_t(u64, len, fs_info->sectorsize); 7501 7502 lockstart = start; 7503 lockend = start + len - 1; 7504 7505 /* 7506 * The generic stuff only does filemap_write_and_wait_range, which 7507 * isn't enough if we've written compressed pages to this area, so we 7508 * need to flush the dirty pages again to make absolutely sure that any 7509 * outstanding dirty pages are on disk. 
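	 *
	 * This extra flush is only done when BTRFS_INODE_HAS_ASYNC_EXTENT is
	 * set, i.e. when async (compressed) writeback may still be in flight
	 * for this inode.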
7510 */ 7511 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 7512 &BTRFS_I(inode)->runtime_flags)) { 7513 ret = filemap_fdatawrite_range(inode->i_mapping, start, 7514 start + length - 1); 7515 if (ret) 7516 return ret; 7517 } 7518 7519 dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); 7520 if (!dio_data) 7521 return -ENOMEM; 7522 7523 dio_data->length = length; 7524 if (write) { 7525 dio_data->reserve = round_up(length, fs_info->sectorsize); 7526 ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), 7527 &dio_data->data_reserved, 7528 start, dio_data->reserve); 7529 if (ret) { 7530 extent_changeset_free(dio_data->data_reserved); 7531 kfree(dio_data); 7532 return ret; 7533 } 7534 } 7535 iomap->private = dio_data; 7536 7537 7538 /* 7539 * If this errors out it's because we couldn't invalidate pagecache for 7540 * this range and we need to fallback to buffered. 7541 */ 7542 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { 7543 ret = -ENOTBLK; 7544 goto err; 7545 } 7546 7547 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); 7548 if (IS_ERR(em)) { 7549 ret = PTR_ERR(em); 7550 goto unlock_err; 7551 } 7552 7553 /* 7554 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered 7555 * io. INLINE is special, and we could probably kludge it in here, but 7556 * it's still buffered so for safety lets just fall back to the generic 7557 * buffered path. 7558 * 7559 * For COMPRESSED we _have_ to read the entire extent in so we can 7560 * decompress it, so there will be buffering required no matter what we 7561 * do, so go ahead and fallback to buffered. 7562 * 7563 * We return -ENOTBLK because that's what makes DIO go ahead and go back 7564 * to buffered IO. Don't blame me, this is the price we pay for using 7565 * the generic code. 7566 */ 7567 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 7568 em->block_start == EXTENT_MAP_INLINE) { 7569 free_extent_map(em); 7570 ret = -ENOTBLK; 7571 goto unlock_err; 7572 } 7573 7574 len = min(len, em->len - (start - em->start)); 7575 if (write) { 7576 ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, 7577 start, len); 7578 if (ret < 0) 7579 goto unlock_err; 7580 unlock_extents = true; 7581 /* Recalc len in case the new em is smaller than requested */ 7582 len = min(len, em->len - (start - em->start)); 7583 } else { 7584 /* 7585 * We need to unlock only the end area that we aren't using. 7586 * The rest is going to be unlocked by the endio routine. 7587 */ 7588 lockstart = start + len; 7589 if (lockstart < lockend) 7590 unlock_extents = true; 7591 } 7592 7593 if (unlock_extents) 7594 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 7595 lockstart, lockend, &cached_state); 7596 else 7597 free_extent_state(cached_state); 7598 7599 /* 7600 * Translate extent map information to iomap. 7601 * We trim the extents (and move the addr) even though iomap code does 7602 * that, since we have locked only the parts we are performing I/O in. 
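 *
 * A worked example with made-up numbers, for a mapped (non-hole) extent:
 * if em->start is 0 and em->block_start is 1M, and the caller asked for
 * start == 8K with len already trimmed to 16K, the code below reports
 *
 *	iomap->addr   = em->block_start + (start - em->start) = 1M + 8K
 *	iomap->offset = start = 8K
 *	iomap->length = len  = 16K
 *
 * i.e. only the sub-range we actually locked, not the whole extent.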
7603 */ 7604 if ((em->block_start == EXTENT_MAP_HOLE) || 7605 (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { 7606 iomap->addr = IOMAP_NULL_ADDR; 7607 iomap->type = IOMAP_HOLE; 7608 } else { 7609 iomap->addr = em->block_start + (start - em->start); 7610 iomap->type = IOMAP_MAPPED; 7611 } 7612 iomap->offset = start; 7613 iomap->bdev = fs_info->fs_devices->latest_bdev; 7614 iomap->length = len; 7615 7616 free_extent_map(em); 7617 7618 return 0; 7619 7620 unlock_err: 7621 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7622 &cached_state); 7623 err: 7624 if (dio_data) { 7625 btrfs_delalloc_release_space(BTRFS_I(inode), 7626 dio_data->data_reserved, start, 7627 dio_data->reserve, true); 7628 btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); 7629 extent_changeset_free(dio_data->data_reserved); 7630 kfree(dio_data); 7631 } 7632 return ret; 7633 } 7634 7635 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, 7636 ssize_t written, unsigned int flags, struct iomap *iomap) 7637 { 7638 int ret = 0; 7639 struct btrfs_dio_data *dio_data = iomap->private; 7640 size_t submitted = dio_data->submitted; 7641 const bool write = !!(flags & IOMAP_WRITE); 7642 7643 if (!write && (iomap->type == IOMAP_HOLE)) { 7644 /* If reading from a hole, unlock and return */ 7645 unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); 7646 goto out; 7647 } 7648 7649 if (submitted < length) { 7650 pos += submitted; 7651 length -= submitted; 7652 if (write) 7653 __endio_write_update_ordered(BTRFS_I(inode), pos, 7654 length, false); 7655 else 7656 unlock_extent(&BTRFS_I(inode)->io_tree, pos, 7657 pos + length - 1); 7658 ret = -ENOTBLK; 7659 } 7660 7661 if (write) { 7662 if (dio_data->reserve) 7663 btrfs_delalloc_release_space(BTRFS_I(inode), 7664 dio_data->data_reserved, pos, 7665 dio_data->reserve, true); 7666 btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); 7667 extent_changeset_free(dio_data->data_reserved); 7668 } 7669 out: 7670 kfree(dio_data); 7671 iomap->private = NULL; 7672 7673 return ret; 7674 } 7675 7676 static void btrfs_dio_private_put(struct btrfs_dio_private *dip) 7677 { 7678 /* 7679 * This implies a barrier so that stores to dio_bio->bi_status before 7680 * this and loads of dio_bio->bi_status after this are fully ordered. 
7681 */ 7682 if (!refcount_dec_and_test(&dip->refs)) 7683 return; 7684 7685 if (bio_op(dip->dio_bio) == REQ_OP_WRITE) { 7686 __endio_write_update_ordered(BTRFS_I(dip->inode), 7687 dip->logical_offset, 7688 dip->bytes, 7689 !dip->dio_bio->bi_status); 7690 } else { 7691 unlock_extent(&BTRFS_I(dip->inode)->io_tree, 7692 dip->logical_offset, 7693 dip->logical_offset + dip->bytes - 1); 7694 } 7695 7696 bio_endio(dip->dio_bio); 7697 kfree(dip); 7698 } 7699 7700 static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, 7701 int mirror_num, 7702 unsigned long bio_flags) 7703 { 7704 struct btrfs_dio_private *dip = bio->bi_private; 7705 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7706 blk_status_t ret; 7707 7708 BUG_ON(bio_op(bio) == REQ_OP_WRITE); 7709 7710 ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); 7711 if (ret) 7712 return ret; 7713 7714 refcount_inc(&dip->refs); 7715 ret = btrfs_map_bio(fs_info, bio, mirror_num); 7716 if (ret) 7717 refcount_dec(&dip->refs); 7718 return ret; 7719 } 7720 7721 static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, 7722 struct btrfs_io_bio *io_bio, 7723 const bool uptodate) 7724 { 7725 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 7726 const u32 sectorsize = fs_info->sectorsize; 7727 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 7728 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 7729 const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); 7730 struct bio_vec bvec; 7731 struct bvec_iter iter; 7732 u64 start = io_bio->logical; 7733 u32 bio_offset = 0; 7734 blk_status_t err = BLK_STS_OK; 7735 7736 __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) { 7737 unsigned int i, nr_sectors, pgoff; 7738 7739 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); 7740 pgoff = bvec.bv_offset; 7741 for (i = 0; i < nr_sectors; i++) { 7742 ASSERT(pgoff < PAGE_SIZE); 7743 if (uptodate && 7744 (!csum || !check_data_csum(inode, io_bio, 7745 bio_offset, bvec.bv_page, pgoff))) { 7746 clean_io_failure(fs_info, failure_tree, io_tree, 7747 start, bvec.bv_page, 7748 btrfs_ino(BTRFS_I(inode)), 7749 pgoff); 7750 } else { 7751 blk_status_t status; 7752 7753 ASSERT((start - io_bio->logical) < UINT_MAX); 7754 status = btrfs_submit_read_repair(inode, 7755 &io_bio->bio, 7756 start - io_bio->logical, 7757 bvec.bv_page, pgoff, 7758 start, 7759 start + sectorsize - 1, 7760 io_bio->mirror_num, 7761 submit_dio_repair_bio); 7762 if (status) 7763 err = status; 7764 } 7765 start += sectorsize; 7766 ASSERT(bio_offset + sectorsize > bio_offset); 7767 bio_offset += sectorsize; 7768 pgoff += sectorsize; 7769 } 7770 } 7771 return err; 7772 } 7773 7774 static void __endio_write_update_ordered(struct btrfs_inode *inode, 7775 const u64 offset, const u64 bytes, 7776 const bool uptodate) 7777 { 7778 struct btrfs_fs_info *fs_info = inode->root->fs_info; 7779 struct btrfs_ordered_extent *ordered = NULL; 7780 struct btrfs_workqueue *wq; 7781 u64 ordered_offset = offset; 7782 u64 ordered_bytes = bytes; 7783 u64 last_offset; 7784 7785 if (btrfs_is_free_space_inode(inode)) 7786 wq = fs_info->endio_freespace_worker; 7787 else 7788 wq = fs_info->endio_write_workers; 7789 7790 while (ordered_offset < offset + bytes) { 7791 last_offset = ordered_offset; 7792 if (btrfs_dec_test_first_ordered_pending(inode, &ordered, 7793 &ordered_offset, 7794 ordered_bytes, 7795 uptodate)) { 7796 btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, 7797 NULL); 7798 btrfs_queue_work(wq, 
&ordered->work); 7799 } 7800 /* 7801 * If btrfs_dec_test_ordered_pending does not find any ordered 7802 * extent in the range, we can exit. 7803 */ 7804 if (ordered_offset == last_offset) 7805 return; 7806 /* 7807 * Our bio might span multiple ordered extents. In this case 7808 * we keep going until we have accounted the whole dio. 7809 */ 7810 if (ordered_offset < offset + bytes) { 7811 ordered_bytes = offset + bytes - ordered_offset; 7812 ordered = NULL; 7813 } 7814 } 7815 } 7816 7817 static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, 7818 struct bio *bio, 7819 u64 dio_file_offset) 7820 { 7821 return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1); 7822 } 7823 7824 static void btrfs_end_dio_bio(struct bio *bio) 7825 { 7826 struct btrfs_dio_private *dip = bio->bi_private; 7827 blk_status_t err = bio->bi_status; 7828 7829 if (err) 7830 btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, 7831 "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", 7832 btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio), 7833 bio->bi_opf, bio->bi_iter.bi_sector, 7834 bio->bi_iter.bi_size, err); 7835 7836 if (bio_op(bio) == REQ_OP_READ) { 7837 err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio), 7838 !err); 7839 } 7840 7841 if (err) 7842 dip->dio_bio->bi_status = err; 7843 7844 bio_put(bio); 7845 btrfs_dio_private_put(dip); 7846 } 7847 7848 static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, 7849 struct inode *inode, u64 file_offset, int async_submit) 7850 { 7851 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7852 struct btrfs_dio_private *dip = bio->bi_private; 7853 bool write = bio_op(bio) == REQ_OP_WRITE; 7854 blk_status_t ret; 7855 7856 /* Check btrfs_submit_bio_hook() for rules about async submit. */ 7857 if (async_submit) 7858 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); 7859 7860 if (!write) { 7861 ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); 7862 if (ret) 7863 goto err; 7864 } 7865 7866 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 7867 goto map; 7868 7869 if (write && async_submit) { 7870 ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset, 7871 btrfs_submit_bio_start_direct_io); 7872 goto err; 7873 } else if (write) { 7874 /* 7875 * If we aren't doing async submit, calculate the csum of the 7876 * bio now. 7877 */ 7878 ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1); 7879 if (ret) 7880 goto err; 7881 } else { 7882 u64 csum_offset; 7883 7884 csum_offset = file_offset - dip->logical_offset; 7885 csum_offset >>= fs_info->sectorsize_bits; 7886 csum_offset *= fs_info->csum_size; 7887 btrfs_io_bio(bio)->csum = dip->csums + csum_offset; 7888 } 7889 map: 7890 ret = btrfs_map_bio(fs_info, bio, 0); 7891 err: 7892 return ret; 7893 } 7894 7895 /* 7896 * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked 7897 * or ordered extents whether or not we submit any bios. 
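 *
 * Illustrative sizing note (this mirrors the allocation below, with made-up
 * numbers): for a checksummed read the csum array lives inline after the
 * struct, so with a 16K dio_bio and 4K sectors
 *
 *	nblocks  = 16K >> sectorsize_bits = 4
 *	dip_size = sizeof(*dip) + 4 * fs_info->csum_size
 *
 * Writes and NODATASUM inodes allocate just sizeof(*dip).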
7898 */ 7899 static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, 7900 struct inode *inode, 7901 loff_t file_offset) 7902 { 7903 const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); 7904 const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); 7905 size_t dip_size; 7906 struct btrfs_dio_private *dip; 7907 7908 dip_size = sizeof(*dip); 7909 if (!write && csum) { 7910 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7911 size_t nblocks; 7912 7913 nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits; 7914 dip_size += fs_info->csum_size * nblocks; 7915 } 7916 7917 dip = kzalloc(dip_size, GFP_NOFS); 7918 if (!dip) 7919 return NULL; 7920 7921 dip->inode = inode; 7922 dip->logical_offset = file_offset; 7923 dip->bytes = dio_bio->bi_iter.bi_size; 7924 dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; 7925 dip->dio_bio = dio_bio; 7926 refcount_set(&dip->refs, 1); 7927 return dip; 7928 } 7929 7930 static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, 7931 struct bio *dio_bio, loff_t file_offset) 7932 { 7933 const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); 7934 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7935 const bool raid56 = (btrfs_data_alloc_profile(fs_info) & 7936 BTRFS_BLOCK_GROUP_RAID56_MASK); 7937 struct btrfs_dio_private *dip; 7938 struct bio *bio; 7939 u64 start_sector; 7940 int async_submit = 0; 7941 u64 submit_len; 7942 int clone_offset = 0; 7943 int clone_len; 7944 int ret; 7945 blk_status_t status; 7946 struct btrfs_io_geometry geom; 7947 struct btrfs_dio_data *dio_data = iomap->private; 7948 7949 dip = btrfs_create_dio_private(dio_bio, inode, file_offset); 7950 if (!dip) { 7951 if (!write) { 7952 unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, 7953 file_offset + dio_bio->bi_iter.bi_size - 1); 7954 } 7955 dio_bio->bi_status = BLK_STS_RESOURCE; 7956 bio_endio(dio_bio); 7957 return BLK_QC_T_NONE; 7958 } 7959 7960 if (!write) { 7961 /* 7962 * Load the csums up front to reduce csum tree searches and 7963 * contention when submitting bios. 7964 * 7965 * If we have csums disabled this will do nothing. 7966 */ 7967 status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); 7968 if (status != BLK_STS_OK) 7969 goto out_err; 7970 } 7971 7972 start_sector = dio_bio->bi_iter.bi_sector; 7973 submit_len = dio_bio->bi_iter.bi_size; 7974 7975 do { 7976 ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio), 7977 start_sector << 9, submit_len, 7978 &geom); 7979 if (ret) { 7980 status = errno_to_blk_status(ret); 7981 goto out_err; 7982 } 7983 ASSERT(geom.len <= INT_MAX); 7984 7985 clone_len = min_t(int, submit_len, geom.len); 7986 7987 /* 7988 * This will never fail as it's passing GPF_NOFS and 7989 * the allocation is backed by btrfs_bioset. 7990 */ 7991 bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); 7992 bio->bi_private = dip; 7993 bio->bi_end_io = btrfs_end_dio_bio; 7994 btrfs_io_bio(bio)->logical = file_offset; 7995 7996 ASSERT(submit_len >= clone_len); 7997 submit_len -= clone_len; 7998 7999 /* 8000 * Increase the count before we submit the bio so we know 8001 * the end IO handler won't happen before we increase the 8002 * count. Otherwise, the dip might get freed before we're 8003 * done setting it up. 8004 * 8005 * We transfer the initial reference to the last bio, so we 8006 * don't need to increment the reference count for the last one. 
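 *
 * Worked example (illustrative only): if the DIO is split into three partial
 * bios, dip->refs goes 1 -> 2 -> 3 while the first two clones are created
 * (submit_len is still non-zero after each of them), and the third clone
 * reuses the initial reference, so the three btrfs_end_dio_bio() completions
 * each call btrfs_dio_private_put() and the last one drops the count to zero
 * and frees the dip.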
8007 */ 8008 if (submit_len > 0) { 8009 refcount_inc(&dip->refs); 8010 /* 8011 * If we are submitting more than one bio, submit them 8012 * all asynchronously. The exception is RAID 5 or 6, as 8013 * asynchronous checksums make it difficult to collect 8014 * full stripe writes. 8015 */ 8016 if (!raid56) 8017 async_submit = 1; 8018 } 8019 8020 status = btrfs_submit_dio_bio(bio, inode, file_offset, 8021 async_submit); 8022 if (status) { 8023 bio_put(bio); 8024 if (submit_len > 0) 8025 refcount_dec(&dip->refs); 8026 goto out_err; 8027 } 8028 8029 dio_data->submitted += clone_len; 8030 clone_offset += clone_len; 8031 start_sector += clone_len >> 9; 8032 file_offset += clone_len; 8033 } while (submit_len > 0); 8034 return BLK_QC_T_NONE; 8035 8036 out_err: 8037 dip->dio_bio->bi_status = status; 8038 btrfs_dio_private_put(dip); 8039 return BLK_QC_T_NONE; 8040 } 8041 8042 const struct iomap_ops btrfs_dio_iomap_ops = { 8043 .iomap_begin = btrfs_dio_iomap_begin, 8044 .iomap_end = btrfs_dio_iomap_end, 8045 }; 8046 8047 const struct iomap_dio_ops btrfs_dio_ops = { 8048 .submit_io = btrfs_submit_direct, 8049 }; 8050 8051 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 8052 u64 start, u64 len) 8053 { 8054 int ret; 8055 8056 ret = fiemap_prep(inode, fieinfo, start, &len, 0); 8057 if (ret) 8058 return ret; 8059 8060 return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); 8061 } 8062 8063 int btrfs_readpage(struct file *file, struct page *page) 8064 { 8065 struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 8066 u64 start = page_offset(page); 8067 u64 end = start + PAGE_SIZE - 1; 8068 unsigned long bio_flags = 0; 8069 struct bio *bio = NULL; 8070 int ret; 8071 8072 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); 8073 8074 ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL); 8075 if (bio) 8076 ret = submit_one_bio(bio, 0, bio_flags); 8077 return ret; 8078 } 8079 8080 static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 8081 { 8082 struct inode *inode = page->mapping->host; 8083 int ret; 8084 8085 if (current->flags & PF_MEMALLOC) { 8086 redirty_page_for_writepage(wbc, page); 8087 unlock_page(page); 8088 return 0; 8089 } 8090 8091 /* 8092 * If we are under memory pressure we will call this directly from the 8093 * VM, we need to make sure we have the inode referenced for the ordered 8094 * extent. If not just return like we didn't do anything. 
8095 */ 8096 if (!igrab(inode)) { 8097 redirty_page_for_writepage(wbc, page); 8098 return AOP_WRITEPAGE_ACTIVATE; 8099 } 8100 ret = extent_write_full_page(page, wbc); 8101 btrfs_add_delayed_iput(inode); 8102 return ret; 8103 } 8104 8105 static int btrfs_writepages(struct address_space *mapping, 8106 struct writeback_control *wbc) 8107 { 8108 return extent_writepages(mapping, wbc); 8109 } 8110 8111 static void btrfs_readahead(struct readahead_control *rac) 8112 { 8113 extent_readahead(rac); 8114 } 8115 8116 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) 8117 { 8118 int ret = try_release_extent_mapping(page, gfp_flags); 8119 if (ret == 1) 8120 detach_page_private(page); 8121 return ret; 8122 } 8123 8124 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) 8125 { 8126 if (PageWriteback(page) || PageDirty(page)) 8127 return 0; 8128 return __btrfs_releasepage(page, gfp_flags); 8129 } 8130 8131 #ifdef CONFIG_MIGRATION 8132 static int btrfs_migratepage(struct address_space *mapping, 8133 struct page *newpage, struct page *page, 8134 enum migrate_mode mode) 8135 { 8136 int ret; 8137 8138 ret = migrate_page_move_mapping(mapping, newpage, page, 0); 8139 if (ret != MIGRATEPAGE_SUCCESS) 8140 return ret; 8141 8142 if (page_has_private(page)) 8143 attach_page_private(newpage, detach_page_private(page)); 8144 8145 if (PagePrivate2(page)) { 8146 ClearPagePrivate2(page); 8147 SetPagePrivate2(newpage); 8148 } 8149 8150 if (mode != MIGRATE_SYNC_NO_COPY) 8151 migrate_page_copy(newpage, page); 8152 else 8153 migrate_page_states(newpage, page); 8154 return MIGRATEPAGE_SUCCESS; 8155 } 8156 #endif 8157 8158 static void btrfs_invalidatepage(struct page *page, unsigned int offset, 8159 unsigned int length) 8160 { 8161 struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 8162 struct extent_io_tree *tree = &inode->io_tree; 8163 struct btrfs_ordered_extent *ordered; 8164 struct extent_state *cached_state = NULL; 8165 u64 page_start = page_offset(page); 8166 u64 page_end = page_start + PAGE_SIZE - 1; 8167 u64 start; 8168 u64 end; 8169 int inode_evicting = inode->vfs_inode.i_state & I_FREEING; 8170 bool found_ordered = false; 8171 bool completed_ordered = false; 8172 8173 /* 8174 * we have the page locked, so new writeback can't start, 8175 * and the dirty bit won't be cleared while we are here. 8176 * 8177 * Wait for IO on this page so that we can safely clear 8178 * the PagePrivate2 bit and do ordered accounting 8179 */ 8180 wait_on_page_writeback(page); 8181 8182 if (offset) { 8183 btrfs_releasepage(page, GFP_NOFS); 8184 return; 8185 } 8186 8187 if (!inode_evicting) 8188 lock_extent_bits(tree, page_start, page_end, &cached_state); 8189 again: 8190 start = page_start; 8191 ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); 8192 if (ordered) { 8193 found_ordered = true; 8194 end = min(page_end, 8195 ordered->file_offset + ordered->num_bytes - 1); 8196 /* 8197 * IO on this page will never be started, so we need to account 8198 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW 8199 * here, must leave that up for the ordered extent completion. 
8200 */ 8201 if (!inode_evicting) 8202 clear_extent_bit(tree, start, end, 8203 EXTENT_DELALLOC | 8204 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 8205 EXTENT_DEFRAG, 1, 0, &cached_state); 8206 /* 8207 * whoever cleared the private bit is responsible 8208 * for the finish_ordered_io 8209 */ 8210 if (TestClearPagePrivate2(page)) { 8211 struct btrfs_ordered_inode_tree *tree; 8212 u64 new_len; 8213 8214 tree = &inode->ordered_tree; 8215 8216 spin_lock_irq(&tree->lock); 8217 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 8218 new_len = start - ordered->file_offset; 8219 if (new_len < ordered->truncated_len) 8220 ordered->truncated_len = new_len; 8221 spin_unlock_irq(&tree->lock); 8222 8223 if (btrfs_dec_test_ordered_pending(inode, &ordered, 8224 start, 8225 end - start + 1, 1)) { 8226 btrfs_finish_ordered_io(ordered); 8227 completed_ordered = true; 8228 } 8229 } 8230 btrfs_put_ordered_extent(ordered); 8231 if (!inode_evicting) { 8232 cached_state = NULL; 8233 lock_extent_bits(tree, start, end, 8234 &cached_state); 8235 } 8236 8237 start = end + 1; 8238 if (start < page_end) 8239 goto again; 8240 } 8241 8242 /* 8243 * Qgroup reserved space handler 8244 * Page here will be either 8245 * 1) Already written to disk or ordered extent already submitted 8246 * Then its QGROUP_RESERVED bit in io_tree is already cleaned. 8247 * Qgroup will be handled by its qgroup_record then. 8248 * btrfs_qgroup_free_data() call will do nothing here. 8249 * 8250 * 2) Not written to disk yet 8251 * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED 8252 * bit of its io_tree, and free the qgroup reserved data space. 8253 * Since the IO will never happen for this page. 8254 */ 8255 btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); 8256 if (!inode_evicting) { 8257 bool delete = true; 8258 8259 /* 8260 * If there's an ordered extent for this range and we have not 8261 * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set 8262 * in the range for the ordered extent completion. We must also 8263 * not delete the range, otherwise we would lose that bit (and 8264 * any other bits set in the range). Make sure EXTENT_UPTODATE 8265 * is cleared if we don't delete, otherwise it can lead to 8266 * corruptions if the i_size is extented later. 8267 */ 8268 if (found_ordered && !completed_ordered) 8269 delete = false; 8270 clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | 8271 EXTENT_DELALLOC | EXTENT_UPTODATE | 8272 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 8273 delete, &cached_state); 8274 8275 __btrfs_releasepage(page, GFP_NOFS); 8276 } 8277 8278 ClearPageChecked(page); 8279 detach_page_private(page); 8280 } 8281 8282 /* 8283 * btrfs_page_mkwrite() is not allowed to change the file size as it gets 8284 * called from a page fault handler when a page is first dirtied. Hence we must 8285 * be careful to check for EOF conditions here. We set the page up correctly 8286 * for a written page which means we get ENOSPC checking when writing into 8287 * holes and correct delalloc and unwritten extent mapping on filesystems that 8288 * support these features. 8289 * 8290 * We are not allowed to take the i_mutex here so we have to play games to 8291 * protect against truncate races as the page could now be beyond EOF. Because 8292 * truncate_setsize() writes the inode size before removing pages, once we have 8293 * the page lock we can determine safely if the page is beyond EOF. If it is not 8294 * beyond EOF, then the page is guaranteed safe against truncation until we 8295 * unlock the page. 
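 *
 * An illustrative example (hypothetical numbers, 4K pages): if a concurrent
 * truncate shrinks i_size to 5000 bytes, truncate_setsize() stores the new
 * size before removing pages, so once we hold the page lock a fault on page
 * index 3 (page_start == 12288 >= 5000) is caught by the "page got truncated"
 * check below and we bail out, while page index 1 (page_start == 4096) stays
 * safe until we unlock it.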
8296 */ 8297 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) 8298 { 8299 struct page *page = vmf->page; 8300 struct inode *inode = file_inode(vmf->vma->vm_file); 8301 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8302 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 8303 struct btrfs_ordered_extent *ordered; 8304 struct extent_state *cached_state = NULL; 8305 struct extent_changeset *data_reserved = NULL; 8306 char *kaddr; 8307 unsigned long zero_start; 8308 loff_t size; 8309 vm_fault_t ret; 8310 int ret2; 8311 int reserved = 0; 8312 u64 reserved_space; 8313 u64 page_start; 8314 u64 page_end; 8315 u64 end; 8316 8317 reserved_space = PAGE_SIZE; 8318 8319 sb_start_pagefault(inode->i_sb); 8320 page_start = page_offset(page); 8321 page_end = page_start + PAGE_SIZE - 1; 8322 end = page_end; 8323 8324 /* 8325 * Reserving delalloc space after obtaining the page lock can lead to 8326 * deadlock. For example, if a dirty page is locked by this function 8327 * and the call to btrfs_delalloc_reserve_space() ends up triggering 8328 * dirty page write out, then the btrfs_writepage() function could 8329 * end up waiting indefinitely to get a lock on the page currently 8330 * being processed by btrfs_page_mkwrite() function. 8331 */ 8332 ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 8333 page_start, reserved_space); 8334 if (!ret2) { 8335 ret2 = file_update_time(vmf->vma->vm_file); 8336 reserved = 1; 8337 } 8338 if (ret2) { 8339 ret = vmf_error(ret2); 8340 if (reserved) 8341 goto out; 8342 goto out_noreserve; 8343 } 8344 8345 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 8346 again: 8347 lock_page(page); 8348 size = i_size_read(inode); 8349 8350 if ((page->mapping != inode->i_mapping) || 8351 (page_start >= size)) { 8352 /* page got truncated out from underneath us */ 8353 goto out_unlock; 8354 } 8355 wait_on_page_writeback(page); 8356 8357 lock_extent_bits(io_tree, page_start, page_end, &cached_state); 8358 set_page_extent_mapped(page); 8359 8360 /* 8361 * we can't set the delalloc bits if there are pending ordered 8362 * extents. Drop our locks and wait for them to finish 8363 */ 8364 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, 8365 PAGE_SIZE); 8366 if (ordered) { 8367 unlock_extent_cached(io_tree, page_start, page_end, 8368 &cached_state); 8369 unlock_page(page); 8370 btrfs_start_ordered_extent(ordered, 1); 8371 btrfs_put_ordered_extent(ordered); 8372 goto again; 8373 } 8374 8375 if (page->index == ((size - 1) >> PAGE_SHIFT)) { 8376 reserved_space = round_up(size - page_start, 8377 fs_info->sectorsize); 8378 if (reserved_space < PAGE_SIZE) { 8379 end = page_start + reserved_space - 1; 8380 btrfs_delalloc_release_space(BTRFS_I(inode), 8381 data_reserved, page_start, 8382 PAGE_SIZE - reserved_space, true); 8383 } 8384 } 8385 8386 /* 8387 * page_mkwrite gets called when the page is firstly dirtied after it's 8388 * faulted in, but write(2) could also dirty a page and set delalloc 8389 * bits, thus in this case for space account reason, we still need to 8390 * clear any delalloc bits within this page range since we have to 8391 * reserve data&meta space before lock_page() (see above comments). 
*/ 8393 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, 8394 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 8395 EXTENT_DEFRAG, 0, 0, &cached_state); 8396 8397 ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, 8398 &cached_state); 8399 if (ret2) { 8400 unlock_extent_cached(io_tree, page_start, page_end, 8401 &cached_state); 8402 ret = VM_FAULT_SIGBUS; 8403 goto out_unlock; 8404 } 8405 8406 /* page is wholly or partially inside EOF */ 8407 if (page_start + PAGE_SIZE > size) 8408 zero_start = offset_in_page(size); 8409 else 8410 zero_start = PAGE_SIZE; 8411 8412 if (zero_start != PAGE_SIZE) { 8413 kaddr = kmap(page); 8414 memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start); 8415 flush_dcache_page(page); 8416 kunmap(page); 8417 } 8418 ClearPageChecked(page); 8419 set_page_dirty(page); 8420 SetPageUptodate(page); 8421 8422 BTRFS_I(inode)->last_trans = fs_info->generation; 8423 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 8424 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; 8425 8426 unlock_extent_cached(io_tree, page_start, page_end, &cached_state); 8427 8428 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); 8429 sb_end_pagefault(inode->i_sb); 8430 extent_changeset_free(data_reserved); 8431 return VM_FAULT_LOCKED; 8432 8433 out_unlock: 8434 unlock_page(page); 8435 out: 8436 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); 8437 btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, 8438 reserved_space, (ret != 0)); 8439 out_noreserve: 8440 sb_end_pagefault(inode->i_sb); 8441 extent_changeset_free(data_reserved); 8442 return ret; 8443 } 8444 8445 static int btrfs_truncate(struct inode *inode, bool skip_writeback) 8446 { 8447 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8448 struct btrfs_root *root = BTRFS_I(inode)->root; 8449 struct btrfs_block_rsv *rsv; 8450 int ret; 8451 struct btrfs_trans_handle *trans; 8452 u64 mask = fs_info->sectorsize - 1; 8453 u64 min_size = btrfs_calc_metadata_size(fs_info, 1); 8454 8455 if (!skip_writeback) { 8456 ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), 8457 (u64)-1); 8458 if (ret) 8459 return ret; 8460 } 8461 8462 /* 8463 * Yes ladies and gentlemen, this is indeed ugly. We have a couple of 8464 * things going on here: 8465 * 8466 * 1) We need to reserve space to update our inode. 8467 * 8468 * 2) We need to have something to cache all the space that is going to 8469 * be freed up by the truncate operation, but also have some slack 8470 * space reserved in case it uses space during the truncate (thank you 8471 * very much snapshotting). 8472 * 8473 * And we need these to be separate. The fact is we can use a lot of 8474 * space doing the truncate, and we have no earthly idea how much space 8475 * we will use, so we need the truncate reservation to be separate so it 8476 * doesn't end up using space reserved for updating the inode. We also 8477 * need to be able to stop the transaction and start a new one, which 8478 * means we need to be able to update the inode several times, and we 8479 * have no way of knowing how many times that will be, so we can't just 8480 * reserve 1 item for the entirety of the operation, so that has to be 8481 * done separately as well. 8482 * 8483 * So that leaves us with 8484 * 8485 * 1) rsv - for the truncate reservation, which we will steal from the 8486 * transaction reservation. 8487 * 2) fs_info->trans_block_rsv - this will have 1 item's worth left for 8488 * updating the inode.
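 *
 * A rough sketch of the loop below (illustration only; the real code also
 * handles errors and NEED_TRUNCATE_BLOCK):
 *
 *	trans = btrfs_start_transaction(root, 2);
 *	migrate min_size from trans_block_rsv into rsv;
 *	while (more items to drop) {
 *		trans->block_rsv = rsv;
 *		btrfs_truncate_inode_items(...);	(may hit -ENOSPC/-EAGAIN)
 *		trans->block_rsv = &fs_info->trans_block_rsv;
 *		btrfs_update_inode(...);
 *		btrfs_end_transaction(trans);
 *		trans = btrfs_start_transaction(root, 2);
 *		refill rsv from the new transaction's reservation;
 *	}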
8489 */ 8490 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); 8491 if (!rsv) 8492 return -ENOMEM; 8493 rsv->size = min_size; 8494 rsv->failfast = 1; 8495 8496 /* 8497 * 1 for the truncate slack space 8498 * 1 for updating the inode. 8499 */ 8500 trans = btrfs_start_transaction(root, 2); 8501 if (IS_ERR(trans)) { 8502 ret = PTR_ERR(trans); 8503 goto out; 8504 } 8505 8506 /* Migrate the slack space for the truncate to our reserve */ 8507 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, 8508 min_size, false); 8509 BUG_ON(ret); 8510 8511 /* 8512 * So if we truncate and then write and fsync we normally would just 8513 * write the extents that changed, which is a problem if we need to 8514 * first truncate that entire inode. So set this flag so we write out 8515 * all of the extents in the inode to the sync log so we're completely 8516 * safe. 8517 */ 8518 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 8519 trans->block_rsv = rsv; 8520 8521 while (1) { 8522 ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), 8523 inode->i_size, 8524 BTRFS_EXTENT_DATA_KEY); 8525 trans->block_rsv = &fs_info->trans_block_rsv; 8526 if (ret != -ENOSPC && ret != -EAGAIN) 8527 break; 8528 8529 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 8530 if (ret) 8531 break; 8532 8533 btrfs_end_transaction(trans); 8534 btrfs_btree_balance_dirty(fs_info); 8535 8536 trans = btrfs_start_transaction(root, 2); 8537 if (IS_ERR(trans)) { 8538 ret = PTR_ERR(trans); 8539 trans = NULL; 8540 break; 8541 } 8542 8543 btrfs_block_rsv_release(fs_info, rsv, -1, NULL); 8544 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, 8545 rsv, min_size, false); 8546 BUG_ON(ret); /* shouldn't happen */ 8547 trans->block_rsv = rsv; 8548 } 8549 8550 /* 8551 * We can't call btrfs_truncate_block inside a trans handle as we could 8552 * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know 8553 * we've truncated everything except the last little bit, and can do 8554 * btrfs_truncate_block and then update the disk_i_size. 8555 */ 8556 if (ret == NEED_TRUNCATE_BLOCK) { 8557 btrfs_end_transaction(trans); 8558 btrfs_btree_balance_dirty(fs_info); 8559 8560 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); 8561 if (ret) 8562 goto out; 8563 trans = btrfs_start_transaction(root, 1); 8564 if (IS_ERR(trans)) { 8565 ret = PTR_ERR(trans); 8566 goto out; 8567 } 8568 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 8569 } 8570 8571 if (trans) { 8572 int ret2; 8573 8574 trans->block_rsv = &fs_info->trans_block_rsv; 8575 ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode)); 8576 if (ret2 && !ret) 8577 ret = ret2; 8578 8579 ret2 = btrfs_end_transaction(trans); 8580 if (ret2 && !ret) 8581 ret = ret2; 8582 btrfs_btree_balance_dirty(fs_info); 8583 } 8584 out: 8585 btrfs_free_block_rsv(fs_info, rsv); 8586 8587 return ret; 8588 } 8589 8590 /* 8591 * create a new subvolume directory/inode (helper for the ioctl). 
8592 */ 8593 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 8594 struct btrfs_root *new_root, 8595 struct btrfs_root *parent_root, 8596 u64 new_dirid) 8597 { 8598 struct inode *inode; 8599 int err; 8600 u64 index = 0; 8601 8602 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, 8603 new_dirid, new_dirid, 8604 S_IFDIR | (~current_umask() & S_IRWXUGO), 8605 &index); 8606 if (IS_ERR(inode)) 8607 return PTR_ERR(inode); 8608 inode->i_op = &btrfs_dir_inode_operations; 8609 inode->i_fop = &btrfs_dir_file_operations; 8610 8611 set_nlink(inode, 1); 8612 btrfs_i_size_write(BTRFS_I(inode), 0); 8613 unlock_new_inode(inode); 8614 8615 err = btrfs_subvol_inherit_props(trans, new_root, parent_root); 8616 if (err) 8617 btrfs_err(new_root->fs_info, 8618 "error inheriting subvolume %llu properties: %d", 8619 new_root->root_key.objectid, err); 8620 8621 err = btrfs_update_inode(trans, new_root, BTRFS_I(inode)); 8622 8623 iput(inode); 8624 return err; 8625 } 8626 8627 struct inode *btrfs_alloc_inode(struct super_block *sb) 8628 { 8629 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 8630 struct btrfs_inode *ei; 8631 struct inode *inode; 8632 8633 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL); 8634 if (!ei) 8635 return NULL; 8636 8637 ei->root = NULL; 8638 ei->generation = 0; 8639 ei->last_trans = 0; 8640 ei->last_sub_trans = 0; 8641 ei->logged_trans = 0; 8642 ei->delalloc_bytes = 0; 8643 ei->new_delalloc_bytes = 0; 8644 ei->defrag_bytes = 0; 8645 ei->disk_i_size = 0; 8646 ei->flags = 0; 8647 ei->csum_bytes = 0; 8648 ei->index_cnt = (u64)-1; 8649 ei->dir_index = 0; 8650 ei->last_unlink_trans = 0; 8651 ei->last_reflink_trans = 0; 8652 ei->last_log_commit = 0; 8653 8654 spin_lock_init(&ei->lock); 8655 ei->outstanding_extents = 0; 8656 if (sb->s_magic != BTRFS_TEST_MAGIC) 8657 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, 8658 BTRFS_BLOCK_RSV_DELALLOC); 8659 ei->runtime_flags = 0; 8660 ei->prop_compress = BTRFS_COMPRESS_NONE; 8661 ei->defrag_compress = BTRFS_COMPRESS_NONE; 8662 8663 ei->delayed_node = NULL; 8664 8665 ei->i_otime.tv_sec = 0; 8666 ei->i_otime.tv_nsec = 0; 8667 8668 inode = &ei->vfs_inode; 8669 extent_map_tree_init(&ei->extent_tree); 8670 extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); 8671 extent_io_tree_init(fs_info, &ei->io_failure_tree, 8672 IO_TREE_INODE_IO_FAILURE, inode); 8673 extent_io_tree_init(fs_info, &ei->file_extent_tree, 8674 IO_TREE_INODE_FILE_EXTENT, inode); 8675 ei->io_tree.track_uptodate = true; 8676 ei->io_failure_tree.track_uptodate = true; 8677 atomic_set(&ei->sync_writers, 0); 8678 mutex_init(&ei->log_mutex); 8679 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 8680 INIT_LIST_HEAD(&ei->delalloc_inodes); 8681 INIT_LIST_HEAD(&ei->delayed_iput); 8682 RB_CLEAR_NODE(&ei->rb_node); 8683 8684 return inode; 8685 } 8686 8687 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 8688 void btrfs_test_destroy_inode(struct inode *inode) 8689 { 8690 btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); 8691 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 8692 } 8693 #endif 8694 8695 void btrfs_free_inode(struct inode *inode) 8696 { 8697 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 8698 } 8699 8700 void btrfs_destroy_inode(struct inode *vfs_inode) 8701 { 8702 struct btrfs_ordered_extent *ordered; 8703 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 8704 struct btrfs_root *root = inode->root; 8705 8706 WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); 8707 WARN_ON(vfs_inode->i_data.nrpages); 8708 WARN_ON(inode->block_rsv.reserved); 8709 
WARN_ON(inode->block_rsv.size); 8710 WARN_ON(inode->outstanding_extents); 8711 WARN_ON(inode->delalloc_bytes); 8712 WARN_ON(inode->new_delalloc_bytes); 8713 WARN_ON(inode->csum_bytes); 8714 WARN_ON(inode->defrag_bytes); 8715 8716 /* 8717 * This can happen where we create an inode, but somebody else also 8718 * created the same inode and we need to destroy the one we already 8719 * created. 8720 */ 8721 if (!root) 8722 return; 8723 8724 while (1) { 8725 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 8726 if (!ordered) 8727 break; 8728 else { 8729 btrfs_err(root->fs_info, 8730 "found ordered extent %llu %llu on inode cleanup", 8731 ordered->file_offset, ordered->num_bytes); 8732 btrfs_remove_ordered_extent(inode, ordered); 8733 btrfs_put_ordered_extent(ordered); 8734 btrfs_put_ordered_extent(ordered); 8735 } 8736 } 8737 btrfs_qgroup_check_reserved_leak(inode); 8738 inode_tree_del(inode); 8739 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 8740 btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); 8741 btrfs_put_root(inode->root); 8742 } 8743 8744 int btrfs_drop_inode(struct inode *inode) 8745 { 8746 struct btrfs_root *root = BTRFS_I(inode)->root; 8747 8748 if (root == NULL) 8749 return 1; 8750 8751 /* the snap/subvol tree is on deleting */ 8752 if (btrfs_root_refs(&root->root_item) == 0) 8753 return 1; 8754 else 8755 return generic_drop_inode(inode); 8756 } 8757 8758 static void init_once(void *foo) 8759 { 8760 struct btrfs_inode *ei = (struct btrfs_inode *) foo; 8761 8762 inode_init_once(&ei->vfs_inode); 8763 } 8764 8765 void __cold btrfs_destroy_cachep(void) 8766 { 8767 /* 8768 * Make sure all delayed rcu free inodes are flushed before we 8769 * destroy cache. 8770 */ 8771 rcu_barrier(); 8772 kmem_cache_destroy(btrfs_inode_cachep); 8773 kmem_cache_destroy(btrfs_trans_handle_cachep); 8774 kmem_cache_destroy(btrfs_path_cachep); 8775 kmem_cache_destroy(btrfs_free_space_cachep); 8776 kmem_cache_destroy(btrfs_free_space_bitmap_cachep); 8777 } 8778 8779 int __init btrfs_init_cachep(void) 8780 { 8781 btrfs_inode_cachep = kmem_cache_create("btrfs_inode", 8782 sizeof(struct btrfs_inode), 0, 8783 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, 8784 init_once); 8785 if (!btrfs_inode_cachep) 8786 goto fail; 8787 8788 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", 8789 sizeof(struct btrfs_trans_handle), 0, 8790 SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); 8791 if (!btrfs_trans_handle_cachep) 8792 goto fail; 8793 8794 btrfs_path_cachep = kmem_cache_create("btrfs_path", 8795 sizeof(struct btrfs_path), 0, 8796 SLAB_MEM_SPREAD, NULL); 8797 if (!btrfs_path_cachep) 8798 goto fail; 8799 8800 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", 8801 sizeof(struct btrfs_free_space), 0, 8802 SLAB_MEM_SPREAD, NULL); 8803 if (!btrfs_free_space_cachep) 8804 goto fail; 8805 8806 btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", 8807 PAGE_SIZE, PAGE_SIZE, 8808 SLAB_RED_ZONE, NULL); 8809 if (!btrfs_free_space_bitmap_cachep) 8810 goto fail; 8811 8812 return 0; 8813 fail: 8814 btrfs_destroy_cachep(); 8815 return -ENOMEM; 8816 } 8817 8818 static int btrfs_getattr(const struct path *path, struct kstat *stat, 8819 u32 request_mask, unsigned int flags) 8820 { 8821 u64 delalloc_bytes; 8822 u64 inode_bytes; 8823 struct inode *inode = d_inode(path->dentry); 8824 u32 blocksize = inode->i_sb->s_blocksize; 8825 u32 bi_flags = BTRFS_I(inode)->flags; 8826 8827 stat->result_mask |= STATX_BTIME; 8828 stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; 
8829 stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec; 8830 if (bi_flags & BTRFS_INODE_APPEND) 8831 stat->attributes |= STATX_ATTR_APPEND; 8832 if (bi_flags & BTRFS_INODE_COMPRESS) 8833 stat->attributes |= STATX_ATTR_COMPRESSED; 8834 if (bi_flags & BTRFS_INODE_IMMUTABLE) 8835 stat->attributes |= STATX_ATTR_IMMUTABLE; 8836 if (bi_flags & BTRFS_INODE_NODUMP) 8837 stat->attributes |= STATX_ATTR_NODUMP; 8838 8839 stat->attributes_mask |= (STATX_ATTR_APPEND | 8840 STATX_ATTR_COMPRESSED | 8841 STATX_ATTR_IMMUTABLE | 8842 STATX_ATTR_NODUMP); 8843 8844 generic_fillattr(inode, stat); 8845 stat->dev = BTRFS_I(inode)->root->anon_dev; 8846 8847 spin_lock(&BTRFS_I(inode)->lock); 8848 delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; 8849 inode_bytes = inode_get_bytes(inode); 8850 spin_unlock(&BTRFS_I(inode)->lock); 8851 stat->blocks = (ALIGN(inode_bytes, blocksize) + 8852 ALIGN(delalloc_bytes, blocksize)) >> 9; 8853 return 0; 8854 } 8855 8856 static int btrfs_rename_exchange(struct inode *old_dir, 8857 struct dentry *old_dentry, 8858 struct inode *new_dir, 8859 struct dentry *new_dentry) 8860 { 8861 struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); 8862 struct btrfs_trans_handle *trans; 8863 struct btrfs_root *root = BTRFS_I(old_dir)->root; 8864 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 8865 struct inode *new_inode = new_dentry->d_inode; 8866 struct inode *old_inode = old_dentry->d_inode; 8867 struct timespec64 ctime = current_time(old_inode); 8868 u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); 8869 u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); 8870 u64 old_idx = 0; 8871 u64 new_idx = 0; 8872 int ret; 8873 int ret2; 8874 bool root_log_pinned = false; 8875 bool dest_log_pinned = false; 8876 8877 /* we only allow rename subvolume link between subvolumes */ 8878 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 8879 return -EXDEV; 8880 8881 /* close the race window with snapshot create/destroy ioctl */ 8882 if (old_ino == BTRFS_FIRST_FREE_OBJECTID || 8883 new_ino == BTRFS_FIRST_FREE_OBJECTID) 8884 down_read(&fs_info->subvol_sem); 8885 8886 /* 8887 * We want to reserve the absolute worst case amount of items. So if 8888 * both inodes are subvols and we need to unlink them then that would 8889 * require 4 item modifications, but if they are both normal inodes it 8890 * would require 5 item modifications, so we'll assume their normal 8891 * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items 8892 * should cover the worst case number of items we'll modify. 8893 */ 8894 trans = btrfs_start_transaction(root, 12); 8895 if (IS_ERR(trans)) { 8896 ret = PTR_ERR(trans); 8897 goto out_notrans; 8898 } 8899 8900 if (dest != root) 8901 btrfs_record_root_in_trans(trans, dest); 8902 8903 /* 8904 * We need to find a free sequence number both in the source and 8905 * in the destination directory for the exchange. 8906 */ 8907 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx); 8908 if (ret) 8909 goto out_fail; 8910 ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx); 8911 if (ret) 8912 goto out_fail; 8913 8914 BTRFS_I(old_inode)->dir_index = 0ULL; 8915 BTRFS_I(new_inode)->dir_index = 0ULL; 8916 8917 /* Reference for the source. */ 8918 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 8919 /* force full log commit if subvolume involved. 
*/ 8920 btrfs_set_log_full_commit(trans); 8921 } else { 8922 btrfs_pin_log_trans(root); 8923 root_log_pinned = true; 8924 ret = btrfs_insert_inode_ref(trans, dest, 8925 new_dentry->d_name.name, 8926 new_dentry->d_name.len, 8927 old_ino, 8928 btrfs_ino(BTRFS_I(new_dir)), 8929 old_idx); 8930 if (ret) 8931 goto out_fail; 8932 } 8933 8934 /* And now for the dest. */ 8935 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { 8936 /* force full log commit if subvolume involved. */ 8937 btrfs_set_log_full_commit(trans); 8938 } else { 8939 btrfs_pin_log_trans(dest); 8940 dest_log_pinned = true; 8941 ret = btrfs_insert_inode_ref(trans, root, 8942 old_dentry->d_name.name, 8943 old_dentry->d_name.len, 8944 new_ino, 8945 btrfs_ino(BTRFS_I(old_dir)), 8946 new_idx); 8947 if (ret) 8948 goto out_fail; 8949 } 8950 8951 /* Update inode version and ctime/mtime. */ 8952 inode_inc_iversion(old_dir); 8953 inode_inc_iversion(new_dir); 8954 inode_inc_iversion(old_inode); 8955 inode_inc_iversion(new_inode); 8956 old_dir->i_ctime = old_dir->i_mtime = ctime; 8957 new_dir->i_ctime = new_dir->i_mtime = ctime; 8958 old_inode->i_ctime = ctime; 8959 new_inode->i_ctime = ctime; 8960 8961 if (old_dentry->d_parent != new_dentry->d_parent) { 8962 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), 8963 BTRFS_I(old_inode), 1); 8964 btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), 8965 BTRFS_I(new_inode), 1); 8966 } 8967 8968 /* src is a subvolume */ 8969 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 8970 ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); 8971 } else { /* src is an inode */ 8972 ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), 8973 BTRFS_I(old_dentry->d_inode), 8974 old_dentry->d_name.name, 8975 old_dentry->d_name.len); 8976 if (!ret) 8977 ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); 8978 } 8979 if (ret) { 8980 btrfs_abort_transaction(trans, ret); 8981 goto out_fail; 8982 } 8983 8984 /* dest is a subvolume */ 8985 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { 8986 ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); 8987 } else { /* dest is an inode */ 8988 ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), 8989 BTRFS_I(new_dentry->d_inode), 8990 new_dentry->d_name.name, 8991 new_dentry->d_name.len); 8992 if (!ret) 8993 ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); 8994 } 8995 if (ret) { 8996 btrfs_abort_transaction(trans, ret); 8997 goto out_fail; 8998 } 8999 9000 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), 9001 new_dentry->d_name.name, 9002 new_dentry->d_name.len, 0, old_idx); 9003 if (ret) { 9004 btrfs_abort_transaction(trans, ret); 9005 goto out_fail; 9006 } 9007 9008 ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), 9009 old_dentry->d_name.name, 9010 old_dentry->d_name.len, 0, new_idx); 9011 if (ret) { 9012 btrfs_abort_transaction(trans, ret); 9013 goto out_fail; 9014 } 9015 9016 if (old_inode->i_nlink == 1) 9017 BTRFS_I(old_inode)->dir_index = old_idx; 9018 if (new_inode->i_nlink == 1) 9019 BTRFS_I(new_inode)->dir_index = new_idx; 9020 9021 if (root_log_pinned) { 9022 btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), 9023 new_dentry->d_parent); 9024 btrfs_end_log_trans(root); 9025 root_log_pinned = false; 9026 } 9027 if (dest_log_pinned) { 9028 btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), 9029 old_dentry->d_parent); 9030 btrfs_end_log_trans(dest); 9031 dest_log_pinned = false; 9032 } 9033 out_fail: 9034 /* 9035 * If we have pinned a log and an error happened, we unpin tasks 9036 * trying to sync the log 
and force them to fallback to a transaction 9037 * commit if the log currently contains any of the inodes involved in 9038 * this rename operation (to ensure we do not persist a log with an 9039 * inconsistent state for any of these inodes or leading to any 9040 * inconsistencies when replayed). If the transaction was aborted, the 9041 * abortion reason is propagated to userspace when attempting to commit 9042 * the transaction. If the log does not contain any of these inodes, we 9043 * allow the tasks to sync it. 9044 */ 9045 if (ret && (root_log_pinned || dest_log_pinned)) { 9046 if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || 9047 btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || 9048 btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || 9049 (new_inode && 9050 btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) 9051 btrfs_set_log_full_commit(trans); 9052 9053 if (root_log_pinned) { 9054 btrfs_end_log_trans(root); 9055 root_log_pinned = false; 9056 } 9057 if (dest_log_pinned) { 9058 btrfs_end_log_trans(dest); 9059 dest_log_pinned = false; 9060 } 9061 } 9062 ret2 = btrfs_end_transaction(trans); 9063 ret = ret ? ret : ret2; 9064 out_notrans: 9065 if (new_ino == BTRFS_FIRST_FREE_OBJECTID || 9066 old_ino == BTRFS_FIRST_FREE_OBJECTID) 9067 up_read(&fs_info->subvol_sem); 9068 9069 return ret; 9070 } 9071 9072 static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, 9073 struct btrfs_root *root, 9074 struct inode *dir, 9075 struct dentry *dentry) 9076 { 9077 int ret; 9078 struct inode *inode; 9079 u64 objectid; 9080 u64 index; 9081 9082 ret = btrfs_find_free_objectid(root, &objectid); 9083 if (ret) 9084 return ret; 9085 9086 inode = btrfs_new_inode(trans, root, dir, 9087 dentry->d_name.name, 9088 dentry->d_name.len, 9089 btrfs_ino(BTRFS_I(dir)), 9090 objectid, 9091 S_IFCHR | WHITEOUT_MODE, 9092 &index); 9093 9094 if (IS_ERR(inode)) { 9095 ret = PTR_ERR(inode); 9096 return ret; 9097 } 9098 9099 inode->i_op = &btrfs_special_inode_operations; 9100 init_special_inode(inode, inode->i_mode, 9101 WHITEOUT_DEV); 9102 9103 ret = btrfs_init_inode_security(trans, inode, dir, 9104 &dentry->d_name); 9105 if (ret) 9106 goto out; 9107 9108 ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, 9109 BTRFS_I(inode), 0, index); 9110 if (ret) 9111 goto out; 9112 9113 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 9114 out: 9115 unlock_new_inode(inode); 9116 if (ret) 9117 inode_dec_link_count(inode); 9118 iput(inode); 9119 9120 return ret; 9121 } 9122 9123 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 9124 struct inode *new_dir, struct dentry *new_dentry, 9125 unsigned int flags) 9126 { 9127 struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); 9128 struct btrfs_trans_handle *trans; 9129 unsigned int trans_num_items; 9130 struct btrfs_root *root = BTRFS_I(old_dir)->root; 9131 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9132 struct inode *new_inode = d_inode(new_dentry); 9133 struct inode *old_inode = d_inode(old_dentry); 9134 u64 index = 0; 9135 int ret; 9136 int ret2; 9137 u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); 9138 bool log_pinned = false; 9139 9140 if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 9141 return -EPERM; 9142 9143 /* we only allow rename subvolume link between subvolumes */ 9144 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 9145 return -EXDEV; 9146 9147 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || 9148 (new_inode && btrfs_ino(BTRFS_I(new_inode)) == 
BTRFS_FIRST_FREE_OBJECTID)) 9149 return -ENOTEMPTY; 9150 9151 if (S_ISDIR(old_inode->i_mode) && new_inode && 9152 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 9153 return -ENOTEMPTY; 9154 9155 9156 /* check for collisions, even if the name isn't there */ 9157 ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, 9158 new_dentry->d_name.name, 9159 new_dentry->d_name.len); 9160 9161 if (ret) { 9162 if (ret == -EEXIST) { 9163 /* we shouldn't get 9164 * -EEXIST without a new_inode */ 9165 if (WARN_ON(!new_inode)) { 9166 return ret; 9167 } 9168 } else { 9169 /* maybe -EOVERFLOW */ 9170 return ret; 9171 } 9172 } 9173 ret = 0; 9174 9175 /* 9176 * we're using rename to replace one file with another. Start IO on it 9177 * now so we don't add too much work to the end of the transaction 9178 */ 9179 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) 9180 filemap_flush(old_inode->i_mapping); 9181 9182 /* close the race window with snapshot create/destroy ioctl */ 9183 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 9184 down_read(&fs_info->subvol_sem); 9185 /* 9186 * We want to reserve the absolute worst case amount of items. So if 9187 * both inodes are subvols and we need to unlink them then that would 9188 * require 4 item modifications, but if they are both normal inodes it 9189 * would require 5 item modifications, so we'll assume they are normal 9190 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items 9191 * should cover the worst case number of items we'll modify. 9192 * If our rename has the whiteout flag, we need 5 more units for the 9193 * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item 9194 * when selinux is enabled). 9195 */ 9196 trans_num_items = 11; 9197 if (flags & RENAME_WHITEOUT) 9198 trans_num_items += 5; 9199 trans = btrfs_start_transaction(root, trans_num_items); 9200 if (IS_ERR(trans)) { 9201 ret = PTR_ERR(trans); 9202 goto out_notrans; 9203 } 9204 9205 if (dest != root) 9206 btrfs_record_root_in_trans(trans, dest); 9207 9208 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index); 9209 if (ret) 9210 goto out_fail; 9211 9212 BTRFS_I(old_inode)->dir_index = 0ULL; 9213 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 9214 /* force full log commit if subvolume involved.
*/ 9215 btrfs_set_log_full_commit(trans); 9216 } else { 9217 btrfs_pin_log_trans(root); 9218 log_pinned = true; 9219 ret = btrfs_insert_inode_ref(trans, dest, 9220 new_dentry->d_name.name, 9221 new_dentry->d_name.len, 9222 old_ino, 9223 btrfs_ino(BTRFS_I(new_dir)), index); 9224 if (ret) 9225 goto out_fail; 9226 } 9227 9228 inode_inc_iversion(old_dir); 9229 inode_inc_iversion(new_dir); 9230 inode_inc_iversion(old_inode); 9231 old_dir->i_ctime = old_dir->i_mtime = 9232 new_dir->i_ctime = new_dir->i_mtime = 9233 old_inode->i_ctime = current_time(old_dir); 9234 9235 if (old_dentry->d_parent != new_dentry->d_parent) 9236 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), 9237 BTRFS_I(old_inode), 1); 9238 9239 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 9240 ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); 9241 } else { 9242 ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), 9243 BTRFS_I(d_inode(old_dentry)), 9244 old_dentry->d_name.name, 9245 old_dentry->d_name.len); 9246 if (!ret) 9247 ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); 9248 } 9249 if (ret) { 9250 btrfs_abort_transaction(trans, ret); 9251 goto out_fail; 9252 } 9253 9254 if (new_inode) { 9255 inode_inc_iversion(new_inode); 9256 new_inode->i_ctime = current_time(new_inode); 9257 if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == 9258 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 9259 ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); 9260 BUG_ON(new_inode->i_nlink == 0); 9261 } else { 9262 ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), 9263 BTRFS_I(d_inode(new_dentry)), 9264 new_dentry->d_name.name, 9265 new_dentry->d_name.len); 9266 } 9267 if (!ret && new_inode->i_nlink == 0) 9268 ret = btrfs_orphan_add(trans, 9269 BTRFS_I(d_inode(new_dentry))); 9270 if (ret) { 9271 btrfs_abort_transaction(trans, ret); 9272 goto out_fail; 9273 } 9274 } 9275 9276 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), 9277 new_dentry->d_name.name, 9278 new_dentry->d_name.len, 0, index); 9279 if (ret) { 9280 btrfs_abort_transaction(trans, ret); 9281 goto out_fail; 9282 } 9283 9284 if (old_inode->i_nlink == 1) 9285 BTRFS_I(old_inode)->dir_index = index; 9286 9287 if (log_pinned) { 9288 btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), 9289 new_dentry->d_parent); 9290 btrfs_end_log_trans(root); 9291 log_pinned = false; 9292 } 9293 9294 if (flags & RENAME_WHITEOUT) { 9295 ret = btrfs_whiteout_for_rename(trans, root, old_dir, 9296 old_dentry); 9297 9298 if (ret) { 9299 btrfs_abort_transaction(trans, ret); 9300 goto out_fail; 9301 } 9302 } 9303 out_fail: 9304 /* 9305 * If we have pinned the log and an error happened, we unpin tasks 9306 * trying to sync the log and force them to fallback to a transaction 9307 * commit if the log currently contains any of the inodes involved in 9308 * this rename operation (to ensure we do not persist a log with an 9309 * inconsistent state for any of these inodes or leading to any 9310 * inconsistencies when replayed). If the transaction was aborted, the 9311 * abortion reason is propagated to userspace when attempting to commit 9312 * the transaction. If the log does not contain any of these inodes, we 9313 * allow the tasks to sync it. 
9314 */ 9315 if (ret && log_pinned) { 9316 if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || 9317 btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || 9318 btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || 9319 (new_inode && 9320 btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) 9321 btrfs_set_log_full_commit(trans); 9322 9323 btrfs_end_log_trans(root); 9324 log_pinned = false; 9325 } 9326 ret2 = btrfs_end_transaction(trans); 9327 ret = ret ? ret : ret2; 9328 out_notrans: 9329 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 9330 up_read(&fs_info->subvol_sem); 9331 9332 return ret; 9333 } 9334 9335 static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, 9336 struct inode *new_dir, struct dentry *new_dentry, 9337 unsigned int flags) 9338 { 9339 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 9340 return -EINVAL; 9341 9342 if (flags & RENAME_EXCHANGE) 9343 return btrfs_rename_exchange(old_dir, old_dentry, new_dir, 9344 new_dentry); 9345 9346 return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); 9347 } 9348 9349 struct btrfs_delalloc_work { 9350 struct inode *inode; 9351 struct completion completion; 9352 struct list_head list; 9353 struct btrfs_work work; 9354 }; 9355 9356 static void btrfs_run_delalloc_work(struct btrfs_work *work) 9357 { 9358 struct btrfs_delalloc_work *delalloc_work; 9359 struct inode *inode; 9360 9361 delalloc_work = container_of(work, struct btrfs_delalloc_work, 9362 work); 9363 inode = delalloc_work->inode; 9364 filemap_flush(inode->i_mapping); 9365 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 9366 &BTRFS_I(inode)->runtime_flags)) 9367 filemap_flush(inode->i_mapping); 9368 9369 iput(inode); 9370 complete(&delalloc_work->completion); 9371 } 9372 9373 static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode) 9374 { 9375 struct btrfs_delalloc_work *work; 9376 9377 work = kmalloc(sizeof(*work), GFP_NOFS); 9378 if (!work) 9379 return NULL; 9380 9381 init_completion(&work->completion); 9382 INIT_LIST_HEAD(&work->list); 9383 work->inode = inode; 9384 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); 9385 9386 return work; 9387 } 9388 9389 /* 9390 * some fairly slow code that needs optimization. This walks the list 9391 * of all the inodes with pending delalloc and forces them to disk. 
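* When the caller asks for a full flush (wbc->nr_to_write == LONG_MAX), one btrfs_delalloc_work item is queued per inode on fs_info->flush_workers and all of them are waited for at the end; otherwise writeback runs inline via sync_inode() until the nr_to_write budget is used up.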
9392 */ 9393 static int start_delalloc_inodes(struct btrfs_root *root, 9394 struct writeback_control *wbc, bool snapshot, 9395 bool in_reclaim_context) 9396 { 9397 struct btrfs_inode *binode; 9398 struct inode *inode; 9399 struct btrfs_delalloc_work *work, *next; 9400 struct list_head works; 9401 struct list_head splice; 9402 int ret = 0; 9403 bool full_flush = wbc->nr_to_write == LONG_MAX; 9404 9405 INIT_LIST_HEAD(&works); 9406 INIT_LIST_HEAD(&splice); 9407 9408 mutex_lock(&root->delalloc_mutex); 9409 spin_lock(&root->delalloc_lock); 9410 list_splice_init(&root->delalloc_inodes, &splice); 9411 while (!list_empty(&splice)) { 9412 binode = list_entry(splice.next, struct btrfs_inode, 9413 delalloc_inodes); 9414 9415 list_move_tail(&binode->delalloc_inodes, 9416 &root->delalloc_inodes); 9417 9418 if (in_reclaim_context && 9419 test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) 9420 continue; 9421 9422 inode = igrab(&binode->vfs_inode); 9423 if (!inode) { 9424 cond_resched_lock(&root->delalloc_lock); 9425 continue; 9426 } 9427 spin_unlock(&root->delalloc_lock); 9428 9429 if (snapshot) 9430 set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, 9431 &binode->runtime_flags); 9432 if (full_flush) { 9433 work = btrfs_alloc_delalloc_work(inode); 9434 if (!work) { 9435 iput(inode); 9436 ret = -ENOMEM; 9437 goto out; 9438 } 9439 list_add_tail(&work->list, &works); 9440 btrfs_queue_work(root->fs_info->flush_workers, 9441 &work->work); 9442 } else { 9443 ret = sync_inode(inode, wbc); 9444 if (!ret && 9445 test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 9446 &BTRFS_I(inode)->runtime_flags)) 9447 ret = sync_inode(inode, wbc); 9448 btrfs_add_delayed_iput(inode); 9449 if (ret || wbc->nr_to_write <= 0) 9450 goto out; 9451 } 9452 cond_resched(); 9453 spin_lock(&root->delalloc_lock); 9454 } 9455 spin_unlock(&root->delalloc_lock); 9456 9457 out: 9458 list_for_each_entry_safe(work, next, &works, list) { 9459 list_del_init(&work->list); 9460 wait_for_completion(&work->completion); 9461 kfree(work); 9462 } 9463 9464 if (!list_empty(&splice)) { 9465 spin_lock(&root->delalloc_lock); 9466 list_splice_tail(&splice, &root->delalloc_inodes); 9467 spin_unlock(&root->delalloc_lock); 9468 } 9469 mutex_unlock(&root->delalloc_mutex); 9470 return ret; 9471 } 9472 9473 int btrfs_start_delalloc_snapshot(struct btrfs_root *root) 9474 { 9475 struct writeback_control wbc = { 9476 .nr_to_write = LONG_MAX, 9477 .sync_mode = WB_SYNC_NONE, 9478 .range_start = 0, 9479 .range_end = LLONG_MAX, 9480 }; 9481 struct btrfs_fs_info *fs_info = root->fs_info; 9482 9483 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 9484 return -EROFS; 9485 9486 return start_delalloc_inodes(root, &wbc, true, false); 9487 } 9488 9489 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, 9490 bool in_reclaim_context) 9491 { 9492 struct writeback_control wbc = { 9493 .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr, 9494 .sync_mode = WB_SYNC_NONE, 9495 .range_start = 0, 9496 .range_end = LLONG_MAX, 9497 }; 9498 struct btrfs_root *root; 9499 struct list_head splice; 9500 int ret; 9501 9502 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 9503 return -EROFS; 9504 9505 INIT_LIST_HEAD(&splice); 9506 9507 mutex_lock(&fs_info->delalloc_root_mutex); 9508 spin_lock(&fs_info->delalloc_root_lock); 9509 list_splice_init(&fs_info->delalloc_roots, &splice); 9510 while (!list_empty(&splice) && nr) { 9511 /* 9512 * Reset nr_to_write here so we know that we're doing a full 9513 * flush. 
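* start_delalloc_inodes() keys its full-flush behaviour off nr_to_write == LONG_MAX and the wbc is shared across the loop iterations below, so make sure every root sees the unlimited value when the caller asked to flush everything (nr == U64_MAX).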
9514 */ 9515 if (nr == U64_MAX) 9516 wbc.nr_to_write = LONG_MAX; 9517 9518 root = list_first_entry(&splice, struct btrfs_root, 9519 delalloc_root); 9520 root = btrfs_grab_root(root); 9521 BUG_ON(!root); 9522 list_move_tail(&root->delalloc_root, 9523 &fs_info->delalloc_roots); 9524 spin_unlock(&fs_info->delalloc_root_lock); 9525 9526 ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); 9527 btrfs_put_root(root); 9528 if (ret < 0 || wbc.nr_to_write <= 0) 9529 goto out; 9530 spin_lock(&fs_info->delalloc_root_lock); 9531 } 9532 spin_unlock(&fs_info->delalloc_root_lock); 9533 9534 ret = 0; 9535 out: 9536 if (!list_empty(&splice)) { 9537 spin_lock(&fs_info->delalloc_root_lock); 9538 list_splice_tail(&splice, &fs_info->delalloc_roots); 9539 spin_unlock(&fs_info->delalloc_root_lock); 9540 } 9541 mutex_unlock(&fs_info->delalloc_root_mutex); 9542 return ret; 9543 } 9544 9545 static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 9546 const char *symname) 9547 { 9548 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 9549 struct btrfs_trans_handle *trans; 9550 struct btrfs_root *root = BTRFS_I(dir)->root; 9551 struct btrfs_path *path; 9552 struct btrfs_key key; 9553 struct inode *inode = NULL; 9554 int err; 9555 u64 objectid; 9556 u64 index = 0; 9557 int name_len; 9558 int datasize; 9559 unsigned long ptr; 9560 struct btrfs_file_extent_item *ei; 9561 struct extent_buffer *leaf; 9562 9563 name_len = strlen(symname); 9564 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) 9565 return -ENAMETOOLONG; 9566 9567 /* 9568 * 2 items for inode item and ref 9569 * 2 items for dir items 9570 * 1 item for updating parent inode item 9571 * 1 item for the inline extent item 9572 * 1 item for xattr if selinux is on 9573 */ 9574 trans = btrfs_start_transaction(root, 7); 9575 if (IS_ERR(trans)) 9576 return PTR_ERR(trans); 9577 9578 err = btrfs_find_free_objectid(root, &objectid); 9579 if (err) 9580 goto out_unlock; 9581 9582 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 9583 dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), 9584 objectid, S_IFLNK|S_IRWXUGO, &index); 9585 if (IS_ERR(inode)) { 9586 err = PTR_ERR(inode); 9587 inode = NULL; 9588 goto out_unlock; 9589 } 9590 9591 /* 9592 * If the active LSM wants to access the inode during 9593 * d_instantiate it needs these. Smack checks to see 9594 * if the filesystem supports xattrs by looking at the 9595 * ops vector. 
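* The regular file i_op installed here is temporary; it is replaced with btrfs_symlink_inode_operations further below, once the inline extent holding the link target has been written.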
9596 */ 9597 inode->i_fop = &btrfs_file_operations; 9598 inode->i_op = &btrfs_file_inode_operations; 9599 inode->i_mapping->a_ops = &btrfs_aops; 9600 9601 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 9602 if (err) 9603 goto out_unlock; 9604 9605 path = btrfs_alloc_path(); 9606 if (!path) { 9607 err = -ENOMEM; 9608 goto out_unlock; 9609 } 9610 key.objectid = btrfs_ino(BTRFS_I(inode)); 9611 key.offset = 0; 9612 key.type = BTRFS_EXTENT_DATA_KEY; 9613 datasize = btrfs_file_extent_calc_inline_size(name_len); 9614 err = btrfs_insert_empty_item(trans, root, path, &key, 9615 datasize); 9616 if (err) { 9617 btrfs_free_path(path); 9618 goto out_unlock; 9619 } 9620 leaf = path->nodes[0]; 9621 ei = btrfs_item_ptr(leaf, path->slots[0], 9622 struct btrfs_file_extent_item); 9623 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 9624 btrfs_set_file_extent_type(leaf, ei, 9625 BTRFS_FILE_EXTENT_INLINE); 9626 btrfs_set_file_extent_encryption(leaf, ei, 0); 9627 btrfs_set_file_extent_compression(leaf, ei, 0); 9628 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 9629 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); 9630 9631 ptr = btrfs_file_extent_inline_start(ei); 9632 write_extent_buffer(leaf, symname, ptr, name_len); 9633 btrfs_mark_buffer_dirty(leaf); 9634 btrfs_free_path(path); 9635 9636 inode->i_op = &btrfs_symlink_inode_operations; 9637 inode_nohighmem(inode); 9638 inode_set_bytes(inode, name_len); 9639 btrfs_i_size_write(BTRFS_I(inode), name_len); 9640 err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 9641 /* 9642 * Last step, add directory indexes for our symlink inode. This is the 9643 * last step to avoid extra cleanup of these indexes if an error happens 9644 * elsewhere above. 9645 */ 9646 if (!err) 9647 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, 9648 BTRFS_I(inode), 0, index); 9649 if (err) 9650 goto out_unlock; 9651 9652 d_instantiate_new(dentry, inode); 9653 9654 out_unlock: 9655 btrfs_end_transaction(trans); 9656 if (err && inode) { 9657 inode_dec_link_count(inode); 9658 discard_new_inode(inode); 9659 } 9660 btrfs_btree_balance_dirty(fs_info); 9661 return err; 9662 } 9663 9664 static struct btrfs_trans_handle *insert_prealloc_file_extent( 9665 struct btrfs_trans_handle *trans_in, 9666 struct btrfs_inode *inode, 9667 struct btrfs_key *ins, 9668 u64 file_offset) 9669 { 9670 struct btrfs_file_extent_item stack_fi; 9671 struct btrfs_replace_extent_info extent_info; 9672 struct btrfs_trans_handle *trans = trans_in; 9673 struct btrfs_path *path; 9674 u64 start = ins->objectid; 9675 u64 len = ins->offset; 9676 int ret; 9677 9678 memset(&stack_fi, 0, sizeof(stack_fi)); 9679 9680 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC); 9681 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start); 9682 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len); 9683 btrfs_set_stack_file_extent_num_bytes(&stack_fi, len); 9684 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len); 9685 btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); 9686 /* Encryption and other encoding is reserved and all 0 */ 9687 9688 ret = btrfs_qgroup_release_data(inode, file_offset, len); 9689 if (ret < 0) 9690 return ERR_PTR(ret); 9691 9692 if (trans) { 9693 ret = insert_reserved_file_extent(trans, inode, 9694 file_offset, &stack_fi, 9695 true, ret); 9696 if (ret) 9697 return ERR_PTR(ret); 9698 return trans; 9699 } 9700 9701 extent_info.disk_offset = start; 9702 extent_info.disk_len = len; 9703 extent_info.data_offset = 0; 9704 
extent_info.data_len = len; 9705 extent_info.file_offset = file_offset; 9706 extent_info.extent_buf = (char *)&stack_fi; 9707 extent_info.is_new_extent = true; 9708 extent_info.qgroup_reserved = ret; 9709 extent_info.insertions = 0; 9710 9711 path = btrfs_alloc_path(); 9712 if (!path) 9713 return ERR_PTR(-ENOMEM); 9714 9715 ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset, 9716 file_offset + len - 1, &extent_info, 9717 &trans); 9718 btrfs_free_path(path); 9719 if (ret) 9720 return ERR_PTR(ret); 9721 9722 return trans; 9723 } 9724 9725 static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 9726 u64 start, u64 num_bytes, u64 min_size, 9727 loff_t actual_len, u64 *alloc_hint, 9728 struct btrfs_trans_handle *trans) 9729 { 9730 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 9731 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 9732 struct extent_map *em; 9733 struct btrfs_root *root = BTRFS_I(inode)->root; 9734 struct btrfs_key ins; 9735 u64 cur_offset = start; 9736 u64 clear_offset = start; 9737 u64 i_size; 9738 u64 cur_bytes; 9739 u64 last_alloc = (u64)-1; 9740 int ret = 0; 9741 bool own_trans = true; 9742 u64 end = start + num_bytes - 1; 9743 9744 if (trans) 9745 own_trans = false; 9746 while (num_bytes > 0) { 9747 cur_bytes = min_t(u64, num_bytes, SZ_256M); 9748 cur_bytes = max(cur_bytes, min_size); 9749 /* 9750 * If we are severely fragmented we could end up with really 9751 * small allocations, so if the allocator is returning small 9752 * chunks lets make its job easier by only searching for those 9753 * sized chunks. 9754 */ 9755 cur_bytes = min(cur_bytes, last_alloc); 9756 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, 9757 min_size, 0, *alloc_hint, &ins, 1, 0); 9758 if (ret) 9759 break; 9760 9761 /* 9762 * We've reserved this space, and thus converted it from 9763 * ->bytes_may_use to ->bytes_reserved. Any error that happens 9764 * from here on out we will only need to clear our reservation 9765 * for the remaining unreserved area, so advance our 9766 * clear_offset by our extent size. 9767 */ 9768 clear_offset += ins.offset; 9769 9770 last_alloc = ins.offset; 9771 trans = insert_prealloc_file_extent(trans, BTRFS_I(inode), 9772 &ins, cur_offset); 9773 /* 9774 * Now that we inserted the prealloc extent we can finally 9775 * decrement the number of reservations in the block group. 9776 * If we did it before, we could race with relocation and have 9777 * relocation miss the reserved extent, making it fail later. 
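* This is done even if insert_prealloc_file_extent() returned an error pointer; in that case the IS_ERR(trans) check below frees the reserved extent and preallocation stops.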
9778 */ 9779 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 9780 if (IS_ERR(trans)) { 9781 ret = PTR_ERR(trans); 9782 btrfs_free_reserved_extent(fs_info, ins.objectid, 9783 ins.offset, 0); 9784 break; 9785 } 9786 9787 btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, 9788 cur_offset + ins.offset -1, 0); 9789 9790 em = alloc_extent_map(); 9791 if (!em) { 9792 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 9793 &BTRFS_I(inode)->runtime_flags); 9794 goto next; 9795 } 9796 9797 em->start = cur_offset; 9798 em->orig_start = cur_offset; 9799 em->len = ins.offset; 9800 em->block_start = ins.objectid; 9801 em->block_len = ins.offset; 9802 em->orig_block_len = ins.offset; 9803 em->ram_bytes = ins.offset; 9804 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 9805 em->generation = trans->transid; 9806 9807 while (1) { 9808 write_lock(&em_tree->lock); 9809 ret = add_extent_mapping(em_tree, em, 1); 9810 write_unlock(&em_tree->lock); 9811 if (ret != -EEXIST) 9812 break; 9813 btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, 9814 cur_offset + ins.offset - 1, 9815 0); 9816 } 9817 free_extent_map(em); 9818 next: 9819 num_bytes -= ins.offset; 9820 cur_offset += ins.offset; 9821 *alloc_hint = ins.objectid + ins.offset; 9822 9823 inode_inc_iversion(inode); 9824 inode->i_ctime = current_time(inode); 9825 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 9826 if (!(mode & FALLOC_FL_KEEP_SIZE) && 9827 (actual_len > inode->i_size) && 9828 (cur_offset > inode->i_size)) { 9829 if (cur_offset > actual_len) 9830 i_size = actual_len; 9831 else 9832 i_size = cur_offset; 9833 i_size_write(inode, i_size); 9834 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 9835 } 9836 9837 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 9838 9839 if (ret) { 9840 btrfs_abort_transaction(trans, ret); 9841 if (own_trans) 9842 btrfs_end_transaction(trans); 9843 break; 9844 } 9845 9846 if (own_trans) { 9847 btrfs_end_transaction(trans); 9848 trans = NULL; 9849 } 9850 } 9851 if (clear_offset < end) 9852 btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, 9853 end - clear_offset + 1); 9854 return ret; 9855 } 9856 9857 int btrfs_prealloc_file_range(struct inode *inode, int mode, 9858 u64 start, u64 num_bytes, u64 min_size, 9859 loff_t actual_len, u64 *alloc_hint) 9860 { 9861 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 9862 min_size, actual_len, alloc_hint, 9863 NULL); 9864 } 9865 9866 int btrfs_prealloc_file_range_trans(struct inode *inode, 9867 struct btrfs_trans_handle *trans, int mode, 9868 u64 start, u64 num_bytes, u64 min_size, 9869 loff_t actual_len, u64 *alloc_hint) 9870 { 9871 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 9872 min_size, actual_len, alloc_hint, trans); 9873 } 9874 9875 static int btrfs_set_page_dirty(struct page *page) 9876 { 9877 return __set_page_dirty_nobuffers(page); 9878 } 9879 9880 static int btrfs_permission(struct inode *inode, int mask) 9881 { 9882 struct btrfs_root *root = BTRFS_I(inode)->root; 9883 umode_t mode = inode->i_mode; 9884 9885 if (mask & MAY_WRITE && 9886 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { 9887 if (btrfs_root_readonly(root)) 9888 return -EROFS; 9889 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) 9890 return -EACCES; 9891 } 9892 return generic_permission(inode, mask); 9893 } 9894 9895 static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 9896 { 9897 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 9898 struct btrfs_trans_handle *trans; 9899 struct btrfs_root *root = 
BTRFS_I(dir)->root; 9900 struct inode *inode = NULL; 9901 u64 objectid; 9902 u64 index; 9903 int ret = 0; 9904 9905 /* 9906 * 5 units required for adding orphan entry 9907 */ 9908 trans = btrfs_start_transaction(root, 5); 9909 if (IS_ERR(trans)) 9910 return PTR_ERR(trans); 9911 9912 ret = btrfs_find_free_objectid(root, &objectid); 9913 if (ret) 9914 goto out; 9915 9916 inode = btrfs_new_inode(trans, root, dir, NULL, 0, 9917 btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); 9918 if (IS_ERR(inode)) { 9919 ret = PTR_ERR(inode); 9920 inode = NULL; 9921 goto out; 9922 } 9923 9924 inode->i_fop = &btrfs_file_operations; 9925 inode->i_op = &btrfs_file_inode_operations; 9926 9927 inode->i_mapping->a_ops = &btrfs_aops; 9928 9929 ret = btrfs_init_inode_security(trans, inode, dir, NULL); 9930 if (ret) 9931 goto out; 9932 9933 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 9934 if (ret) 9935 goto out; 9936 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 9937 if (ret) 9938 goto out; 9939 9940 /* 9941 * We set number of links to 0 in btrfs_new_inode(), and here we set 9942 * it to 1 because d_tmpfile() will issue a warning if the count is 0, 9943 * through: 9944 * 9945 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() 9946 */ 9947 set_nlink(inode, 1); 9948 d_tmpfile(dentry, inode); 9949 unlock_new_inode(inode); 9950 mark_inode_dirty(inode); 9951 out: 9952 btrfs_end_transaction(trans); 9953 if (ret && inode) 9954 discard_new_inode(inode); 9955 btrfs_btree_balance_dirty(fs_info); 9956 return ret; 9957 } 9958 9959 void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 9960 { 9961 struct inode *inode = tree->private_data; 9962 unsigned long index = start >> PAGE_SHIFT; 9963 unsigned long end_index = end >> PAGE_SHIFT; 9964 struct page *page; 9965 9966 while (index <= end_index) { 9967 page = find_get_page(inode->i_mapping, index); 9968 ASSERT(page); /* Pages should be in the extent_io_tree */ 9969 set_page_writeback(page); 9970 put_page(page); 9971 index++; 9972 } 9973 } 9974 9975 #ifdef CONFIG_SWAP 9976 /* 9977 * Add an entry indicating a block group or device which is pinned by a 9978 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a 9979 * negative errno on failure. 9980 */ 9981 static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, 9982 bool is_block_group) 9983 { 9984 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 9985 struct btrfs_swapfile_pin *sp, *entry; 9986 struct rb_node **p; 9987 struct rb_node *parent = NULL; 9988 9989 sp = kmalloc(sizeof(*sp), GFP_NOFS); 9990 if (!sp) 9991 return -ENOMEM; 9992 sp->ptr = ptr; 9993 sp->inode = inode; 9994 sp->is_block_group = is_block_group; 9995 9996 spin_lock(&fs_info->swapfile_pins_lock); 9997 p = &fs_info->swapfile_pins.rb_node; 9998 while (*p) { 9999 parent = *p; 10000 entry = rb_entry(parent, struct btrfs_swapfile_pin, node); 10001 if (sp->ptr < entry->ptr || 10002 (sp->ptr == entry->ptr && sp->inode < entry->inode)) { 10003 p = &(*p)->rb_left; 10004 } else if (sp->ptr > entry->ptr || 10005 (sp->ptr == entry->ptr && sp->inode > entry->inode)) { 10006 p = &(*p)->rb_right; 10007 } else { 10008 spin_unlock(&fs_info->swapfile_pins_lock); 10009 kfree(sp); 10010 return 1; 10011 } 10012 } 10013 rb_link_node(&sp->node, parent, p); 10014 rb_insert_color(&sp->node, &fs_info->swapfile_pins); 10015 spin_unlock(&fs_info->swapfile_pins_lock); 10016 return 0; 10017 } 10018 10019 /* Free all of the entries pinned by this swapfile. 
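Pins are kept in the fs_info->swapfile_pins rbtree ordered by (ptr, inode), so walk the whole tree, erase every node whose inode matches and drop the reference held on pinned block groups.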
*/ 10020 static void btrfs_free_swapfile_pins(struct inode *inode) 10021 { 10022 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 10023 struct btrfs_swapfile_pin *sp; 10024 struct rb_node *node, *next; 10025 10026 spin_lock(&fs_info->swapfile_pins_lock); 10027 node = rb_first(&fs_info->swapfile_pins); 10028 while (node) { 10029 next = rb_next(node); 10030 sp = rb_entry(node, struct btrfs_swapfile_pin, node); 10031 if (sp->inode == inode) { 10032 rb_erase(&sp->node, &fs_info->swapfile_pins); 10033 if (sp->is_block_group) 10034 btrfs_put_block_group(sp->ptr); 10035 kfree(sp); 10036 } 10037 node = next; 10038 } 10039 spin_unlock(&fs_info->swapfile_pins_lock); 10040 } 10041 10042 struct btrfs_swap_info { 10043 u64 start; 10044 u64 block_start; 10045 u64 block_len; 10046 u64 lowest_ppage; 10047 u64 highest_ppage; 10048 unsigned long nr_pages; 10049 int nr_extents; 10050 }; 10051 10052 static int btrfs_add_swap_extent(struct swap_info_struct *sis, 10053 struct btrfs_swap_info *bsi) 10054 { 10055 unsigned long nr_pages; 10056 u64 first_ppage, first_ppage_reported, next_ppage; 10057 int ret; 10058 10059 first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; 10060 next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, 10061 PAGE_SIZE) >> PAGE_SHIFT; 10062 10063 if (first_ppage >= next_ppage) 10064 return 0; 10065 nr_pages = next_ppage - first_ppage; 10066 10067 first_ppage_reported = first_ppage; 10068 if (bsi->start == 0) 10069 first_ppage_reported++; 10070 if (bsi->lowest_ppage > first_ppage_reported) 10071 bsi->lowest_ppage = first_ppage_reported; 10072 if (bsi->highest_ppage < (next_ppage - 1)) 10073 bsi->highest_ppage = next_ppage - 1; 10074 10075 ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); 10076 if (ret < 0) 10077 return ret; 10078 bsi->nr_extents += ret; 10079 bsi->nr_pages += nr_pages; 10080 return 0; 10081 } 10082 10083 static void btrfs_swap_deactivate(struct file *file) 10084 { 10085 struct inode *inode = file_inode(file); 10086 10087 btrfs_free_swapfile_pins(inode); 10088 atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); 10089 } 10090 10091 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, 10092 sector_t *span) 10093 { 10094 struct inode *inode = file_inode(file); 10095 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 10096 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 10097 struct extent_state *cached_state = NULL; 10098 struct extent_map *em = NULL; 10099 struct btrfs_device *device = NULL; 10100 struct btrfs_swap_info bsi = { 10101 .lowest_ppage = (sector_t)-1ULL, 10102 }; 10103 int ret = 0; 10104 u64 isize; 10105 u64 start; 10106 10107 /* 10108 * If the swap file was just created, make sure delalloc is done. If the 10109 * file changes again after this, the user is doing something stupid and 10110 * we don't really care. 10111 */ 10112 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); 10113 if (ret) 10114 return ret; 10115 10116 /* 10117 * The inode is locked, so these flags won't change after we check them. 
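* A usable swapfile must be NODATACOW (which also implies NODATASUM) and uncompressed, which in practice means it has to be created with the nocow attribute (e.g. chattr +C on an empty file) before any data is written to it.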
10118 */ 10119 if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { 10120 btrfs_warn(fs_info, "swapfile must not be compressed"); 10121 return -EINVAL; 10122 } 10123 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { 10124 btrfs_warn(fs_info, "swapfile must not be copy-on-write"); 10125 return -EINVAL; 10126 } 10127 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 10128 btrfs_warn(fs_info, "swapfile must not be checksummed"); 10129 return -EINVAL; 10130 } 10131 10132 /* 10133 * Balance or device remove/replace/resize can move stuff around from 10134 * under us. The exclop protection makes sure they aren't running/won't 10135 * run concurrently while we are mapping the swap extents, and 10136 * fs_info->swapfile_pins prevents them from running while the swap 10137 * file is active and moving the extents. Note that this also prevents 10138 * a concurrent device add which isn't actually necessary, but it's not 10139 * really worth the trouble to allow it. 10140 */ 10141 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { 10142 btrfs_warn(fs_info, 10143 "cannot activate swapfile while exclusive operation is running"); 10144 return -EBUSY; 10145 } 10146 /* 10147 * Snapshots can create extents which require COW even if NODATACOW is 10148 * set. We use this counter to prevent snapshots. We must increment it 10149 * before walking the extents because we don't want a concurrent 10150 * snapshot to run after we've already checked the extents. 10151 */ 10152 atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles); 10153 10154 isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); 10155 10156 lock_extent_bits(io_tree, 0, isize - 1, &cached_state); 10157 start = 0; 10158 while (start < isize) { 10159 u64 logical_block_start, physical_block_start; 10160 struct btrfs_block_group *bg; 10161 u64 len = isize - start; 10162 10163 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); 10164 if (IS_ERR(em)) { 10165 ret = PTR_ERR(em); 10166 goto out; 10167 } 10168 10169 if (em->block_start == EXTENT_MAP_HOLE) { 10170 btrfs_warn(fs_info, "swapfile must not have holes"); 10171 ret = -EINVAL; 10172 goto out; 10173 } 10174 if (em->block_start == EXTENT_MAP_INLINE) { 10175 /* 10176 * It's unlikely we'll ever actually find ourselves 10177 * here, as a file small enough to fit inline won't be 10178 * big enough to store more than the swap header, but in 10179 * case something changes in the future, let's catch it 10180 * here rather than later. 
10181 */ 10182 btrfs_warn(fs_info, "swapfile must not be inline"); 10183 ret = -EINVAL; 10184 goto out; 10185 } 10186 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 10187 btrfs_warn(fs_info, "swapfile must not be compressed"); 10188 ret = -EINVAL; 10189 goto out; 10190 } 10191 10192 logical_block_start = em->block_start + (start - em->start); 10193 len = min(len, em->len - (start - em->start)); 10194 free_extent_map(em); 10195 em = NULL; 10196 10197 ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true); 10198 if (ret < 0) { 10199 goto out; 10200 } else if (ret) { 10201 ret = 0; 10202 } else { 10203 btrfs_warn(fs_info, 10204 "swapfile must not be copy-on-write"); 10205 ret = -EINVAL; 10206 goto out; 10207 } 10208 10209 em = btrfs_get_chunk_map(fs_info, logical_block_start, len); 10210 if (IS_ERR(em)) { 10211 ret = PTR_ERR(em); 10212 goto out; 10213 } 10214 10215 if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 10216 btrfs_warn(fs_info, 10217 "swapfile must have single data profile"); 10218 ret = -EINVAL; 10219 goto out; 10220 } 10221 10222 if (device == NULL) { 10223 device = em->map_lookup->stripes[0].dev; 10224 ret = btrfs_add_swapfile_pin(inode, device, false); 10225 if (ret == 1) 10226 ret = 0; 10227 else if (ret) 10228 goto out; 10229 } else if (device != em->map_lookup->stripes[0].dev) { 10230 btrfs_warn(fs_info, "swapfile must be on one device"); 10231 ret = -EINVAL; 10232 goto out; 10233 } 10234 10235 physical_block_start = (em->map_lookup->stripes[0].physical + 10236 (logical_block_start - em->start)); 10237 len = min(len, em->len - (logical_block_start - em->start)); 10238 free_extent_map(em); 10239 em = NULL; 10240 10241 bg = btrfs_lookup_block_group(fs_info, logical_block_start); 10242 if (!bg) { 10243 btrfs_warn(fs_info, 10244 "could not find block group containing swapfile"); 10245 ret = -EINVAL; 10246 goto out; 10247 } 10248 10249 ret = btrfs_add_swapfile_pin(inode, bg, true); 10250 if (ret) { 10251 btrfs_put_block_group(bg); 10252 if (ret == 1) 10253 ret = 0; 10254 else 10255 goto out; 10256 } 10257 10258 if (bsi.block_len && 10259 bsi.block_start + bsi.block_len == physical_block_start) { 10260 bsi.block_len += len; 10261 } else { 10262 if (bsi.block_len) { 10263 ret = btrfs_add_swap_extent(sis, &bsi); 10264 if (ret) 10265 goto out; 10266 } 10267 bsi.start = start; 10268 bsi.block_start = physical_block_start; 10269 bsi.block_len = len; 10270 } 10271 10272 start += len; 10273 } 10274 10275 if (bsi.block_len) 10276 ret = btrfs_add_swap_extent(sis, &bsi); 10277 10278 out: 10279 if (!IS_ERR_OR_NULL(em)) 10280 free_extent_map(em); 10281 10282 unlock_extent_cached(io_tree, 0, isize - 1, &cached_state); 10283 10284 if (ret) 10285 btrfs_swap_deactivate(file); 10286 10287 btrfs_exclop_finish(fs_info); 10288 10289 if (ret) 10290 return ret; 10291 10292 if (device) 10293 sis->bdev = device->bdev; 10294 *span = bsi.highest_ppage - bsi.lowest_ppage + 1; 10295 sis->max = bsi.nr_pages; 10296 sis->pages = bsi.nr_pages - 1; 10297 sis->highest_bit = bsi.nr_pages - 1; 10298 return bsi.nr_extents; 10299 } 10300 #else 10301 static void btrfs_swap_deactivate(struct file *file) 10302 { 10303 } 10304 10305 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, 10306 sector_t *span) 10307 { 10308 return -EOPNOTSUPP; 10309 } 10310 #endif 10311 10312 /* 10313 * Update the number of bytes used in the VFS' inode. 
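For example, a clone that replaces 4KiB of existing extents with a single 8KiB extent calls this with add_bytes == 8192 and del_bytes == 4096.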
When we replace extents in 10314 * a range (clone, dedupe, fallocate's zero range), we must update the number of 10315 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls 10316 * always get a correct value. 10317 */ 10318 void btrfs_update_inode_bytes(struct btrfs_inode *inode, 10319 const u64 add_bytes, 10320 const u64 del_bytes) 10321 { 10322 if (add_bytes == del_bytes) 10323 return; 10324 10325 spin_lock(&inode->lock); 10326 if (del_bytes > 0) 10327 inode_sub_bytes(&inode->vfs_inode, del_bytes); 10328 if (add_bytes > 0) 10329 inode_add_bytes(&inode->vfs_inode, add_bytes); 10330 spin_unlock(&inode->lock); 10331 } 10332 10333 static const struct inode_operations btrfs_dir_inode_operations = { 10334 .getattr = btrfs_getattr, 10335 .lookup = btrfs_lookup, 10336 .create = btrfs_create, 10337 .unlink = btrfs_unlink, 10338 .link = btrfs_link, 10339 .mkdir = btrfs_mkdir, 10340 .rmdir = btrfs_rmdir, 10341 .rename = btrfs_rename2, 10342 .symlink = btrfs_symlink, 10343 .setattr = btrfs_setattr, 10344 .mknod = btrfs_mknod, 10345 .listxattr = btrfs_listxattr, 10346 .permission = btrfs_permission, 10347 .get_acl = btrfs_get_acl, 10348 .set_acl = btrfs_set_acl, 10349 .update_time = btrfs_update_time, 10350 .tmpfile = btrfs_tmpfile, 10351 }; 10352 10353 static const struct file_operations btrfs_dir_file_operations = { 10354 .llseek = generic_file_llseek, 10355 .read = generic_read_dir, 10356 .iterate_shared = btrfs_real_readdir, 10357 .open = btrfs_opendir, 10358 .unlocked_ioctl = btrfs_ioctl, 10359 #ifdef CONFIG_COMPAT 10360 .compat_ioctl = btrfs_compat_ioctl, 10361 #endif 10362 .release = btrfs_release_file, 10363 .fsync = btrfs_sync_file, 10364 }; 10365 10366 /* 10367 * btrfs doesn't support the bmap operation because swapfiles 10368 * use bmap to make a mapping of extents in the file. They assume 10369 * these extents won't change over the life of the file and they 10370 * use the bmap result to do IO directly to the drive. 10371 * 10372 * the btrfs bmap call would return logical addresses that aren't 10373 * suitable for IO and they also will change frequently as COW 10374 * operations happen. So, swapfile + btrfs == corruption. 10375 * 10376 * For now we're avoiding this by dropping bmap. 
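* Swapfiles are instead supported through the swap_activate / swap_deactivate callbacks in btrfs_aops below, which map the swap extents once up front and pin the block groups and the device they live on.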
10377 */ 10378 static const struct address_space_operations btrfs_aops = { 10379 .readpage = btrfs_readpage, 10380 .writepage = btrfs_writepage, 10381 .writepages = btrfs_writepages, 10382 .readahead = btrfs_readahead, 10383 .direct_IO = noop_direct_IO, 10384 .invalidatepage = btrfs_invalidatepage, 10385 .releasepage = btrfs_releasepage, 10386 #ifdef CONFIG_MIGRATION 10387 .migratepage = btrfs_migratepage, 10388 #endif 10389 .set_page_dirty = btrfs_set_page_dirty, 10390 .error_remove_page = generic_error_remove_page, 10391 .swap_activate = btrfs_swap_activate, 10392 .swap_deactivate = btrfs_swap_deactivate, 10393 }; 10394 10395 static const struct inode_operations btrfs_file_inode_operations = { 10396 .getattr = btrfs_getattr, 10397 .setattr = btrfs_setattr, 10398 .listxattr = btrfs_listxattr, 10399 .permission = btrfs_permission, 10400 .fiemap = btrfs_fiemap, 10401 .get_acl = btrfs_get_acl, 10402 .set_acl = btrfs_set_acl, 10403 .update_time = btrfs_update_time, 10404 }; 10405 static const struct inode_operations btrfs_special_inode_operations = { 10406 .getattr = btrfs_getattr, 10407 .setattr = btrfs_setattr, 10408 .permission = btrfs_permission, 10409 .listxattr = btrfs_listxattr, 10410 .get_acl = btrfs_get_acl, 10411 .set_acl = btrfs_set_acl, 10412 .update_time = btrfs_update_time, 10413 }; 10414 static const struct inode_operations btrfs_symlink_inode_operations = { 10415 .get_link = page_get_link, 10416 .getattr = btrfs_getattr, 10417 .setattr = btrfs_setattr, 10418 .permission = btrfs_permission, 10419 .listxattr = btrfs_listxattr, 10420 .update_time = btrfs_update_time, 10421 }; 10422 10423 const struct dentry_operations btrfs_dentry_operations = { 10424 .d_delete = btrfs_dentry_delete, 10425 }; 10426