/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"

struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, loff_t newsize);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);
static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode);

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int err = 0;
	int ret;
	size_t cur_size = size;
	size_t datasize;
	unsigned long offset;

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;

	key.objectid = btrfs_ino(inode);
	key.offset = start;
	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
	datasize = btrfs_file_extent_calc_inline_size(cur_size);

	inode_add_bytes(inode, size);
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (ret) {
		err = ret;
		goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
					 PAGE_CACHE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_CACHE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = start & (PAGE_CACHE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		page_cache_release(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);

	return ret;
fail:
	btrfs_free_path(path);
	return err;
}

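/*
 * For scale, an illustrative expansion of the datasize computation used
 * above, assuming the on-disk item layout of this era, where
 * btrfs_file_extent_calc_inline_size() adds the fixed header of the file
 * extent item to the payload:
 *
 *	datasize = offsetof(struct btrfs_file_extent_item, disk_bytenr) +
 *		   cur_size;
 *
 * so a 100 byte uncompressed inline write consumes roughly 21 header bytes
 * plus 100 data bytes in the leaf.
 */
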
/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct inode *inode, u64 start, u64 end,
				 size_t compressed_size, int compress_type,
				 struct page **compressed_pages)
{
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = (end + root->sectorsize - 1) &
			~((u64)root->sectorsize - 1);
	u64 data_len = inline_len;
	int ret;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end >= PAGE_CACHE_SIZE ||
	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
	    (!compressed_size &&
	     (actual_end & (root->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > root->fs_info->max_inline) {
		return 1;
	}

	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
	if (ret)
		return ret;

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	} else if (ret == -ENOSPC) {
		return 1;
	}

	btrfs_delalloc_release_metadata(inode, end + 1 - start);
	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
	return 0;
}

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

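/*
 * The entries queued by add_async_extent() are consumed in FIFO order by
 * submit_compressed_extents() below: for example, a full 512K async_cow
 * range compressed in 128K chunks adds four entries to cow->extents, and
 * they are written out in exactly the order they were added here.
 */
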
/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline int compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 num_bytes;
	u64 blocksize = root->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long nr_pages_ret = 0;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned long max_compressed = 128 * 1024;
	unsigned long max_uncompressed = 128 * 1024;
	int i;
	int will_compress;
	int compress_type = root->fs_info->compress_type;

	/* if this is a small write inside eof, kick off a defrag */
	if ((end - start + 1) < 16 * 1024 &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/* we want to make sure that amount of ram required to uncompress
	 * an extent is reasonable, so we limit the total size in ram
	 * of a compressed extent to 128k.  This is a crucial number
	 * because it also controls how easily we can spread reads across
	 * cpus for decompression.
	 *
	 * We also want to make sure the amount of IO required to do
	 * a random read is reasonably small, so we limit the size of
	 * a compressed extent to 128k.
	 */
	total_compressed = min(total_compressed, max_uncompressed);
	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
	num_bytes = max(blocksize, num_bytes);
	total_in = 0;
	ret = 0;

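	/*
	 * A worked example of the rounding above with a 4K blocksize:
	 * a delalloc range of start = 0, end = 5000 gives
	 *
	 *	num_bytes = (5000 - 0 + 4096) & ~4095 = 8192
	 *
	 * i.e. the byte count is rounded up to whole blocks.
	 */
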
	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
	    (btrfs_test_opt(root, COMPRESS) ||
	     (BTRFS_I(inode)->force_compress) ||
	     (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
		WARN_ON(pages);
		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			goto cont;
		}

		if (BTRFS_I(inode)->force_compress)
			compress_type = BTRFS_I(inode)->force_compress;

		ret = btrfs_compress_pages(compress_type,
					   inode->i_mapping, start,
					   total_compressed, pages,
					   nr_pages, &nr_pages_ret,
					   &total_in,
					   &total_compressed,
					   max_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_CACHE_SIZE - 1);
			struct page *page = pages[nr_pages_ret - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_CACHE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto cleanup_and_out;
		}
		trans->block_rsv = &root->fs_info->delalloc_block_rsv;

		/* let's try to make an inline extent */
		if (ret || total_in < (actual_end - start)) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(trans, root, inode,
						    start, end, 0, 0, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(trans, root, inode,
						    start, end,
						    total_compressed,
						    compress_type, pages);
		}
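		/*
		 * Note the return convention of cow_file_range_inline()
		 * used just below: 0 means the range was inlined, > 0 means
		 * it was not suitable for an inline extent (fall through to
		 * normal COW), and < 0 is an error that has already aborted
		 * the transaction.
		 */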
		if (ret <= 0) {
			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 */
			extent_clear_unlock_delalloc(inode,
			     &BTRFS_I(inode)->io_tree,
			     start, end, NULL,
			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
			     EXTENT_CLEAR_DELALLOC |
			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);

			btrfs_end_transaction(trans, root);
			goto free_pages_out;
		}
		btrfs_end_transaction(trans, root);
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so the allocator does
		 * sane things
		 */
		total_compressed = (total_compressed + blocksize - 1) &
			~(blocksize - 1);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk
		 */
		total_in = (total_in + PAGE_CACHE_SIZE - 1) &
			~(PAGE_CACHE_SIZE - 1);
		if (total_compressed >= total_in) {
			will_compress = 0;
		} else {
			num_bytes = total_in;
		}
	}
	if (!will_compress && pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages_ret; i++) {
			WARN_ON(pages[i]->mapping);
			page_cache_release(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages_ret = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->force_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
	if (will_compress) {
		*num_added += 1;

		/* the async work queues will take care of doing actual
		 * allocation on disk for these compressed pages,
		 * and will submit them to the elevator.
		 */
		add_async_extent(async_cow, start, num_bytes,
				 total_compressed, pages, nr_pages_ret,
				 compress_type);

		if (start + num_bytes < end) {
			start += num_bytes;
			pages = NULL;
			cond_resched();
			goto again;
		}
	} else {
cleanup_and_bail_uncompressed:
		/*
		 * No compression, but we still need to write the pages in
		 * the file we've been given so far.  redirty the locked
		 * page if it corresponds to our extent and set things up
		 * for the async work queue to run cow_file_range to do
		 * the normal delalloc dance
		 */
		if (page_offset(locked_page) >= start &&
		    page_offset(locked_page) <= end) {
			__set_page_dirty_nobuffers(locked_page);
			/* unlocked later on in the async handlers */
		}
		add_async_extent(async_cow, start, end - start + 1,
				 0, NULL, 0, BTRFS_COMPRESS_NONE);
		*num_added += 1;
	}

out:
	return ret;

free_pages_out:
	for (i = 0; i < nr_pages_ret; i++) {
		WARN_ON(pages[i]->mapping);
		page_cache_release(pages[i]);
	}
	kfree(pages);

	goto out;

cleanup_and_out:
	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
				     start, end, NULL,
				     EXTENT_CLEAR_UNLOCK_PAGE |
				     EXTENT_CLEAR_DIRTY |
				     EXTENT_CLEAR_DELALLOC |
				     EXTENT_SET_WRITEBACK |
				     EXTENT_END_WRITEBACK);
	if (!trans || IS_ERR(trans))
		btrfs_error(root->fs_info, ret, "Failed to join transaction");
	else
		btrfs_abort_transaction(trans, root, ret);
	goto free_pages_out;
}

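/*
 * A worked example of the "is compression a win" check above, assuming 4K
 * blocks and 64K of input: total_in = 65536 and a compressed result of
 * 60000 bytes rounds total_compressed up to 61440, which is still smaller
 * than total_in, so the compressed copy is kept.  Had it rounded up to
 * 65536 or more, the pages would be freed and, absent mount -o
 * compress-force, the inode flagged BTRFS_INODE_NOCOMPRESS.
 */
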
/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline int submit_compressed_extents(struct inode *inode,
					      struct async_cow *async_cow)
{
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_trans_handle *trans;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree;
	int ret = 0;

	if (list_empty(&async_cow->extents))
		return 0;


	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
				    async_extent->start +
				    async_extent->ram_size - 1);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0);

			/* JDM XXX */

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(io_tree,
						  inode, async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  btrfs_get_extent,
						  WB_SYNC_ALL);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
		} else {
			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
			ret = btrfs_reserve_extent(trans, root,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1);
			if (ret && ret != -ENOSPC)
				btrfs_abort_transaction(trans, root, ret);
			btrfs_end_transaction(trans, root);
		}

		if (ret) {
			int i;
			for (i = 0; i < async_extent->nr_pages; i++) {
				WARN_ON(async_extent->pages[i]->mapping);
				page_cache_release(async_extent->pages[i]);
			}
			kfree(async_extent->pages);
			async_extent->nr_pages = 0;
			async_extent->pages = NULL;
			unlock_extent(io_tree, async_extent->start,
				      async_extent->start +
				      async_extent->ram_size - 1);
			if (ret == -ENOSPC)
				goto retry;
			goto out_free; /* JDM: Requeue? */
		}
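		/*
		 * Note on the -ENOSPC retry above: by the time we loop back,
		 * async_extent->pages has been freed and set to NULL, so the
		 * retry takes the uncompressed cow_file_range() path instead
		 * of reserving compressed space again.
		 */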
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		btrfs_drop_extent_cache(inode, async_extent->start,
					async_extent->start +
					async_extent->ram_size - 1, 0);

		em = alloc_extent_map();
		BUG_ON(!em); /* -ENOMEM */
		em->start = async_extent->start;
		em->len = async_extent->ram_size;
		em->orig_start = em->start;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		em->compress_type = async_extent->compress_type;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
		}

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		BUG_ON(ret); /* -ENOMEM */

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode,
				&BTRFS_I(inode)->io_tree,
				async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_CLEAR_UNLOCK_PAGE |
				EXTENT_CLEAR_UNLOCK |
				EXTENT_CLEAR_DELALLOC |
				EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);

		ret = btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages);

		BUG_ON(ret); /* -ENOMEM */
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}
	ret = 0;
out:
	return ret;
out_free:
	kfree(async_extent);
	goto out;
}

static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

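/*
 * The hint returned above is best effort only: special block_start values
 * such as EXTENT_MAP_HOLE sort at or above EXTENT_MAP_LAST_BYTE, so for a
 * hole we fall back to the inode's first real mapping, and returning 0
 * simply means "no placement preference" to the allocator.
 */
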
/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written,
				   int unlock)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 disk_num_bytes;
	u64 cur_alloc_size;
	u64 blocksize = root->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	int ret = 0;

	BUG_ON(btrfs_is_free_space_inode(inode));
	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		extent_clear_unlock_delalloc(inode,
			     &BTRFS_I(inode)->io_tree,
			     start, end, locked_page,
			     EXTENT_CLEAR_UNLOCK_PAGE |
			     EXTENT_CLEAR_UNLOCK |
			     EXTENT_CLEAR_DELALLOC |
			     EXTENT_CLEAR_DIRTY |
			     EXTENT_SET_WRITEBACK |
			     EXTENT_END_WRITEBACK);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
	num_bytes = max(blocksize, num_bytes);
	disk_num_bytes = num_bytes;
	ret = 0;

	/* if this is a small write inside eof, kick off defrag */
	if (num_bytes < 64 * 1024 &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(trans, inode);

	if (start == 0) {
		/* let's try to make an inline extent */
		ret = cow_file_range_inline(trans, root, inode,
					    start, end, 0, 0, NULL);
		if (ret == 0) {
			extent_clear_unlock_delalloc(inode,
				     &BTRFS_I(inode)->io_tree,
				     start, end, NULL,
				     EXTENT_CLEAR_UNLOCK_PAGE |
				     EXTENT_CLEAR_UNLOCK |
				     EXTENT_CLEAR_DELALLOC |
				     EXTENT_CLEAR_DIRTY |
				     EXTENT_SET_WRITEBACK |
				     EXTENT_END_WRITEBACK);

			*nr_written = *nr_written +
			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
			*page_started = 1;
			goto out;
		} else if (ret < 0) {
			btrfs_abort_transaction(trans, root, ret);
			goto out_unlock;
		}
	}

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(root->fs_info->super_copy));

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

	while (disk_num_bytes > 0) {
		unsigned long op;

		cur_alloc_size = disk_num_bytes;
		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
					   root->sectorsize, 0, alloc_hint,
					   &ins, 1);
		if (ret < 0) {
			btrfs_abort_transaction(trans, root, ret);
			goto out_unlock;
		}

		em = alloc_extent_map();
		BUG_ON(!em); /* -ENOMEM */
		em->start = start;
		em->orig_start = em->start;
		ram_size = ins.offset;
		em->len = ins.offset;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}

		cur_alloc_size = ins.offset;
		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		BUG_ON(ret); /* -ENOMEM */

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				goto out_unlock;
			}
		}

		if (disk_num_bytes < cur_alloc_size)
			break;

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
		 */
		op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
		op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
			EXTENT_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
					     start, start + ram_size - 1,
					     locked_page, op);
		disk_num_bytes -= cur_alloc_size;
		num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
	}
	ret = 0;
out:
	btrfs_end_transaction(trans, root);

	return ret;
out_unlock:
	extent_clear_unlock_delalloc(inode,
		     &BTRFS_I(inode)->io_tree,
		     start, end, locked_page,
		     EXTENT_CLEAR_UNLOCK_PAGE |
		     EXTENT_CLEAR_UNLOCK |
		     EXTENT_CLEAR_DELALLOC |
		     EXTENT_CLEAR_DIRTY |
		     EXTENT_SET_WRITEBACK |
		     EXTENT_END_WRITEBACK);

	goto out;
}

/*
 * work queue callback to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	int num_added = 0;
	async_cow = container_of(work, struct async_cow, work);

	compress_file_range(async_cow->inode, async_cow->locked_page,
			    async_cow->start, async_cow->end, async_cow,
			    &num_added);
	if (num_added == 0) {
		btrfs_add_delayed_iput(async_cow->inode);
		async_cow->inode = NULL;
	}
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
		PAGE_CACHE_SHIFT;

	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
	    5 * 1024 * 1024 &&
	    waitqueue_active(&root->fs_info->async_submit_wait))
		wake_up(&root->fs_info->async_submit_wait);

	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	async_cow = container_of(work, struct async_cow, work);
	if (async_cow->inode)
		btrfs_add_delayed_iput(async_cow->inode);
	kfree(async_cow);
}

static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;
	int limit = 10 * 1024 * 1024;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL, GFP_NOFS);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		BUG_ON(!async_cow); /* -ENOMEM */
		async_cow->inode = igrab(inode);
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;
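		/*
		 * Each async_cow work item covers at most 512K (see cur_end
		 * below), so e.g. a 2MB delalloc range is split into four
		 * items whose compression can run on different CPUs.
		 */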
		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
			cur_end = end;
		else
			cur_end = min(end, start + 512 * 1024 - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		async_cow->work.func = async_cow_start;
		async_cow->work.ordered_func = async_cow_submit;
		async_cow->work.ordered_free = async_cow_free;
		async_cow->work.flags = 0;

		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
			PAGE_CACHE_SHIFT;
		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

		btrfs_queue_worker(&root->fs_info->delalloc_workers,
				   &async_cow->work);

		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
			wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->async_delalloc_pages) <
			    limit));
		}

		while (atomic_read(&root->fs_info->async_submit_draining) &&
		       atomic_read(&root->fs_info->async_delalloc_pages)) {
			wait_event(root->fs_info->async_submit_wait,
			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
			   0));
		}

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}

static noinline int csum_exist_in_range(struct btrfs_root *root,
					u64 bytenr, u64 num_bytes)
{
	int ret;
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list, 0);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	return 1;
}

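/*
 * csum_exist_in_range() above returns 1 (after freeing any looked-up sums)
 * when a checksum is recorded anywhere in the byte range, or when the
 * lookup itself fails; nocow writes therefore only proceed through ranges
 * that are provably csum-free.
 */
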
/*
 * when nocow writeback calls back.  This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct inode *inode,
				       struct page *locked_page,
			      u64 start, u64 end, int *page_started, int force,
			      unsigned long *nr_written)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key found_key;
	u64 cow_start;
	u64 cur_offset;
	u64 extent_end;
	u64 extent_offset;
	u64 disk_bytenr;
	u64 num_bytes;
	int extent_type;
	int ret, err;
	int type;
	int nocow;
	int check_prev = 1;
	bool nolock;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path) {
		extent_clear_unlock_delalloc(inode,
			     &BTRFS_I(inode)->io_tree,
			     start, end, locked_page,
			     EXTENT_CLEAR_UNLOCK_PAGE |
			     EXTENT_CLEAR_UNLOCK |
			     EXTENT_CLEAR_DELALLOC |
			     EXTENT_CLEAR_DIRTY |
			     EXTENT_SET_WRITEBACK |
			     EXTENT_END_WRITEBACK);
		return -ENOMEM;
	}

	nolock = btrfs_is_free_space_inode(inode);

	if (nolock)
		trans = btrfs_join_transaction_nolock(root);
	else
		trans = btrfs_join_transaction(root);

	if (IS_ERR(trans)) {
		extent_clear_unlock_delalloc(inode,
			     &BTRFS_I(inode)->io_tree,
			     start, end, locked_page,
			     EXTENT_CLEAR_UNLOCK_PAGE |
			     EXTENT_CLEAR_UNLOCK |
			     EXTENT_CLEAR_DELALLOC |
			     EXTENT_CLEAR_DIRTY |
			     EXTENT_SET_WRITEBACK |
			     EXTENT_END_WRITEBACK);
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	cow_start = (u64)-1;
	cur_offset = start;
	while (1) {
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       cur_offset, 0);
		if (ret < 0) {
			btrfs_abort_transaction(trans, root, ret);
			goto error;
		}
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0) {
				btrfs_abort_transaction(trans, root, ret);
				goto error;
			}
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		nocow = 0;
		disk_bytenr = 0;
		num_bytes = 0;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		if (found_key.objectid > ino ||
		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto out_check;
		}

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
			if (extent_end <= start) {
				path->slots[0]++;
				goto next_slot;
			}
			if (disk_bytenr == 0)
				goto out_check;
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;
			if (btrfs_extent_readonly(root, disk_bytenr))
				goto out_check;
			if (btrfs_cross_ref_exist(trans, root, ino,
						  found_key.offset -
						  extent_offset, disk_bytenr))
				goto out_check;
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * force cow if csum exists in the range.
			 * this ensures that csums for a given extent are
			 * either valid or do not exist.
			 */
			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
				goto out_check;
			nocow = 1;
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = found_key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
			extent_end = ALIGN(extent_end, root->sectorsize);
		} else {
			BUG_ON(1);
		}
out_check:
		if (extent_end <= start) {
			path->slots[0]++;
			goto next_slot;
		}
		if (!nocow) {
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			path->slots[0]++;
			goto next_slot;
		}

		btrfs_release_path(path);
		if (cow_start != (u64)-1) {
			ret = cow_file_range(inode, locked_page, cow_start,
					     found_key.offset - 1,
					     page_started, nr_written, 1);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				goto error;
			}
			cow_start = (u64)-1;
		}

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct extent_map *em;
			struct extent_map_tree *em_tree;
			em_tree = &BTRFS_I(inode)->extent_tree;
			em = alloc_extent_map();
			BUG_ON(!em); /* -ENOMEM */
			em->start = cur_offset;
			em->orig_start = em->start;
			em->len = num_bytes;
			em->block_len = num_bytes;
			em->block_start = disk_bytenr;
			em->bdev = root->fs_info->fs_devices->latest_bdev;
			set_bit(EXTENT_FLAG_PINNED, &em->flags);
			set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
			while (1) {
				write_lock(&em_tree->lock);
				ret = add_extent_mapping(em_tree, em);
				write_unlock(&em_tree->lock);
				if (ret != -EEXIST) {
					free_extent_map(em);
					break;
				}
				btrfs_drop_extent_cache(inode, em->start,
						em->start + em->len - 1, 0);
			}
			type = BTRFS_ORDERED_PREALLOC;
		} else {
			type = BTRFS_ORDERED_NOCOW;
		}

		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
					       num_bytes, num_bytes, type);
		BUG_ON(ret); /* -ENOMEM */

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, cur_offset,
						      num_bytes);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				goto error;
			}
		}

		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
				cur_offset, cur_offset + num_bytes - 1,
				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
				EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
				EXTENT_SET_PRIVATE2);
		cur_offset = extent_end;
		if (cur_offset > end)
			break;
	}
	btrfs_release_path(path);

	if (cur_offset <= end && cow_start == (u64)-1) {
		cow_start = cur_offset;
		cur_offset = end;
	}

	if (cow_start != (u64)-1) {
		ret = cow_file_range(inode, locked_page, cow_start, end,
				     page_started, nr_written, 1);
		if (ret) {
			btrfs_abort_transaction(trans, root, ret);
			goto error;
		}
	}

error:
	err = btrfs_end_transaction(trans, root);
	if (!ret)
		ret = err;

	if (ret && cur_offset < end)
		extent_clear_unlock_delalloc(inode,
			     &BTRFS_I(inode)->io_tree,
			     cur_offset, end, locked_page,
			     EXTENT_CLEAR_UNLOCK_PAGE |
			     EXTENT_CLEAR_UNLOCK |
			     EXTENT_CLEAR_DELALLOC |
			     EXTENT_CLEAR_DIRTY |
			     EXTENT_SET_WRITEBACK |
			     EXTENT_END_WRITEBACK);

	btrfs_free_path(path);
	return ret;
}

/*
 * extent_io.c call back to do delayed allocation processing
 */
static int run_delalloc_range(struct inode *inode, struct page *locked_page,
			      u64 start, u64 end, int *page_started,
			      unsigned long *nr_written)
{
	int ret;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 1, nr_written);
	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
	} else if (!btrfs_test_opt(root, COMPRESS) &&
		   !(BTRFS_I(inode)->force_compress) &&
		   !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
		ret = cow_file_range(inode, locked_page, start, end,
				     page_started, nr_written, 1);
	} else {
		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			&BTRFS_I(inode)->runtime_flags);
		ret = cow_file_range_async(inode, locked_page, start, end,
					   page_started, nr_written);
	}
	return ret;
}

static void btrfs_split_extent_hook(struct inode *inode,
				    struct extent_state *orig, u64 split)
{
	/* not delalloc, ignore it */
	if (!(orig->state & EXTENT_DELALLOC))
		return;

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents++;
	spin_unlock(&BTRFS_I(inode)->lock);
}

/*
 * extent_io.c merge_extent_hook, used to track merged delayed allocation
 * extents so we can keep track of new extents that are just merged onto old
 * extents, such as when we are doing sequential writes, so we can properly
 * account for the metadata space we'll need.
 */
static void btrfs_merge_extent_hook(struct inode *inode,
				    struct extent_state *new,
				    struct extent_state *other)
{
	/* not delalloc, ignore it */
	if (!(other->state & EXTENT_DELALLOC))
		return;

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents--;
	spin_unlock(&BTRFS_I(inode)->lock);
}

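/*
 * Taken together, the split and merge hooks above keep
 * BTRFS_I(inode)->outstanding_extents equal to the number of tracked
 * delalloc extents: splitting one extent into two increments the count,
 * merging two back into one decrements it, so the per-extent metadata
 * reservation always matches reality.
 */
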
/*
 * extent_io.c set_bit_hook, used to track delayed allocation
 * bytes in this file, and to maintain the list of inodes that
 * have pending delalloc work to be done.
 */
static void btrfs_set_bit_hook(struct inode *inode,
			       struct extent_state *state, int *bits)
{

	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		bool do_list = !btrfs_is_free_space_inode(inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents++;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		spin_lock(&root->fs_info->delalloc_lock);
		BTRFS_I(inode)->delalloc_bytes += len;
		root->fs_info->delalloc_bytes += len;
		if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
				      &root->fs_info->delalloc_inodes);
		}
		spin_unlock(&root->fs_info->delalloc_lock);
	}
}

/*
 * extent_io.c clear_bit_hook, see set_bit_hook for why
 */
static void btrfs_clear_bit_hook(struct inode *inode,
				 struct extent_state *state, int *bits)
{
	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		bool do_list = !btrfs_is_free_space_inode(inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents--;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		if (*bits & EXTENT_DO_ACCOUNTING)
			btrfs_delalloc_release_metadata(inode, len);

		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
		    && do_list)
			btrfs_free_reserved_data_space(inode, len);

		spin_lock(&root->fs_info->delalloc_lock);
		root->fs_info->delalloc_bytes -= len;
		BTRFS_I(inode)->delalloc_bytes -= len;

		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
		}
		spin_unlock(&root->fs_info->delalloc_lock);
	}
}

/*
 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
 * we don't create bios that span stripes or chunks
 */
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
			 size_t size, struct bio *bio,
			 unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	struct btrfs_mapping_tree *map_tree;
	u64 logical = (u64)bio->bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;

	if (bio_flags & EXTENT_BIO_COMPRESSED)
		return 0;

	length = bio->bi_size;
	map_tree = &root->fs_info->mapping_tree;
	map_length = length;
	ret = btrfs_map_block(map_tree, READ, logical,
			      &map_length, NULL, 0);
	/* Will always return 0 or 1 with map_multi == NULL */
	BUG_ON(ret < 0);
	if (map_length < length + size)
		return 1;
	return 0;
}

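/*
 * In the hook above, logical = bi_sector << 9 converts the bio's 512-byte
 * sector index into bytes; returning 1 tells extent_io.c that adding this
 * page would make the bio cross a stripe or chunk boundary, so the page
 * must start a new bio instead.
 */
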
/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.   All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags,
				    u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;

	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
	BUG_ON(ret); /* -ENOMEM */
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.   All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
				   int mirror_num, unsigned long bio_flags,
				   u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
}

/*
 * extent_io.c submission hook.  This does the right thing for csum calculation
 * on write, or reading the csums from the tree before a read
 */
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
				 int mirror_num, unsigned long bio_flags,
				 u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	int skip_sum;
	int metadata = 0;

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	if (btrfs_is_free_space_inode(inode))
		metadata = 2;

	if (!(rw & REQ_WRITE)) {
		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
		if (ret)
			return ret;

		if (bio_flags & EXTENT_BIO_COMPRESSED) {
			return btrfs_submit_compressed_read(inode, bio,
						    mirror_num, bio_flags);
		} else if (!skip_sum) {
			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
			if (ret)
				return ret;
		}
		goto mapit;
	} else if (!skip_sum) {
		/* csum items have already been cloned */
		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
			goto mapit;
		/* we're doing a write, do the async checksumming */
		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
				   inode, rw, bio, mirror_num,
				   bio_flags, bio_offset,
				   __btrfs_submit_bio_start,
				   __btrfs_submit_bio_done);
	}

mapit:
	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
}

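/*
 * To summarize the decisions made by the hook above: reads are routed
 * through the end_io workqueue and, unless NODATASUM, look up their csums
 * first; compressed reads take the dedicated decompression path;
 * checksummed writes are handed to the async helpers so csum calculation
 * happens at submit time; everything else maps straight to the device via
 * btrfs_map_bio().
 */
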
/*
 * given a list of ordered sums record them in the inode.  This happens
 * at IO completion time based on sums calculated at bio submission time.
 */
static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
			     struct inode *inode, u64 file_offset,
			     struct list_head *list)
{
	struct btrfs_ordered_sum *sum;

	list_for_each_entry(sum, list, list) {
		btrfs_csum_file_blocks(trans,
		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
	}
	return 0;
}

int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
			      struct extent_state **cached_state)
{
	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
		WARN_ON(1);
	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
				   cached_state, GFP_NOFS);
}

/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
	struct page *page;
	struct btrfs_work work;
};

static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct page *page;
	struct inode *inode;
	u64 page_start;
	u64 page_end;
	int ret;

	fixup = container_of(work, struct btrfs_writepage_fixup, work);
	page = fixup->page;
again:
	lock_page(page);
	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
		ClearPageChecked(page);
		goto out_page;
	}

	inode = page->mapping->host;
	page_start = page_offset(page);
	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
			 &cached_state);

	/* already ordered? We're done */
	if (PagePrivate2(page))
		goto out;

	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered) {
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
				     page_end, &cached_state, GFP_NOFS);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
	if (ret) {
		mapping_set_error(page->mapping, ret);
		end_extent_writepage(page, ret, page_start, page_end);
		ClearPageChecked(page);
		goto out;
	}

	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
	ClearPageChecked(page);
	set_page_dirty(page);
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
			     &cached_state, GFP_NOFS);
out_page:
	unlock_page(page);
	page_cache_release(page);
	kfree(fixup);
}

/*
 * There are a few paths in the higher layers of the kernel that directly
 * set the page dirty bit without asking the filesystem if it is a
 * good idea.  This causes problems because we want to make sure COW
 * properly happens and the data=ordered rules are followed.
 *
 * In our case any range that doesn't have the ORDERED bit set
 * hasn't been properly setup for IO.  We kick off an async process
 * to fix it up.  The async helper will wait for ordered extents, set
 * the delalloc bit and make it safe to write the page.
 */
static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* this page is properly in the ordered list */
	if (TestClearPagePrivate2(page))
		return 0;

	if (PageChecked(page))
		return -EAGAIN;

	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
	if (!fixup)
		return -EAGAIN;

	SetPageChecked(page);
	page_cache_get(page);
	fixup->work.func = btrfs_writepage_fixup_worker;
	fixup->page = page;
	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
	return -EBUSY;
}

static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
				       struct inode *inode, u64 file_pos,
				       u64 disk_bytenr, u64 disk_num_bytes,
				       u64 num_bytes, u64 ram_bytes,
				       u8 compression, u8 encryption,
				       u16 other_encoding, int extent_type)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key ins;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;

	/*
	 * we may be replacing one extent in the tree with another.
	 * The new extent is pinned in the extent map, and we don't want
	 * to drop it from the cache until it is completely in the btree.
	 *
	 * So, tell btrfs_drop_extents to leave this extent in the cache.
	 * the caller is expected to unpin it and allow it to be merged
	 * with the others.
	 */
	ret = btrfs_drop_extents(trans, root, inode, file_pos,
				 file_pos + num_bytes, 0);
	if (ret)
		goto out;

	ins.objectid = btrfs_ino(inode);
	ins.offset = file_pos;
	ins.type = BTRFS_EXTENT_DATA_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
	if (ret)
		goto out;
	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_set_file_extent_type(leaf, fi, extent_type);
	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
	btrfs_set_file_extent_offset(leaf, fi, 0);
	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
	btrfs_set_file_extent_compression(leaf, fi, compression);
	btrfs_set_file_extent_encryption(leaf, fi, encryption);
	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	inode_add_bytes(inode, num_bytes);

	ins.objectid = disk_bytenr;
	ins.offset = disk_num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_alloc_reserved_file_extent(trans, root,
					       root->root_key.objectid,
					       btrfs_ino(inode), file_pos, &ins);
out:
	btrfs_free_path(path);

	return ret;
}

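/*
 * For illustration, insert_reserved_file_extent() above creates two keys
 * for one extent, e.g. a write at file offset 1M backed by 128K at disk
 * byte 10G:
 *
 *	(btrfs_ino(inode), BTRFS_EXTENT_DATA_KEY, 1048576)  in the fs tree
 *	(10737418240, BTRFS_EXTENT_ITEM_KEY, 131072)        in the extent tree
 *
 * with the second created via btrfs_alloc_reserved_file_extent().
 */
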
1847 */ 1848 /* as ordered data IO finishes, this gets called so we can finish 1849 * an ordered extent if the range of bytes in the file it covers are 1850 * fully written. 1851 */ 1852 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 1853 { 1854 struct inode *inode = ordered_extent->inode; 1855 struct btrfs_root *root = BTRFS_I(inode)->root; 1856 struct btrfs_trans_handle *trans = NULL; 1857 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1858 struct extent_state *cached_state = NULL; 1859 int compress_type = 0; 1860 int ret; 1861 bool nolock; 1862 1863 nolock = btrfs_is_free_space_inode(inode); 1864 1865 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 1866 ret = -EIO; 1867 goto out; 1868 } 1869 1870 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1871 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 1872 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1873 if (!ret) { 1874 if (nolock) 1875 trans = btrfs_join_transaction_nolock(root); 1876 else 1877 trans = btrfs_join_transaction(root); 1878 if (IS_ERR(trans)) { 1879 ret = PTR_ERR(trans); 1880 trans = NULL; 1881 goto out; 1882 } 1883 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1884 ret = btrfs_update_inode_fallback(trans, root, inode); 1885 if (ret) /* -ENOMEM or corruption */ 1886 btrfs_abort_transaction(trans, root, ret); 1887 } 1888 goto out; 1889 } 1890 1891 lock_extent_bits(io_tree, ordered_extent->file_offset, 1892 ordered_extent->file_offset + ordered_extent->len - 1, 1893 0, &cached_state); 1894 1895 if (nolock) 1896 trans = btrfs_join_transaction_nolock(root); 1897 else 1898 trans = btrfs_join_transaction(root); 1899 if (IS_ERR(trans)) { 1900 ret = PTR_ERR(trans); 1901 trans = NULL; 1902 goto out_unlock; 1903 } 1904 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1905 1906 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1907 compress_type = ordered_extent->compress_type; 1908 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1909 BUG_ON(compress_type); 1910 ret = btrfs_mark_extent_written(trans, inode, 1911 ordered_extent->file_offset, 1912 ordered_extent->file_offset + 1913 ordered_extent->len); 1914 } else { 1915 BUG_ON(root == root->fs_info->tree_root); 1916 ret = insert_reserved_file_extent(trans, inode, 1917 ordered_extent->file_offset, 1918 ordered_extent->start, 1919 ordered_extent->disk_len, 1920 ordered_extent->len, 1921 ordered_extent->len, 1922 compress_type, 0, 0, 1923 BTRFS_FILE_EXTENT_REG); 1924 } 1925 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 1926 ordered_extent->file_offset, ordered_extent->len, 1927 trans->transid); 1928 if (ret < 0) { 1929 btrfs_abort_transaction(trans, root, ret); 1930 goto out_unlock; 1931 } 1932 1933 add_pending_csums(trans, inode, ordered_extent->file_offset, 1934 &ordered_extent->list); 1935 1936 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1937 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1938 ret = btrfs_update_inode_fallback(trans, root, inode); 1939 if (ret) { /* -ENOMEM or corruption */ 1940 btrfs_abort_transaction(trans, root, ret); 1941 goto out_unlock; 1942 } 1943 } else { 1944 btrfs_set_inode_last_trans(trans, inode); 1945 } 1946 ret = 0; 1947 out_unlock: 1948 unlock_extent_cached(io_tree, ordered_extent->file_offset, 1949 ordered_extent->file_offset + 1950 ordered_extent->len - 1, &cached_state, GFP_NOFS); 1951 out: 1952 if (root != root->fs_info->tree_root) 1953 
btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1954 if (trans) 1955 btrfs_end_transaction(trans, root); 1956 1957 if (ret) 1958 clear_extent_uptodate(io_tree, ordered_extent->file_offset, 1959 ordered_extent->file_offset + 1960 ordered_extent->len - 1, NULL, GFP_NOFS); 1961 1962 /* 1963 * This needs to be done to make sure anybody waiting knows we are done 1964 * updating everything for this ordered extent. 1965 */ 1966 btrfs_remove_ordered_extent(inode, ordered_extent); 1967 1968 /* once for us */ 1969 btrfs_put_ordered_extent(ordered_extent); 1970 /* once for the tree */ 1971 btrfs_put_ordered_extent(ordered_extent); 1972 1973 return ret; 1974 } 1975 1976 static void finish_ordered_fn(struct btrfs_work *work) 1977 { 1978 struct btrfs_ordered_extent *ordered_extent; 1979 ordered_extent = container_of(work, struct btrfs_ordered_extent, work); 1980 btrfs_finish_ordered_io(ordered_extent); 1981 } 1982 1983 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1984 struct extent_state *state, int uptodate) 1985 { 1986 struct inode *inode = page->mapping->host; 1987 struct btrfs_root *root = BTRFS_I(inode)->root; 1988 struct btrfs_ordered_extent *ordered_extent = NULL; 1989 struct btrfs_workers *workers; 1990 1991 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 1992 1993 ClearPagePrivate2(page); 1994 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 1995 end - start + 1, uptodate)) 1996 return 0; 1997 1998 ordered_extent->work.func = finish_ordered_fn; 1999 ordered_extent->work.flags = 0; 2000 2001 if (btrfs_is_free_space_inode(inode)) 2002 workers = &root->fs_info->endio_freespace_worker; 2003 else 2004 workers = &root->fs_info->endio_write_workers; 2005 btrfs_queue_worker(workers, &ordered_extent->work); 2006 2007 return 0; 2008 } 2009 2010 /* 2011 * when reads are done, we need to check csums to verify the data is correct. 2012 * If there's a match, we allow the bio to finish. If not, the code in 2013 * extent_io.c will try to find good copies for us.
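 *
 * The expected checksum is a u32 (crc32c in this era of the disk
 * format) that was stored in the extent_state private field when the
 * read was submitted. On a mismatch we scribble a poison pattern over
 * the range and return -EIO so the caller can try another mirror; a
 * stored csum of 0 is treated as "no checksum" and the read is allowed
 * to succeed.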
2014 */ 2015 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 2016 struct extent_state *state, int mirror) 2017 { 2018 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); 2019 struct inode *inode = page->mapping->host; 2020 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2021 char *kaddr; 2022 u64 private = ~(u32)0; 2023 int ret; 2024 struct btrfs_root *root = BTRFS_I(inode)->root; 2025 u32 csum = ~(u32)0; 2026 2027 if (PageChecked(page)) { 2028 ClearPageChecked(page); 2029 goto good; 2030 } 2031 2032 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 2033 goto good; 2034 2035 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 2036 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 2037 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, 2038 GFP_NOFS); 2039 return 0; 2040 } 2041 2042 if (state && state->start == start) { 2043 private = state->private; 2044 ret = 0; 2045 } else { 2046 ret = get_state_private(io_tree, start, &private); 2047 } 2048 kaddr = kmap_atomic(page); 2049 if (ret) 2050 goto zeroit; 2051 2052 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); 2053 btrfs_csum_final(csum, (char *)&csum); 2054 if (csum != private) 2055 goto zeroit; 2056 2057 kunmap_atomic(kaddr); 2058 good: 2059 return 0; 2060 2061 zeroit: 2062 printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u " 2063 "private %llu\n", 2064 (unsigned long long)btrfs_ino(page->mapping->host), 2065 (unsigned long long)start, csum, 2066 (unsigned long long)private); 2067 memset(kaddr + offset, 1, end - start + 1); 2068 flush_dcache_page(page); 2069 kunmap_atomic(kaddr); 2070 if (private == 0) 2071 return 0; 2072 return -EIO; 2073 } 2074 2075 struct delayed_iput { 2076 struct list_head list; 2077 struct inode *inode; 2078 }; 2079 2080 /* JDM: If this is fs-wide, why can't we add a pointer to 2081 * btrfs_inode instead and avoid the allocation? */ 2082 void btrfs_add_delayed_iput(struct inode *inode) 2083 { 2084 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2085 struct delayed_iput *delayed; 2086 2087 if (atomic_add_unless(&inode->i_count, -1, 1)) 2088 return; 2089 2090 delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); 2091 delayed->inode = inode; 2092 2093 spin_lock(&fs_info->delayed_iput_lock); 2094 list_add_tail(&delayed->list, &fs_info->delayed_iputs); 2095 spin_unlock(&fs_info->delayed_iput_lock); 2096 } 2097 2098 void btrfs_run_delayed_iputs(struct btrfs_root *root) 2099 { 2100 LIST_HEAD(list); 2101 struct btrfs_fs_info *fs_info = root->fs_info; 2102 struct delayed_iput *delayed; 2103 int empty; 2104 2105 spin_lock(&fs_info->delayed_iput_lock); 2106 empty = list_empty(&fs_info->delayed_iputs); 2107 spin_unlock(&fs_info->delayed_iput_lock); 2108 if (empty) 2109 return; 2110 2111 spin_lock(&fs_info->delayed_iput_lock); 2112 list_splice_init(&fs_info->delayed_iputs, &list); 2113 spin_unlock(&fs_info->delayed_iput_lock); 2114 2115 while (!list_empty(&list)) { 2116 delayed = list_entry(list.next, struct delayed_iput, list); 2117 list_del(&delayed->list); 2118 iput(delayed->inode); 2119 kfree(delayed); 2120 } 2121 } 2122 2123 enum btrfs_orphan_cleanup_state { 2124 ORPHAN_CLEANUP_STARTED = 1, 2125 ORPHAN_CLEANUP_DONE = 2, 2126 }; 2127 2128 /* 2129 * This is called in transaction commit time. If there are no orphan 2130 * files in the subvolume, it removes orphan item and frees block_rsv 2131 * structure. 
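 *
 * Note the pattern below: cheap unlocked checks of orphan_inodes and
 * orphan_cleanup_state filter out the common case, then both are
 * re-checked under orphan_lock so the decision to detach the block_rsv
 * is race free.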
2132 */ 2133 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2134 struct btrfs_root *root) 2135 { 2136 struct btrfs_block_rsv *block_rsv; 2137 int ret; 2138 2139 if (atomic_read(&root->orphan_inodes) || 2140 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 2141 return; 2142 2143 spin_lock(&root->orphan_lock); 2144 if (atomic_read(&root->orphan_inodes)) { 2145 spin_unlock(&root->orphan_lock); 2146 return; 2147 } 2148 2149 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { 2150 spin_unlock(&root->orphan_lock); 2151 return; 2152 } 2153 2154 block_rsv = root->orphan_block_rsv; 2155 root->orphan_block_rsv = NULL; 2156 spin_unlock(&root->orphan_lock); 2157 2158 if (root->orphan_item_inserted && 2159 btrfs_root_refs(&root->root_item) > 0) { 2160 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 2161 root->root_key.objectid); 2162 BUG_ON(ret); 2163 root->orphan_item_inserted = 0; 2164 } 2165 2166 if (block_rsv) { 2167 WARN_ON(block_rsv->size > 0); 2168 btrfs_free_block_rsv(root, block_rsv); 2169 } 2170 } 2171 2172 /* 2173 * This creates an orphan entry for the given inode in case something goes 2174 * wrong in the middle of an unlink/truncate. 2175 * 2176 * NOTE: caller of this function should reserve 5 units of metadata for 2177 * this function. 2178 */ 2179 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2180 { 2181 struct btrfs_root *root = BTRFS_I(inode)->root; 2182 struct btrfs_block_rsv *block_rsv = NULL; 2183 int reserve = 0; 2184 int insert = 0; 2185 int ret; 2186 2187 if (!root->orphan_block_rsv) { 2188 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 2189 if (!block_rsv) 2190 return -ENOMEM; 2191 } 2192 2193 spin_lock(&root->orphan_lock); 2194 if (!root->orphan_block_rsv) { 2195 root->orphan_block_rsv = block_rsv; 2196 } else if (block_rsv) { 2197 btrfs_free_block_rsv(root, block_rsv); 2198 block_rsv = NULL; 2199 } 2200 2201 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2202 &BTRFS_I(inode)->runtime_flags)) { 2203 #if 0 2204 /* 2205 * For proper ENOSPC handling, we should do orphan 2206 * cleanup when mounting. But this introduces backward 2207 * compatibility issue. 2208 */ 2209 if (!xchg(&root->orphan_item_inserted, 1)) 2210 insert = 2; 2211 else 2212 insert = 1; 2213 #endif 2214 insert = 1; 2215 atomic_inc(&root->orphan_inodes); 2216 } 2217 2218 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2219 &BTRFS_I(inode)->runtime_flags)) 2220 reserve = 1; 2221 spin_unlock(&root->orphan_lock); 2222 2223 /* grab metadata reservation from transaction handle */ 2224 if (reserve) { 2225 ret = btrfs_orphan_reserve_metadata(trans, inode); 2226 BUG_ON(ret); /* -ENOSPC in reservation; Logic error? 
JDM */ 2227 } 2228 2229 /* insert an orphan item to track this unlinked/truncated file */ 2230 if (insert >= 1) { 2231 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2232 if (ret && ret != -EEXIST) { 2233 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2234 &BTRFS_I(inode)->runtime_flags); 2235 btrfs_abort_transaction(trans, root, ret); 2236 return ret; 2237 } 2238 ret = 0; 2239 } 2240 2241 /* insert an orphan item to track subvolume contains orphan files */ 2242 if (insert >= 2) { 2243 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, 2244 root->root_key.objectid); 2245 if (ret && ret != -EEXIST) { 2246 btrfs_abort_transaction(trans, root, ret); 2247 return ret; 2248 } 2249 } 2250 return 0; 2251 } 2252 2253 /* 2254 * We have done the truncate/delete so we can go ahead and remove the orphan 2255 * item for this particular inode. 2256 */ 2257 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2258 { 2259 struct btrfs_root *root = BTRFS_I(inode)->root; 2260 int delete_item = 0; 2261 int release_rsv = 0; 2262 int ret = 0; 2263 2264 spin_lock(&root->orphan_lock); 2265 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2266 &BTRFS_I(inode)->runtime_flags)) 2267 delete_item = 1; 2268 2269 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2270 &BTRFS_I(inode)->runtime_flags)) 2271 release_rsv = 1; 2272 spin_unlock(&root->orphan_lock); 2273 2274 if (trans && delete_item) { 2275 ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); 2276 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2277 } 2278 2279 if (release_rsv) { 2280 btrfs_orphan_release_metadata(inode); 2281 atomic_dec(&root->orphan_inodes); 2282 } 2283 2284 return 0; 2285 } 2286 2287 /* 2288 * this cleans up any orphans that may be left on the list from the last use 2289 * of this root. 2290 */ 2291 int btrfs_orphan_cleanup(struct btrfs_root *root) 2292 { 2293 struct btrfs_path *path; 2294 struct extent_buffer *leaf; 2295 struct btrfs_key key, found_key; 2296 struct btrfs_trans_handle *trans; 2297 struct inode *inode; 2298 u64 last_objectid = 0; 2299 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2300 2301 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2302 return 0; 2303 2304 path = btrfs_alloc_path(); 2305 if (!path) { 2306 ret = -ENOMEM; 2307 goto out; 2308 } 2309 path->reada = -1; 2310 2311 key.objectid = BTRFS_ORPHAN_OBJECTID; 2312 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 2313 key.offset = (u64)-1; 2314 2315 while (1) { 2316 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2317 if (ret < 0) 2318 goto out; 2319 2320 /* 2321 * if ret == 0 means we found what we were searching for, which 2322 * is weird, but possible, so only screw with path if we didn't 2323 * find the key and see if we have stuff that matches 2324 */ 2325 if (ret > 0) { 2326 ret = 0; 2327 if (path->slots[0] == 0) 2328 break; 2329 path->slots[0]--; 2330 } 2331 2332 /* pull out the item */ 2333 leaf = path->nodes[0]; 2334 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2335 2336 /* make sure the item matches what we want */ 2337 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 2338 break; 2339 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) 2340 break; 2341 2342 /* release the path since we're done with it */ 2343 btrfs_release_path(path); 2344 2345 /* 2346 * this is where we are basically btrfs_lookup, without the 2347 * crossing root thing. we store the inode number in the 2348 * offset of the orphan item. 
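 *
 * For example, the orphan item for inode 257 lives at key
 * (BTRFS_ORPHAN_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, 257), so
 * found_key.offset below is the inode number we hand to btrfs_iget().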
2349 */ 2350 2351 if (found_key.offset == last_objectid) { 2352 printk(KERN_ERR "btrfs: Error removing orphan entry, " 2353 "stopping orphan cleanup\n"); 2354 ret = -EINVAL; 2355 goto out; 2356 } 2357 2358 last_objectid = found_key.offset; 2359 2360 found_key.objectid = found_key.offset; 2361 found_key.type = BTRFS_INODE_ITEM_KEY; 2362 found_key.offset = 0; 2363 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2364 ret = PTR_RET(inode); 2365 if (ret && ret != -ESTALE) 2366 goto out; 2367 2368 if (ret == -ESTALE && root == root->fs_info->tree_root) { 2369 struct btrfs_root *dead_root; 2370 struct btrfs_fs_info *fs_info = root->fs_info; 2371 int is_dead_root = 0; 2372 2373 /* 2374 * this is an orphan in the tree root. Currently these 2375 * could come from 2 sources: 2376 * a) a snapshot deletion in progress 2377 * b) a free space cache inode 2378 * We need to distinguish those two, as the snapshot 2379 * orphan must not get deleted. 2380 * find_dead_roots already ran before us, so if this 2381 * is a snapshot deletion, we should find the root 2382 * in the dead_roots list 2383 */ 2384 spin_lock(&fs_info->trans_lock); 2385 list_for_each_entry(dead_root, &fs_info->dead_roots, 2386 root_list) { 2387 if (dead_root->root_key.objectid == 2388 found_key.objectid) { 2389 is_dead_root = 1; 2390 break; 2391 } 2392 } 2393 spin_unlock(&fs_info->trans_lock); 2394 if (is_dead_root) { 2395 /* prevent this orphan from being found again */ 2396 key.offset = found_key.objectid - 1; 2397 continue; 2398 } 2399 } 2400 /* 2401 * Inode is already gone but the orphan item is still there, 2402 * kill the orphan item. 2403 */ 2404 if (ret == -ESTALE) { 2405 trans = btrfs_start_transaction(root, 1); 2406 if (IS_ERR(trans)) { 2407 ret = PTR_ERR(trans); 2408 goto out; 2409 } 2410 printk(KERN_ERR "auto deleting %Lu\n", 2411 found_key.objectid); 2412 ret = btrfs_del_orphan_item(trans, root, 2413 found_key.objectid); 2414 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2415 btrfs_end_transaction(trans, root); 2416 continue; 2417 } 2418 2419 /* 2420 * add this inode to the orphan list so btrfs_orphan_del does 2421 * the proper thing when we hit it 2422 */ 2423 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2424 &BTRFS_I(inode)->runtime_flags); 2425 2426 /* if we have links, this was a truncate, lets do that */ 2427 if (inode->i_nlink) { 2428 if (!S_ISREG(inode->i_mode)) { 2429 WARN_ON(1); 2430 iput(inode); 2431 continue; 2432 } 2433 nr_truncate++; 2434 ret = btrfs_truncate(inode); 2435 } else { 2436 nr_unlink++; 2437 } 2438 2439 /* this will do delete_inode and everything for us */ 2440 iput(inode); 2441 if (ret) 2442 goto out; 2443 } 2444 /* release the path since we're done with it */ 2445 btrfs_release_path(path); 2446 2447 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2448 2449 if (root->orphan_block_rsv) 2450 btrfs_block_rsv_release(root, root->orphan_block_rsv, 2451 (u64)-1); 2452 2453 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2454 trans = btrfs_join_transaction(root); 2455 if (!IS_ERR(trans)) 2456 btrfs_end_transaction(trans, root); 2457 } 2458 2459 if (nr_unlink) 2460 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2461 if (nr_truncate) 2462 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2463 2464 out: 2465 if (ret) 2466 printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret); 2467 btrfs_free_path(path); 2468 return ret; 2469 } 2470 2471 /* 2472 * very simple check to peek ahead in the leaf looking for xattrs. 
If we 2473 * don't find any xattrs, we know there can't be any acls. 2474 * 2475 * slot is the slot the inode is in, objectid is the objectid of the inode 2476 */ 2477 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 2478 int slot, u64 objectid) 2479 { 2480 u32 nritems = btrfs_header_nritems(leaf); 2481 struct btrfs_key found_key; 2482 int scanned = 0; 2483 2484 slot++; 2485 while (slot < nritems) { 2486 btrfs_item_key_to_cpu(leaf, &found_key, slot); 2487 2488 /* we found a different objectid, there must not be acls */ 2489 if (found_key.objectid != objectid) 2490 return 0; 2491 2492 /* we found an xattr, assume we've got an acl */ 2493 if (found_key.type == BTRFS_XATTR_ITEM_KEY) 2494 return 1; 2495 2496 /* 2497 * we found a key greater than an xattr key, there can't 2498 * be any acls later on 2499 */ 2500 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 2501 return 0; 2502 2503 slot++; 2504 scanned++; 2505 2506 /* 2507 * it goes inode, inode backrefs, xattrs, extents, 2508 * so if there are a ton of hard links to an inode there can 2509 * be a lot of backrefs. Don't waste time searching too hard, 2510 * this is just an optimization 2511 */ 2512 if (scanned >= 8) 2513 break; 2514 } 2515 /* we hit the end of the leaf before we found an xattr or 2516 * something larger than an xattr. We have to assume the inode 2517 * has acls 2518 */ 2519 return 1; 2520 } 2521 2522 /* 2523 * read an inode from the btree into the in-memory inode 2524 */ 2525 static void btrfs_read_locked_inode(struct inode *inode) 2526 { 2527 struct btrfs_path *path; 2528 struct extent_buffer *leaf; 2529 struct btrfs_inode_item *inode_item; 2530 struct btrfs_timespec *tspec; 2531 struct btrfs_root *root = BTRFS_I(inode)->root; 2532 struct btrfs_key location; 2533 int maybe_acls; 2534 u32 rdev; 2535 int ret; 2536 bool filled = false; 2537 2538 ret = btrfs_fill_inode(inode, &rdev); 2539 if (!ret) 2540 filled = true; 2541 2542 path = btrfs_alloc_path(); 2543 if (!path) 2544 goto make_bad; 2545 2546 path->leave_spinning = 1; 2547 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2548 2549 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 2550 if (ret) 2551 goto make_bad; 2552 2553 leaf = path->nodes[0]; 2554 2555 if (filled) 2556 goto cache_acl; 2557 2558 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2559 struct btrfs_inode_item); 2560 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2561 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 2562 i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); 2563 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); 2564 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 2565 2566 tspec = btrfs_inode_atime(inode_item); 2567 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2568 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2569 2570 tspec = btrfs_inode_mtime(inode_item); 2571 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2572 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2573 2574 tspec = btrfs_inode_ctime(inode_item); 2575 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2576 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2577 2578 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2579 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2580 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); 2581 2582 /* 2583 * If we were modified in the current generation and evicted from memory 2584 * and then re-read we need to do 
a full sync since we don't have any 2585 * idea about which extents were modified before we were evicted from 2586 * cache. 2587 */ 2588 if (BTRFS_I(inode)->last_trans == root->fs_info->generation) 2589 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 2590 &BTRFS_I(inode)->runtime_flags); 2591 2592 inode->i_version = btrfs_inode_sequence(leaf, inode_item); 2593 inode->i_generation = BTRFS_I(inode)->generation; 2594 inode->i_rdev = 0; 2595 rdev = btrfs_inode_rdev(leaf, inode_item); 2596 2597 BTRFS_I(inode)->index_cnt = (u64)-1; 2598 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2599 cache_acl: 2600 /* 2601 * try to precache a NULL acl entry for files that don't have 2602 * any xattrs or acls 2603 */ 2604 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 2605 btrfs_ino(inode)); 2606 if (!maybe_acls) 2607 cache_no_acl(inode); 2608 2609 btrfs_free_path(path); 2610 2611 switch (inode->i_mode & S_IFMT) { 2612 case S_IFREG: 2613 inode->i_mapping->a_ops = &btrfs_aops; 2614 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2615 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 2616 inode->i_fop = &btrfs_file_operations; 2617 inode->i_op = &btrfs_file_inode_operations; 2618 break; 2619 case S_IFDIR: 2620 inode->i_fop = &btrfs_dir_file_operations; 2621 if (root == root->fs_info->tree_root) 2622 inode->i_op = &btrfs_dir_ro_inode_operations; 2623 else 2624 inode->i_op = &btrfs_dir_inode_operations; 2625 break; 2626 case S_IFLNK: 2627 inode->i_op = &btrfs_symlink_inode_operations; 2628 inode->i_mapping->a_ops = &btrfs_symlink_aops; 2629 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2630 break; 2631 default: 2632 inode->i_op = &btrfs_special_inode_operations; 2633 init_special_inode(inode, inode->i_mode, rdev); 2634 break; 2635 } 2636 2637 btrfs_update_iflags(inode); 2638 return; 2639 2640 make_bad: 2641 btrfs_free_path(path); 2642 make_bad_inode(inode); 2643 } 2644 2645 /* 2646 * given a leaf and an inode, copy the inode fields into the leaf 2647 */ 2648 static void fill_inode_item(struct btrfs_trans_handle *trans, 2649 struct extent_buffer *leaf, 2650 struct btrfs_inode_item *item, 2651 struct inode *inode) 2652 { 2653 btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 2654 btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); 2655 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2656 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2657 btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 2658 2659 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), 2660 inode->i_atime.tv_sec); 2661 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), 2662 inode->i_atime.tv_nsec); 2663 2664 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), 2665 inode->i_mtime.tv_sec); 2666 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), 2667 inode->i_mtime.tv_nsec); 2668 2669 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), 2670 inode->i_ctime.tv_sec); 2671 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), 2672 inode->i_ctime.tv_nsec); 2673 2674 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2675 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 2676 btrfs_set_inode_sequence(leaf, item, inode->i_version); 2677 btrfs_set_inode_transid(leaf, item, trans->transid); 2678 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2679 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2680 btrfs_set_inode_block_group(leaf, item, 0); 2681 } 2682 2683 /* 2684 * copy everything in the in-memory inode into the btree. 
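 *
 * This is the direct variant: it looks the inode item up in the btree
 * and rewrites it in place. btrfs_update_inode() below prefers the
 * delayed-item path and only falls back to this when delaying the
 * update is unsafe (free space inodes and the data reloc tree).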
2685 */ 2686 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, 2687 struct btrfs_root *root, struct inode *inode) 2688 { 2689 struct btrfs_inode_item *inode_item; 2690 struct btrfs_path *path; 2691 struct extent_buffer *leaf; 2692 int ret; 2693 2694 path = btrfs_alloc_path(); 2695 if (!path) 2696 return -ENOMEM; 2697 2698 path->leave_spinning = 1; 2699 ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 2700 1); 2701 if (ret) { 2702 if (ret > 0) 2703 ret = -ENOENT; 2704 goto failed; 2705 } 2706 2707 btrfs_unlock_up_safe(path, 1); 2708 leaf = path->nodes[0]; 2709 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2710 struct btrfs_inode_item); 2711 2712 fill_inode_item(trans, leaf, inode_item, inode); 2713 btrfs_mark_buffer_dirty(leaf); 2714 btrfs_set_inode_last_trans(trans, inode); 2715 ret = 0; 2716 failed: 2717 btrfs_free_path(path); 2718 return ret; 2719 } 2720 2721 /* 2722 * copy everything in the in-memory inode into the btree. 2723 */ 2724 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2725 struct btrfs_root *root, struct inode *inode) 2726 { 2727 int ret; 2728 2729 /* 2730 * If the inode is a free space inode, we can deadlock during commit 2731 * if we put it into the delayed code. 2732 * 2733 * The data relocation inode should also be directly updated 2734 * without delay 2735 */ 2736 if (!btrfs_is_free_space_inode(inode) 2737 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 2738 btrfs_update_root_times(trans, root); 2739 2740 ret = btrfs_delayed_update_inode(trans, root, inode); 2741 if (!ret) 2742 btrfs_set_inode_last_trans(trans, inode); 2743 return ret; 2744 } 2745 2746 return btrfs_update_inode_item(trans, root, inode); 2747 } 2748 2749 static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 2750 struct btrfs_root *root, struct inode *inode) 2751 { 2752 int ret; 2753 2754 ret = btrfs_update_inode(trans, root, inode); 2755 if (ret == -ENOSPC) 2756 return btrfs_update_inode_item(trans, root, inode); 2757 return ret; 2758 } 2759 2760 /* 2761 * unlink helper that gets used here in inode.c and in the tree logging 2762 * recovery code. 
It removes a link in a directory with a given name, and 2763 * also drops the back refs in the inode to the directory 2764 */ 2765 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2766 struct btrfs_root *root, 2767 struct inode *dir, struct inode *inode, 2768 const char *name, int name_len) 2769 { 2770 struct btrfs_path *path; 2771 int ret = 0; 2772 struct extent_buffer *leaf; 2773 struct btrfs_dir_item *di; 2774 struct btrfs_key key; 2775 u64 index; 2776 u64 ino = btrfs_ino(inode); 2777 u64 dir_ino = btrfs_ino(dir); 2778 2779 path = btrfs_alloc_path(); 2780 if (!path) { 2781 ret = -ENOMEM; 2782 goto out; 2783 } 2784 2785 path->leave_spinning = 1; 2786 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 2787 name, name_len, -1); 2788 if (IS_ERR(di)) { 2789 ret = PTR_ERR(di); 2790 goto err; 2791 } 2792 if (!di) { 2793 ret = -ENOENT; 2794 goto err; 2795 } 2796 leaf = path->nodes[0]; 2797 btrfs_dir_item_key_to_cpu(leaf, di, &key); 2798 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2799 if (ret) 2800 goto err; 2801 btrfs_release_path(path); 2802 2803 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, 2804 dir_ino, &index); 2805 if (ret) { 2806 printk(KERN_INFO "btrfs failed to delete reference to %.*s, " 2807 "inode %llu parent %llu\n", name_len, name, 2808 (unsigned long long)ino, (unsigned long long)dir_ino); 2809 btrfs_abort_transaction(trans, root, ret); 2810 goto err; 2811 } 2812 2813 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 2814 if (ret) { 2815 btrfs_abort_transaction(trans, root, ret); 2816 goto err; 2817 } 2818 2819 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2820 inode, dir_ino); 2821 if (ret != 0 && ret != -ENOENT) { 2822 btrfs_abort_transaction(trans, root, ret); 2823 goto err; 2824 } 2825 2826 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2827 dir, index); 2828 if (ret == -ENOENT) 2829 ret = 0; 2830 err: 2831 btrfs_free_path(path); 2832 if (ret) 2833 goto out; 2834 2835 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2836 inode_inc_iversion(inode); 2837 inode_inc_iversion(dir); 2838 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2839 ret = btrfs_update_inode(trans, root, dir); 2840 out: 2841 return ret; 2842 } 2843 2844 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2845 struct btrfs_root *root, 2846 struct inode *dir, struct inode *inode, 2847 const char *name, int name_len) 2848 { 2849 int ret; 2850 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 2851 if (!ret) { 2852 btrfs_drop_nlink(inode); 2853 ret = btrfs_update_inode(trans, root, inode); 2854 } 2855 return ret; 2856 } 2857 2858 2859 /* helper to check if there is any shared block in the path */ 2860 static int check_path_shared(struct btrfs_root *root, 2861 struct btrfs_path *path) 2862 { 2863 struct extent_buffer *eb; 2864 int level; 2865 u64 refs = 1; 2866 2867 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2868 int ret; 2869 2870 if (!path->nodes[level]) 2871 break; 2872 eb = path->nodes[level]; 2873 if (!btrfs_block_can_be_shared(root, eb)) 2874 continue; 2875 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, 2876 &refs, NULL); 2877 if (refs > 1) 2878 return 1; 2879 } 2880 return 0; 2881 } 2882 2883 /* 2884 * helper to start transaction for unlink and rmdir. 2885 * 2886 * unlink and rmdir are special in btrfs, they do not always free space.
2887 * so in enospc case, we should make sure they will free space before 2888 * allowing them to use the global metadata reservation. 2889 */ 2890 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, 2891 struct dentry *dentry) 2892 { 2893 struct btrfs_trans_handle *trans; 2894 struct btrfs_root *root = BTRFS_I(dir)->root; 2895 struct btrfs_path *path; 2896 struct btrfs_dir_item *di; 2897 struct inode *inode = dentry->d_inode; 2898 u64 index; 2899 int check_link = 1; 2900 int err = -ENOSPC; 2901 int ret; 2902 u64 ino = btrfs_ino(inode); 2903 u64 dir_ino = btrfs_ino(dir); 2904 2905 /* 2906 * 1 for the possible orphan item 2907 * 1 for the dir item 2908 * 1 for the dir index 2909 * 1 for the inode ref 2910 * 1 for the inode ref in the tree log 2911 * 2 for the dir entries in the log 2912 * 1 for the inode 2913 */ 2914 trans = btrfs_start_transaction(root, 8); 2915 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2916 return trans; 2917 2918 if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 2919 return ERR_PTR(-ENOSPC); 2920 2921 /* check if there is someone else holds reference */ 2922 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) 2923 return ERR_PTR(-ENOSPC); 2924 2925 if (atomic_read(&inode->i_count) > 2) 2926 return ERR_PTR(-ENOSPC); 2927 2928 if (xchg(&root->fs_info->enospc_unlink, 1)) 2929 return ERR_PTR(-ENOSPC); 2930 2931 path = btrfs_alloc_path(); 2932 if (!path) { 2933 root->fs_info->enospc_unlink = 0; 2934 return ERR_PTR(-ENOMEM); 2935 } 2936 2937 /* 1 for the orphan item */ 2938 trans = btrfs_start_transaction(root, 1); 2939 if (IS_ERR(trans)) { 2940 btrfs_free_path(path); 2941 root->fs_info->enospc_unlink = 0; 2942 return trans; 2943 } 2944 2945 path->skip_locking = 1; 2946 path->search_commit_root = 1; 2947 2948 ret = btrfs_lookup_inode(trans, root, path, 2949 &BTRFS_I(dir)->location, 0); 2950 if (ret < 0) { 2951 err = ret; 2952 goto out; 2953 } 2954 if (ret == 0) { 2955 if (check_path_shared(root, path)) 2956 goto out; 2957 } else { 2958 check_link = 0; 2959 } 2960 btrfs_release_path(path); 2961 2962 ret = btrfs_lookup_inode(trans, root, path, 2963 &BTRFS_I(inode)->location, 0); 2964 if (ret < 0) { 2965 err = ret; 2966 goto out; 2967 } 2968 if (ret == 0) { 2969 if (check_path_shared(root, path)) 2970 goto out; 2971 } else { 2972 check_link = 0; 2973 } 2974 btrfs_release_path(path); 2975 2976 if (ret == 0 && S_ISREG(inode->i_mode)) { 2977 ret = btrfs_lookup_file_extent(trans, root, path, 2978 ino, (u64)-1, 0); 2979 if (ret < 0) { 2980 err = ret; 2981 goto out; 2982 } 2983 BUG_ON(ret == 0); /* Corruption */ 2984 if (check_path_shared(root, path)) 2985 goto out; 2986 btrfs_release_path(path); 2987 } 2988 2989 if (!check_link) { 2990 err = 0; 2991 goto out; 2992 } 2993 2994 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 2995 dentry->d_name.name, dentry->d_name.len, 0); 2996 if (IS_ERR(di)) { 2997 err = PTR_ERR(di); 2998 goto out; 2999 } 3000 if (di) { 3001 if (check_path_shared(root, path)) 3002 goto out; 3003 } else { 3004 err = 0; 3005 goto out; 3006 } 3007 btrfs_release_path(path); 3008 3009 ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name, 3010 dentry->d_name.len, ino, dir_ino, 0, 3011 &index); 3012 if (ret) { 3013 err = ret; 3014 goto out; 3015 } 3016 3017 if (check_path_shared(root, path)) 3018 goto out; 3019 3020 btrfs_release_path(path); 3021 3022 /* 3023 * This is a commit root search, if we can lookup inode item and other 3024 * relative items in the commit root, it means the transaction of 3025 * dir/file 
creation has been committed, and the dir index item that we 3026 * delay to insert has also been inserted into the commit root. So 3027 * we needn't worry about the delayed insertion of the dir index item 3028 * here. 3029 */ 3030 di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index, 3031 dentry->d_name.name, dentry->d_name.len, 0); 3032 if (IS_ERR(di)) { 3033 err = PTR_ERR(di); 3034 goto out; 3035 } 3036 BUG_ON(ret == -ENOENT); 3037 if (check_path_shared(root, path)) 3038 goto out; 3039 3040 err = 0; 3041 out: 3042 btrfs_free_path(path); 3043 /* Migrate the orphan reservation over */ 3044 if (!err) 3045 err = btrfs_block_rsv_migrate(trans->block_rsv, 3046 &root->fs_info->global_block_rsv, 3047 trans->bytes_reserved); 3048 3049 if (err) { 3050 btrfs_end_transaction(trans, root); 3051 root->fs_info->enospc_unlink = 0; 3052 return ERR_PTR(err); 3053 } 3054 3055 trans->block_rsv = &root->fs_info->global_block_rsv; 3056 return trans; 3057 } 3058 3059 static void __unlink_end_trans(struct btrfs_trans_handle *trans, 3060 struct btrfs_root *root) 3061 { 3062 if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) { 3063 btrfs_block_rsv_release(root, trans->block_rsv, 3064 trans->bytes_reserved); 3065 trans->block_rsv = &root->fs_info->trans_block_rsv; 3066 BUG_ON(!root->fs_info->enospc_unlink); 3067 root->fs_info->enospc_unlink = 0; 3068 } 3069 btrfs_end_transaction(trans, root); 3070 } 3071 3072 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 3073 { 3074 struct btrfs_root *root = BTRFS_I(dir)->root; 3075 struct btrfs_trans_handle *trans; 3076 struct inode *inode = dentry->d_inode; 3077 int ret; 3078 unsigned long nr = 0; 3079 3080 trans = __unlink_start_trans(dir, dentry); 3081 if (IS_ERR(trans)) 3082 return PTR_ERR(trans); 3083 3084 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 3085 3086 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3087 dentry->d_name.name, dentry->d_name.len); 3088 if (ret) 3089 goto out; 3090 3091 if (inode->i_nlink == 0) { 3092 ret = btrfs_orphan_add(trans, inode); 3093 if (ret) 3094 goto out; 3095 } 3096 3097 out: 3098 nr = trans->blocks_used; 3099 __unlink_end_trans(trans, root); 3100 btrfs_btree_balance_dirty(root, nr); 3101 return ret; 3102 } 3103 3104 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 3105 struct btrfs_root *root, 3106 struct inode *dir, u64 objectid, 3107 const char *name, int name_len) 3108 { 3109 struct btrfs_path *path; 3110 struct extent_buffer *leaf; 3111 struct btrfs_dir_item *di; 3112 struct btrfs_key key; 3113 u64 index; 3114 int ret; 3115 u64 dir_ino = btrfs_ino(dir); 3116 3117 path = btrfs_alloc_path(); 3118 if (!path) 3119 return -ENOMEM; 3120 3121 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 3122 name, name_len, -1); 3123 if (IS_ERR_OR_NULL(di)) { 3124 if (!di) 3125 ret = -ENOENT; 3126 else 3127 ret = PTR_ERR(di); 3128 goto out; 3129 } 3130 3131 leaf = path->nodes[0]; 3132 btrfs_dir_item_key_to_cpu(leaf, di, &key); 3133 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 3134 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3135 if (ret) { 3136 btrfs_abort_transaction(trans, root, ret); 3137 goto out; 3138 } 3139 btrfs_release_path(path); 3140 3141 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, 3142 objectid, root->root_key.objectid, 3143 dir_ino, &index, name, name_len); 3144 if (ret < 0) { 3145 if (ret != -ENOENT) { 3146 btrfs_abort_transaction(trans, root, ret); 3147 goto out; 3148 } 3149 di = btrfs_search_dir_index_item(root, 
path, dir_ino, 3150 name, name_len); 3151 if (IS_ERR_OR_NULL(di)) { 3152 if (!di) 3153 ret = -ENOENT; 3154 else 3155 ret = PTR_ERR(di); 3156 btrfs_abort_transaction(trans, root, ret); 3157 goto out; 3158 } 3159 3160 leaf = path->nodes[0]; 3161 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3162 btrfs_release_path(path); 3163 index = key.offset; 3164 } 3165 btrfs_release_path(path); 3166 3167 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 3168 if (ret) { 3169 btrfs_abort_transaction(trans, root, ret); 3170 goto out; 3171 } 3172 3173 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3174 inode_inc_iversion(dir); 3175 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3176 ret = btrfs_update_inode_fallback(trans, root, dir); 3177 if (ret) 3178 btrfs_abort_transaction(trans, root, ret); 3179 out: 3180 btrfs_free_path(path); 3181 return ret; 3182 } 3183 3184 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 3185 { 3186 struct inode *inode = dentry->d_inode; 3187 int err = 0; 3188 struct btrfs_root *root = BTRFS_I(dir)->root; 3189 struct btrfs_trans_handle *trans; 3190 unsigned long nr = 0; 3191 3192 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 3193 return -ENOTEMPTY; 3194 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) 3195 return -EPERM; 3196 3197 trans = __unlink_start_trans(dir, dentry); 3198 if (IS_ERR(trans)) 3199 return PTR_ERR(trans); 3200 3201 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 3202 err = btrfs_unlink_subvol(trans, root, dir, 3203 BTRFS_I(inode)->location.objectid, 3204 dentry->d_name.name, 3205 dentry->d_name.len); 3206 goto out; 3207 } 3208 3209 err = btrfs_orphan_add(trans, inode); 3210 if (err) 3211 goto out; 3212 3213 /* now the directory is empty */ 3214 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3215 dentry->d_name.name, dentry->d_name.len); 3216 if (!err) 3217 btrfs_i_size_write(inode, 0); 3218 out: 3219 nr = trans->blocks_used; 3220 __unlink_end_trans(trans, root); 3221 btrfs_btree_balance_dirty(root, nr); 3222 3223 return err; 3224 } 3225 3226 /* 3227 * this can truncate away extent items, csum items and directory items. 3228 * It starts at a high offset and removes keys until it can't find 3229 * any higher than new_size 3230 * 3231 * csum items that cross the new i_size are truncated to the new size 3232 * as well. 3233 * 3234 * min_type is the minimum key type to truncate down to. If set to 0, this 3235 * will kill all the items on this inode, including the INODE_ITEM_KEY. 
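 *
 * Items are removed from the highest key downwards, and runs of
 * adjacent leaf slots are batched into a single btrfs_del_items() call
 * (tracked by pending_del_slot/pending_del_nr below) to cut down the
 * number of btree operations.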
3236 */ 3237 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 3238 struct btrfs_root *root, 3239 struct inode *inode, 3240 u64 new_size, u32 min_type) 3241 { 3242 struct btrfs_path *path; 3243 struct extent_buffer *leaf; 3244 struct btrfs_file_extent_item *fi; 3245 struct btrfs_key key; 3246 struct btrfs_key found_key; 3247 u64 extent_start = 0; 3248 u64 extent_num_bytes = 0; 3249 u64 extent_offset = 0; 3250 u64 item_end = 0; 3251 u64 mask = root->sectorsize - 1; 3252 u32 found_type = (u8)-1; 3253 int found_extent; 3254 int del_item; 3255 int pending_del_nr = 0; 3256 int pending_del_slot = 0; 3257 int extent_type = -1; 3258 int ret; 3259 int err = 0; 3260 u64 ino = btrfs_ino(inode); 3261 3262 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3263 3264 path = btrfs_alloc_path(); 3265 if (!path) 3266 return -ENOMEM; 3267 path->reada = -1; 3268 3269 /* 3270 * We want to drop from the next block forward in case this new size is 3271 * not block aligned since we will be keeping the last block of the 3272 * extent just the way it is. 3273 */ 3274 if (root->ref_cows || root == root->fs_info->tree_root) 3275 btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); 3276 3277 /* 3278 * This function is also used to drop the items in the log tree before 3279 * we relog the inode, so if root != BTRFS_I(inode)->root, it means 3280 * it is used to drop the logged items. So we shouldn't kill the delayed 3281 * items. 3282 */ 3283 if (min_type == 0 && root == BTRFS_I(inode)->root) 3284 btrfs_kill_delayed_inode_items(inode); 3285 3286 key.objectid = ino; 3287 key.offset = (u64)-1; 3288 key.type = (u8)-1; 3289 3290 search_again: 3291 path->leave_spinning = 1; 3292 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3293 if (ret < 0) { 3294 err = ret; 3295 goto out; 3296 } 3297 3298 if (ret > 0) { 3299 /* there are no items in the tree for us to truncate, we're 3300 * done 3301 */ 3302 if (path->slots[0] == 0) 3303 goto out; 3304 path->slots[0]--; 3305 } 3306 3307 while (1) { 3308 fi = NULL; 3309 leaf = path->nodes[0]; 3310 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3311 found_type = btrfs_key_type(&found_key); 3312 3313 if (found_key.objectid != ino) 3314 break; 3315 3316 if (found_type < min_type) 3317 break; 3318 3319 item_end = found_key.offset; 3320 if (found_type == BTRFS_EXTENT_DATA_KEY) { 3321 fi = btrfs_item_ptr(leaf, path->slots[0], 3322 struct btrfs_file_extent_item); 3323 extent_type = btrfs_file_extent_type(leaf, fi); 3324 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3325 item_end += 3326 btrfs_file_extent_num_bytes(leaf, fi); 3327 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3328 item_end += btrfs_file_extent_inline_len(leaf, 3329 fi); 3330 } 3331 item_end--; 3332 } 3333 if (found_type > min_type) { 3334 del_item = 1; 3335 } else { 3336 if (item_end < new_size) 3337 break; 3338 if (found_key.offset >= new_size) 3339 del_item = 1; 3340 else 3341 del_item = 0; 3342 } 3343 found_extent = 0; 3344 /* FIXME, shrink the extent if the ref count is only 1 */ 3345 if (found_type != BTRFS_EXTENT_DATA_KEY) 3346 goto delete; 3347 3348 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3349 u64 num_dec; 3350 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 3351 if (!del_item) { 3352 u64 orig_num_bytes = 3353 btrfs_file_extent_num_bytes(leaf, fi); 3354 extent_num_bytes = new_size - 3355 found_key.offset + root->sectorsize - 1; 3356 extent_num_bytes = extent_num_bytes & 3357 ~((u64)root->sectorsize - 1); 3358
btrfs_set_file_extent_num_bytes(leaf, fi, 3359 extent_num_bytes); 3360 num_dec = (orig_num_bytes - 3361 extent_num_bytes); 3362 if (root->ref_cows && extent_start != 0) 3363 inode_sub_bytes(inode, num_dec); 3364 btrfs_mark_buffer_dirty(leaf); 3365 } else { 3366 extent_num_bytes = 3367 btrfs_file_extent_disk_num_bytes(leaf, 3368 fi); 3369 extent_offset = found_key.offset - 3370 btrfs_file_extent_offset(leaf, fi); 3371 3372 /* FIXME blocksize != 4096 */ 3373 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 3374 if (extent_start != 0) { 3375 found_extent = 1; 3376 if (root->ref_cows) 3377 inode_sub_bytes(inode, num_dec); 3378 } 3379 } 3380 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3381 /* 3382 * we can't truncate inline items that have had 3383 * special encodings 3384 */ 3385 if (!del_item && 3386 btrfs_file_extent_compression(leaf, fi) == 0 && 3387 btrfs_file_extent_encryption(leaf, fi) == 0 && 3388 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 3389 u32 size = new_size - found_key.offset; 3390 3391 if (root->ref_cows) { 3392 inode_sub_bytes(inode, item_end + 1 - 3393 new_size); 3394 } 3395 size = 3396 btrfs_file_extent_calc_inline_size(size); 3397 btrfs_truncate_item(trans, root, path, 3398 size, 1); 3399 } else if (root->ref_cows) { 3400 inode_sub_bytes(inode, item_end + 1 - 3401 found_key.offset); 3402 } 3403 } 3404 delete: 3405 if (del_item) { 3406 if (!pending_del_nr) { 3407 /* no pending yet, add ourselves */ 3408 pending_del_slot = path->slots[0]; 3409 pending_del_nr = 1; 3410 } else if (pending_del_nr && 3411 path->slots[0] + 1 == pending_del_slot) { 3412 /* hop on the pending chunk */ 3413 pending_del_nr++; 3414 pending_del_slot = path->slots[0]; 3415 } else { 3416 BUG(); 3417 } 3418 } else { 3419 break; 3420 } 3421 if (found_extent && (root->ref_cows || 3422 root == root->fs_info->tree_root)) { 3423 btrfs_set_path_blocking(path); 3424 ret = btrfs_free_extent(trans, root, extent_start, 3425 extent_num_bytes, 0, 3426 btrfs_header_owner(leaf), 3427 ino, extent_offset, 0); 3428 BUG_ON(ret); 3429 } 3430 3431 if (found_type == BTRFS_INODE_ITEM_KEY) 3432 break; 3433 3434 if (path->slots[0] == 0 || 3435 path->slots[0] != pending_del_slot) { 3436 if (pending_del_nr) { 3437 ret = btrfs_del_items(trans, root, path, 3438 pending_del_slot, 3439 pending_del_nr); 3440 if (ret) { 3441 btrfs_abort_transaction(trans, 3442 root, ret); 3443 goto error; 3444 } 3445 pending_del_nr = 0; 3446 } 3447 btrfs_release_path(path); 3448 goto search_again; 3449 } else { 3450 path->slots[0]--; 3451 } 3452 } 3453 out: 3454 if (pending_del_nr) { 3455 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3456 pending_del_nr); 3457 if (ret) 3458 btrfs_abort_transaction(trans, root, ret); 3459 } 3460 error: 3461 btrfs_free_path(path); 3462 return err; 3463 } 3464 3465 /* 3466 * btrfs_truncate_page - read, zero a chunk and write a page 3467 * @inode - inode that we're zeroing 3468 * @from - the offset to start zeroing 3469 * @len - the length to zero, 0 to zero the entire range respective to the 3470 * offset 3471 * @front - zero up to the offset instead of from the offset on 3472 * 3473 * This will find the page for the "from" offset and cow the page and zero the 3474 * part we want to zero. This is used with truncate and hole punching. 
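 *
 * For example, truncating a file to an offset 1000 bytes into its last
 * page zeroes bytes 1000 through PAGE_CACHE_SIZE - 1 of that page
 * (front == 0), while hole punching passes front == 1 to zero bytes 0
 * through 999 instead.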
3475 */ 3476 int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, 3477 int front) 3478 { 3479 struct address_space *mapping = inode->i_mapping; 3480 struct btrfs_root *root = BTRFS_I(inode)->root; 3481 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3482 struct btrfs_ordered_extent *ordered; 3483 struct extent_state *cached_state = NULL; 3484 char *kaddr; 3485 u32 blocksize = root->sectorsize; 3486 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3487 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3488 struct page *page; 3489 gfp_t mask = btrfs_alloc_write_mask(mapping); 3490 int ret = 0; 3491 u64 page_start; 3492 u64 page_end; 3493 3494 if ((offset & (blocksize - 1)) == 0 && 3495 (!len || ((len & (blocksize - 1)) == 0))) 3496 goto out; 3497 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 3498 if (ret) 3499 goto out; 3500 3501 ret = -ENOMEM; 3502 again: 3503 page = find_or_create_page(mapping, index, mask); 3504 if (!page) { 3505 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3506 goto out; 3507 } 3508 3509 page_start = page_offset(page); 3510 page_end = page_start + PAGE_CACHE_SIZE - 1; 3511 3512 if (!PageUptodate(page)) { 3513 ret = btrfs_readpage(NULL, page); 3514 lock_page(page); 3515 if (page->mapping != mapping) { 3516 unlock_page(page); 3517 page_cache_release(page); 3518 goto again; 3519 } 3520 if (!PageUptodate(page)) { 3521 ret = -EIO; 3522 goto out_unlock; 3523 } 3524 } 3525 wait_on_page_writeback(page); 3526 3527 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); 3528 set_page_extent_mapped(page); 3529 3530 ordered = btrfs_lookup_ordered_extent(inode, page_start); 3531 if (ordered) { 3532 unlock_extent_cached(io_tree, page_start, page_end, 3533 &cached_state, GFP_NOFS); 3534 unlock_page(page); 3535 page_cache_release(page); 3536 btrfs_start_ordered_extent(inode, ordered, 1); 3537 btrfs_put_ordered_extent(ordered); 3538 goto again; 3539 } 3540 3541 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 3542 EXTENT_DIRTY | EXTENT_DELALLOC | 3543 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 3544 0, 0, &cached_state, GFP_NOFS); 3545 3546 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 3547 &cached_state); 3548 if (ret) { 3549 unlock_extent_cached(io_tree, page_start, page_end, 3550 &cached_state, GFP_NOFS); 3551 goto out_unlock; 3552 } 3553 3554 ret = 0; 3555 if (offset != PAGE_CACHE_SIZE) { 3556 if (!len) 3557 len = PAGE_CACHE_SIZE - offset; 3558 kaddr = kmap(page); 3559 if (front) 3560 memset(kaddr, 0, offset); 3561 else 3562 memset(kaddr + offset, 0, len); 3563 flush_dcache_page(page); 3564 kunmap(page); 3565 } 3566 ClearPageChecked(page); 3567 set_page_dirty(page); 3568 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, 3569 GFP_NOFS); 3570 3571 out_unlock: 3572 if (ret) 3573 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3574 unlock_page(page); 3575 page_cache_release(page); 3576 out: 3577 return ret; 3578 } 3579 3580 /* 3581 * This function puts in dummy file extents for the area we're creating a hole 3582 * for. 
So if we are truncating this file to a larger size we need to insert 3583 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for 3584 * the range between oldsize and size 3585 */ 3586 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 3587 { 3588 struct btrfs_trans_handle *trans; 3589 struct btrfs_root *root = BTRFS_I(inode)->root; 3590 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3591 struct extent_map *em = NULL; 3592 struct extent_state *cached_state = NULL; 3593 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 3594 u64 mask = root->sectorsize - 1; 3595 u64 hole_start = (oldsize + mask) & ~mask; 3596 u64 block_end = (size + mask) & ~mask; 3597 u64 last_byte; 3598 u64 cur_offset; 3599 u64 hole_size; 3600 int err = 0; 3601 3602 if (size <= hole_start) 3603 return 0; 3604 3605 while (1) { 3606 struct btrfs_ordered_extent *ordered; 3607 btrfs_wait_ordered_range(inode, hole_start, 3608 block_end - hole_start); 3609 lock_extent_bits(io_tree, hole_start, block_end - 1, 0, 3610 &cached_state); 3611 ordered = btrfs_lookup_ordered_extent(inode, hole_start); 3612 if (!ordered) 3613 break; 3614 unlock_extent_cached(io_tree, hole_start, block_end - 1, 3615 &cached_state, GFP_NOFS); 3616 btrfs_put_ordered_extent(ordered); 3617 } 3618 3619 cur_offset = hole_start; 3620 while (1) { 3621 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 3622 block_end - cur_offset, 0); 3623 if (IS_ERR(em)) { 3624 err = PTR_ERR(em); 3625 break; 3626 } 3627 last_byte = min(extent_map_end(em), block_end); 3628 last_byte = (last_byte + mask) & ~mask; 3629 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3630 struct extent_map *hole_em; 3631 hole_size = last_byte - cur_offset; 3632 3633 trans = btrfs_start_transaction(root, 3); 3634 if (IS_ERR(trans)) { 3635 err = PTR_ERR(trans); 3636 break; 3637 } 3638 3639 err = btrfs_drop_extents(trans, root, inode, 3640 cur_offset, 3641 cur_offset + hole_size, 1); 3642 if (err) { 3643 btrfs_abort_transaction(trans, root, err); 3644 btrfs_end_transaction(trans, root); 3645 break; 3646 } 3647 3648 err = btrfs_insert_file_extent(trans, root, 3649 btrfs_ino(inode), cur_offset, 0, 3650 0, hole_size, 0, hole_size, 3651 0, 0, 0); 3652 if (err) { 3653 btrfs_abort_transaction(trans, root, err); 3654 btrfs_end_transaction(trans, root); 3655 break; 3656 } 3657 3658 btrfs_drop_extent_cache(inode, cur_offset, 3659 cur_offset + hole_size - 1, 0); 3660 hole_em = alloc_extent_map(); 3661 if (!hole_em) { 3662 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3663 &BTRFS_I(inode)->runtime_flags); 3664 goto next; 3665 } 3666 hole_em->start = cur_offset; 3667 hole_em->len = hole_size; 3668 hole_em->orig_start = cur_offset; 3669 3670 hole_em->block_start = EXTENT_MAP_HOLE; 3671 hole_em->block_len = 0; 3672 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 3673 hole_em->compress_type = BTRFS_COMPRESS_NONE; 3674 hole_em->generation = trans->transid; 3675 3676 while (1) { 3677 write_lock(&em_tree->lock); 3678 err = add_extent_mapping(em_tree, hole_em); 3679 if (!err) 3680 list_move(&hole_em->list, 3681 &em_tree->modified_extents); 3682 write_unlock(&em_tree->lock); 3683 if (err != -EEXIST) 3684 break; 3685 btrfs_drop_extent_cache(inode, cur_offset, 3686 cur_offset + 3687 hole_size - 1, 0); 3688 } 3689 free_extent_map(hole_em); 3690 next: 3691 btrfs_update_inode(trans, root, inode); 3692 btrfs_end_transaction(trans, root); 3693 } 3694 free_extent_map(em); 3695 em = NULL; 3696 cur_offset = last_byte; 3697 if (cur_offset >= block_end) 3698 
break; 3699 } 3700 3701 free_extent_map(em); 3702 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3703 GFP_NOFS); 3704 return err; 3705 } 3706 3707 static int btrfs_setsize(struct inode *inode, loff_t newsize) 3708 { 3709 struct btrfs_root *root = BTRFS_I(inode)->root; 3710 struct btrfs_trans_handle *trans; 3711 loff_t oldsize = i_size_read(inode); 3712 int ret; 3713 3714 if (newsize == oldsize) 3715 return 0; 3716 3717 if (newsize > oldsize) { 3718 truncate_pagecache(inode, oldsize, newsize); 3719 ret = btrfs_cont_expand(inode, oldsize, newsize); 3720 if (ret) 3721 return ret; 3722 3723 trans = btrfs_start_transaction(root, 1); 3724 if (IS_ERR(trans)) 3725 return PTR_ERR(trans); 3726 3727 i_size_write(inode, newsize); 3728 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 3729 ret = btrfs_update_inode(trans, root, inode); 3730 btrfs_end_transaction(trans, root); 3731 } else { 3732 3733 /* 3734 * We're truncating a file that used to have good data down to 3735 * zero. Make sure it gets into the ordered flush list so that 3736 * any new writes get down to disk quickly. 3737 */ 3738 if (newsize == 0) 3739 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 3740 &BTRFS_I(inode)->runtime_flags); 3741 3742 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 3743 truncate_setsize(inode, newsize); 3744 ret = btrfs_truncate(inode); 3745 } 3746 3747 return ret; 3748 } 3749 3750 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3751 { 3752 struct inode *inode = dentry->d_inode; 3753 struct btrfs_root *root = BTRFS_I(inode)->root; 3754 int err; 3755 3756 if (btrfs_root_readonly(root)) 3757 return -EROFS; 3758 3759 err = inode_change_ok(inode, attr); 3760 if (err) 3761 return err; 3762 3763 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3764 err = btrfs_setsize(inode, attr->ia_size); 3765 if (err) 3766 return err; 3767 } 3768 3769 if (attr->ia_valid) { 3770 setattr_copy(inode, attr); 3771 inode_inc_iversion(inode); 3772 err = btrfs_dirty_inode(inode); 3773 3774 if (!err && attr->ia_valid & ATTR_MODE) 3775 err = btrfs_acl_chmod(inode); 3776 } 3777 3778 return err; 3779 } 3780 3781 void btrfs_evict_inode(struct inode *inode) 3782 { 3783 struct btrfs_trans_handle *trans; 3784 struct btrfs_root *root = BTRFS_I(inode)->root; 3785 struct btrfs_block_rsv *rsv, *global_rsv; 3786 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 3787 unsigned long nr; 3788 int ret; 3789 3790 trace_btrfs_inode_evict(inode); 3791 3792 truncate_inode_pages(&inode->i_data, 0); 3793 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3794 btrfs_is_free_space_inode(inode))) 3795 goto no_delete; 3796 3797 if (is_bad_inode(inode)) { 3798 btrfs_orphan_del(NULL, inode); 3799 goto no_delete; 3800 } 3801 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? 
*/ 3802 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3803 3804 if (root->fs_info->log_root_recovering) { 3805 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3806 &BTRFS_I(inode)->runtime_flags)); 3807 goto no_delete; 3808 } 3809 3810 if (inode->i_nlink > 0) { 3811 BUG_ON(btrfs_root_refs(&root->root_item) != 0); 3812 goto no_delete; 3813 } 3814 3815 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 3816 if (!rsv) { 3817 btrfs_orphan_del(NULL, inode); 3818 goto no_delete; 3819 } 3820 rsv->size = min_size; 3821 rsv->failfast = 1; 3822 global_rsv = &root->fs_info->global_block_rsv; 3823 3824 btrfs_i_size_write(inode, 0); 3825 3826 /* 3827 * This is a bit simpler than btrfs_truncate since we've already 3828 * reserved our space for our orphan item in the unlink, so we just 3829 * need to reserve some slack space in case we add bytes and update 3830 * inode item when doing the truncate. 3831 */ 3832 while (1) { 3833 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3834 3835 /* 3836 * Try and steal from the global reserve since we will 3837 * likely not use this space anyway, we want to try as 3838 * hard as possible to get this to work. 3839 */ 3840 if (ret) 3841 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); 3842 3843 if (ret) { 3844 printk(KERN_WARNING "Could not get space for a " 3845 "delete, will truncate on mount %d\n", ret); 3846 btrfs_orphan_del(NULL, inode); 3847 btrfs_free_block_rsv(root, rsv); 3848 goto no_delete; 3849 } 3850 3851 trans = btrfs_start_transaction_noflush(root, 1); 3852 if (IS_ERR(trans)) { 3853 btrfs_orphan_del(NULL, inode); 3854 btrfs_free_block_rsv(root, rsv); 3855 goto no_delete; 3856 } 3857 3858 trans->block_rsv = rsv; 3859 3860 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3861 if (ret != -ENOSPC) 3862 break; 3863 3864 trans->block_rsv = &root->fs_info->trans_block_rsv; 3865 ret = btrfs_update_inode(trans, root, inode); 3866 BUG_ON(ret); 3867 3868 nr = trans->blocks_used; 3869 btrfs_end_transaction(trans, root); 3870 trans = NULL; 3871 btrfs_btree_balance_dirty(root, nr); 3872 } 3873 3874 btrfs_free_block_rsv(root, rsv); 3875 3876 if (ret == 0) { 3877 trans->block_rsv = root->orphan_block_rsv; 3878 ret = btrfs_orphan_del(trans, inode); 3879 BUG_ON(ret); 3880 } 3881 3882 trans->block_rsv = &root->fs_info->trans_block_rsv; 3883 if (!(root == root->fs_info->tree_root || 3884 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3885 btrfs_return_ino(root, btrfs_ino(inode)); 3886 3887 nr = trans->blocks_used; 3888 btrfs_end_transaction(trans, root); 3889 btrfs_btree_balance_dirty(root, nr); 3890 no_delete: 3891 clear_inode(inode); 3892 return; 3893 } 3894 3895 /* 3896 * this returns the key found in the dir entry in the location pointer. 3897 * If no dir entries were found, location->objectid is 0. 
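 *
 * An illustrative caller, modelled on btrfs_lookup_dentry() further
 * down in this file (a sketch, not original source text):
 *
 *	struct btrfs_key location;
 *	int err = btrfs_inode_by_name(dir, dentry, &location);
 *	if (err < 0)
 *		return ERR_PTR(err);
 *	if (location.objectid == 0)
 *		return NULL;	(negative lookup: no dir entry)
 *	(location now names either an inode item or a subvol root)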
3898 */ 3899 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 3900 struct btrfs_key *location) 3901 { 3902 const char *name = dentry->d_name.name; 3903 int namelen = dentry->d_name.len; 3904 struct btrfs_dir_item *di; 3905 struct btrfs_path *path; 3906 struct btrfs_root *root = BTRFS_I(dir)->root; 3907 int ret = 0; 3908 3909 path = btrfs_alloc_path(); 3910 if (!path) 3911 return -ENOMEM; 3912 3913 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, 3914 namelen, 0); 3915 if (IS_ERR(di)) 3916 ret = PTR_ERR(di); 3917 3918 if (IS_ERR_OR_NULL(di)) 3919 goto out_err; 3920 3921 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 3922 out: 3923 btrfs_free_path(path); 3924 return ret; 3925 out_err: 3926 location->objectid = 0; 3927 goto out; 3928 } 3929 3930 /* 3931 * when we hit a tree root in a directory, the btrfs part of the inode 3932 * needs to be changed to reflect the root directory of the tree root. This 3933 * is kind of like crossing a mount point. 3934 */ 3935 static int fixup_tree_root_location(struct btrfs_root *root, 3936 struct inode *dir, 3937 struct dentry *dentry, 3938 struct btrfs_key *location, 3939 struct btrfs_root **sub_root) 3940 { 3941 struct btrfs_path *path; 3942 struct btrfs_root *new_root; 3943 struct btrfs_root_ref *ref; 3944 struct extent_buffer *leaf; 3945 int ret; 3946 int err = 0; 3947 3948 path = btrfs_alloc_path(); 3949 if (!path) { 3950 err = -ENOMEM; 3951 goto out; 3952 } 3953 3954 err = -ENOENT; 3955 ret = btrfs_find_root_ref(root->fs_info->tree_root, path, 3956 BTRFS_I(dir)->root->root_key.objectid, 3957 location->objectid); 3958 if (ret) { 3959 if (ret < 0) 3960 err = ret; 3961 goto out; 3962 } 3963 3964 leaf = path->nodes[0]; 3965 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 3966 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || 3967 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 3968 goto out; 3969 3970 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 3971 (unsigned long)(ref + 1), 3972 dentry->d_name.len); 3973 if (ret) 3974 goto out; 3975 3976 btrfs_release_path(path); 3977 3978 new_root = btrfs_read_fs_root_no_name(root->fs_info, location); 3979 if (IS_ERR(new_root)) { 3980 err = PTR_ERR(new_root); 3981 goto out; 3982 } 3983 3984 if (btrfs_root_refs(&new_root->root_item) == 0) { 3985 err = -ENOENT; 3986 goto out; 3987 } 3988 3989 *sub_root = new_root; 3990 location->objectid = btrfs_root_dirid(&new_root->root_item); 3991 location->type = BTRFS_INODE_ITEM_KEY; 3992 location->offset = 0; 3993 err = 0; 3994 out: 3995 btrfs_free_path(path); 3996 return err; 3997 } 3998 3999 static void inode_tree_add(struct inode *inode) 4000 { 4001 struct btrfs_root *root = BTRFS_I(inode)->root; 4002 struct btrfs_inode *entry; 4003 struct rb_node **p; 4004 struct rb_node *parent; 4005 u64 ino = btrfs_ino(inode); 4006 again: 4007 p = &root->inode_tree.rb_node; 4008 parent = NULL; 4009 4010 if (inode_unhashed(inode)) 4011 return; 4012 4013 spin_lock(&root->inode_lock); 4014 while (*p) { 4015 parent = *p; 4016 entry = rb_entry(parent, struct btrfs_inode, rb_node); 4017 4018 if (ino < btrfs_ino(&entry->vfs_inode)) 4019 p = &parent->rb_left; 4020 else if (ino > btrfs_ino(&entry->vfs_inode)) 4021 p = &parent->rb_right; 4022 else { 4023 WARN_ON(!(entry->vfs_inode.i_state & 4024 (I_WILL_FREE | I_FREEING))); 4025 rb_erase(parent, &root->inode_tree); 4026 RB_CLEAR_NODE(parent); 4027 spin_unlock(&root->inode_lock); 4028 goto again; 4029 } 4030 } 4031 rb_link_node(&BTRFS_I(inode)->rb_node, 
parent, p); 4032 rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); 4033 spin_unlock(&root->inode_lock); 4034 } 4035 4036 static void inode_tree_del(struct inode *inode) 4037 { 4038 struct btrfs_root *root = BTRFS_I(inode)->root; 4039 int empty = 0; 4040 4041 spin_lock(&root->inode_lock); 4042 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 4043 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 4044 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 4045 empty = RB_EMPTY_ROOT(&root->inode_tree); 4046 } 4047 spin_unlock(&root->inode_lock); 4048 4049 /* 4050 * Free space cache has inodes in the tree root, but the tree root has a 4051 * root_refs of 0, so this could end up dropping the tree root as a 4052 * snapshot, so we need the extra !root->fs_info->tree_root check to 4053 * make sure we don't drop it. 4054 */ 4055 if (empty && btrfs_root_refs(&root->root_item) == 0 && 4056 root != root->fs_info->tree_root) { 4057 synchronize_srcu(&root->fs_info->subvol_srcu); 4058 spin_lock(&root->inode_lock); 4059 empty = RB_EMPTY_ROOT(&root->inode_tree); 4060 spin_unlock(&root->inode_lock); 4061 if (empty) 4062 btrfs_add_dead_root(root); 4063 } 4064 } 4065 4066 void btrfs_invalidate_inodes(struct btrfs_root *root) 4067 { 4068 struct rb_node *node; 4069 struct rb_node *prev; 4070 struct btrfs_inode *entry; 4071 struct inode *inode; 4072 u64 objectid = 0; 4073 4074 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4075 4076 spin_lock(&root->inode_lock); 4077 again: 4078 node = root->inode_tree.rb_node; 4079 prev = NULL; 4080 while (node) { 4081 prev = node; 4082 entry = rb_entry(node, struct btrfs_inode, rb_node); 4083 4084 if (objectid < btrfs_ino(&entry->vfs_inode)) 4085 node = node->rb_left; 4086 else if (objectid > btrfs_ino(&entry->vfs_inode)) 4087 node = node->rb_right; 4088 else 4089 break; 4090 } 4091 if (!node) { 4092 while (prev) { 4093 entry = rb_entry(prev, struct btrfs_inode, rb_node); 4094 if (objectid <= btrfs_ino(&entry->vfs_inode)) { 4095 node = prev; 4096 break; 4097 } 4098 prev = rb_next(prev); 4099 } 4100 } 4101 while (node) { 4102 entry = rb_entry(node, struct btrfs_inode, rb_node); 4103 objectid = btrfs_ino(&entry->vfs_inode) + 1; 4104 inode = igrab(&entry->vfs_inode); 4105 if (inode) { 4106 spin_unlock(&root->inode_lock); 4107 if (atomic_read(&inode->i_count) > 1) 4108 d_prune_aliases(inode); 4109 /* 4110 * btrfs_drop_inode will have it removed from 4111 * the inode cache when its usage count 4112 * hits zero. 
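 *
 * (Added note, not in the original: igrab() above took its reference
 * while root->inode_lock was still held, so the inode cannot be freed
 * between dropping the lock and this iput(); the iput() itself may
 * then be the final reference drop that evicts it.)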
4113 */ 4114 iput(inode); 4115 cond_resched(); 4116 spin_lock(&root->inode_lock); 4117 goto again; 4118 } 4119 4120 if (cond_resched_lock(&root->inode_lock)) 4121 goto again; 4122 4123 node = rb_next(node); 4124 } 4125 spin_unlock(&root->inode_lock); 4126 } 4127 4128 static int btrfs_init_locked_inode(struct inode *inode, void *p) 4129 { 4130 struct btrfs_iget_args *args = p; 4131 inode->i_ino = args->ino; 4132 BTRFS_I(inode)->root = args->root; 4133 return 0; 4134 } 4135 4136 static int btrfs_find_actor(struct inode *inode, void *opaque) 4137 { 4138 struct btrfs_iget_args *args = opaque; 4139 return args->ino == btrfs_ino(inode) && 4140 args->root == BTRFS_I(inode)->root; 4141 } 4142 4143 static struct inode *btrfs_iget_locked(struct super_block *s, 4144 u64 objectid, 4145 struct btrfs_root *root) 4146 { 4147 struct inode *inode; 4148 struct btrfs_iget_args args; 4149 args.ino = objectid; 4150 args.root = root; 4151 4152 inode = iget5_locked(s, objectid, btrfs_find_actor, 4153 btrfs_init_locked_inode, 4154 (void *)&args); 4155 return inode; 4156 } 4157 4158 /* Get an inode object given its location and corresponding root. 4159 * Returns in *is_new if the inode was read from disk 4160 */ 4161 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 4162 struct btrfs_root *root, int *new) 4163 { 4164 struct inode *inode; 4165 4166 inode = btrfs_iget_locked(s, location->objectid, root); 4167 if (!inode) 4168 return ERR_PTR(-ENOMEM); 4169 4170 if (inode->i_state & I_NEW) { 4171 BTRFS_I(inode)->root = root; 4172 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 4173 btrfs_read_locked_inode(inode); 4174 if (!is_bad_inode(inode)) { 4175 inode_tree_add(inode); 4176 unlock_new_inode(inode); 4177 if (new) 4178 *new = 1; 4179 } else { 4180 unlock_new_inode(inode); 4181 iput(inode); 4182 inode = ERR_PTR(-ESTALE); 4183 } 4184 } 4185 4186 return inode; 4187 } 4188 4189 static struct inode *new_simple_dir(struct super_block *s, 4190 struct btrfs_key *key, 4191 struct btrfs_root *root) 4192 { 4193 struct inode *inode = new_inode(s); 4194 4195 if (!inode) 4196 return ERR_PTR(-ENOMEM); 4197 4198 BTRFS_I(inode)->root = root; 4199 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4200 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); 4201 4202 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 4203 inode->i_op = &btrfs_dir_ro_inode_operations; 4204 inode->i_fop = &simple_dir_operations; 4205 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 4206 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4207 4208 return inode; 4209 } 4210 4211 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 4212 { 4213 struct inode *inode; 4214 struct btrfs_root *root = BTRFS_I(dir)->root; 4215 struct btrfs_root *sub_root = root; 4216 struct btrfs_key location; 4217 int index; 4218 int ret = 0; 4219 4220 if (dentry->d_name.len > BTRFS_NAME_LEN) 4221 return ERR_PTR(-ENAMETOOLONG); 4222 4223 if (unlikely(d_need_lookup(dentry))) { 4224 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); 4225 kfree(dentry->d_fsdata); 4226 dentry->d_fsdata = NULL; 4227 /* This thing is hashed, drop it for now */ 4228 d_drop(dentry); 4229 } else { 4230 ret = btrfs_inode_by_name(dir, dentry, &location); 4231 } 4232 4233 if (ret < 0) 4234 return ERR_PTR(ret); 4235 4236 if (location.objectid == 0) 4237 return NULL; 4238 4239 if (location.type == BTRFS_INODE_ITEM_KEY) { 4240 inode = btrfs_iget(dir->i_sb, &location, root, NULL); 4241 return inode; 4242 } 4243 
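	/*
	 * (Added note, not original text: anything that is not an inode
	 * item must be a subvolume root; fixup_tree_root_location()
	 * below translates the key and hands back the subvolume's root.)
	 */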
4244 BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); 4245 4246 index = srcu_read_lock(&root->fs_info->subvol_srcu); 4247 ret = fixup_tree_root_location(root, dir, dentry, 4248 &location, &sub_root); 4249 if (ret < 0) { 4250 if (ret != -ENOENT) 4251 inode = ERR_PTR(ret); 4252 else 4253 inode = new_simple_dir(dir->i_sb, &location, sub_root); 4254 } else { 4255 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); 4256 } 4257 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4258 4259 if (!IS_ERR(inode) && root != sub_root) { 4260 down_read(&root->fs_info->cleanup_work_sem); 4261 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4262 ret = btrfs_orphan_cleanup(sub_root); 4263 up_read(&root->fs_info->cleanup_work_sem); 4264 if (ret) 4265 inode = ERR_PTR(ret); 4266 } 4267 4268 return inode; 4269 } 4270 4271 static int btrfs_dentry_delete(const struct dentry *dentry) 4272 { 4273 struct btrfs_root *root; 4274 struct inode *inode = dentry->d_inode; 4275 4276 if (!inode && !IS_ROOT(dentry)) 4277 inode = dentry->d_parent->d_inode; 4278 4279 if (inode) { 4280 root = BTRFS_I(inode)->root; 4281 if (btrfs_root_refs(&root->root_item) == 0) 4282 return 1; 4283 4284 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 4285 return 1; 4286 } 4287 return 0; 4288 } 4289 4290 static void btrfs_dentry_release(struct dentry *dentry) 4291 { 4292 if (dentry->d_fsdata) 4293 kfree(dentry->d_fsdata); 4294 } 4295 4296 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 4297 unsigned int flags) 4298 { 4299 struct dentry *ret; 4300 4301 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); 4302 if (unlikely(d_need_lookup(dentry))) { 4303 spin_lock(&dentry->d_lock); 4304 dentry->d_flags &= ~DCACHE_NEED_LOOKUP; 4305 spin_unlock(&dentry->d_lock); 4306 } 4307 return ret; 4308 } 4309 4310 unsigned char btrfs_filetype_table[] = { 4311 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 4312 }; 4313 4314 static int btrfs_real_readdir(struct file *filp, void *dirent, 4315 filldir_t filldir) 4316 { 4317 struct inode *inode = filp->f_dentry->d_inode; 4318 struct btrfs_root *root = BTRFS_I(inode)->root; 4319 struct btrfs_item *item; 4320 struct btrfs_dir_item *di; 4321 struct btrfs_key key; 4322 struct btrfs_key found_key; 4323 struct btrfs_path *path; 4324 struct list_head ins_list; 4325 struct list_head del_list; 4326 int ret; 4327 struct extent_buffer *leaf; 4328 int slot; 4329 unsigned char d_type; 4330 int over = 0; 4331 u32 di_cur; 4332 u32 di_total; 4333 u32 di_len; 4334 int key_type = BTRFS_DIR_INDEX_KEY; 4335 char tmp_name[32]; 4336 char *name_ptr; 4337 int name_len; 4338 int is_curr = 0; /* filp->f_pos points to the current index? */ 4339 4340 /* FIXME, use a real flag for deciding about the key type */ 4341 if (root->fs_info->tree_root == root) 4342 key_type = BTRFS_DIR_ITEM_KEY; 4343 4344 /* special case for "." 
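 (an added note, not original
 * text: "." and ".." are not stored as dir items in the btree, so
 * they are synthesized here at f_pos 0 and 1 before the tree walk)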
*/ 4345 if (filp->f_pos == 0) { 4346 over = filldir(dirent, ".", 1, 4347 filp->f_pos, btrfs_ino(inode), DT_DIR); 4348 if (over) 4349 return 0; 4350 filp->f_pos = 1; 4351 } 4352 /* special case for .., just use the back ref */ 4353 if (filp->f_pos == 1) { 4354 u64 pino = parent_ino(filp->f_path.dentry); 4355 over = filldir(dirent, "..", 2, 4356 filp->f_pos, pino, DT_DIR); 4357 if (over) 4358 return 0; 4359 filp->f_pos = 2; 4360 } 4361 path = btrfs_alloc_path(); 4362 if (!path) 4363 return -ENOMEM; 4364 4365 path->reada = 1; 4366 4367 if (key_type == BTRFS_DIR_INDEX_KEY) { 4368 INIT_LIST_HEAD(&ins_list); 4369 INIT_LIST_HEAD(&del_list); 4370 btrfs_get_delayed_items(inode, &ins_list, &del_list); 4371 } 4372 4373 btrfs_set_key_type(&key, key_type); 4374 key.offset = filp->f_pos; 4375 key.objectid = btrfs_ino(inode); 4376 4377 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4378 if (ret < 0) 4379 goto err; 4380 4381 while (1) { 4382 leaf = path->nodes[0]; 4383 slot = path->slots[0]; 4384 if (slot >= btrfs_header_nritems(leaf)) { 4385 ret = btrfs_next_leaf(root, path); 4386 if (ret < 0) 4387 goto err; 4388 else if (ret > 0) 4389 break; 4390 continue; 4391 } 4392 4393 item = btrfs_item_nr(leaf, slot); 4394 btrfs_item_key_to_cpu(leaf, &found_key, slot); 4395 4396 if (found_key.objectid != key.objectid) 4397 break; 4398 if (btrfs_key_type(&found_key) != key_type) 4399 break; 4400 if (found_key.offset < filp->f_pos) 4401 goto next; 4402 if (key_type == BTRFS_DIR_INDEX_KEY && 4403 btrfs_should_delete_dir_index(&del_list, 4404 found_key.offset)) 4405 goto next; 4406 4407 filp->f_pos = found_key.offset; 4408 is_curr = 1; 4409 4410 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 4411 di_cur = 0; 4412 di_total = btrfs_item_size(leaf, item); 4413 4414 while (di_cur < di_total) { 4415 struct btrfs_key location; 4416 4417 if (verify_dir_item(root, leaf, di)) 4418 break; 4419 4420 name_len = btrfs_dir_name_len(leaf, di); 4421 if (name_len <= sizeof(tmp_name)) { 4422 name_ptr = tmp_name; 4423 } else { 4424 name_ptr = kmalloc(name_len, GFP_NOFS); 4425 if (!name_ptr) { 4426 ret = -ENOMEM; 4427 goto err; 4428 } 4429 } 4430 read_extent_buffer(leaf, name_ptr, 4431 (unsigned long)(di + 1), name_len); 4432 4433 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; 4434 btrfs_dir_item_key_to_cpu(leaf, di, &location); 4435 4436 4437 /* is this a reference to our own snapshot? If so 4438 * skip it. 4439 * 4440 * In contrast to old kernels, we insert the snapshot's 4441 * dir item and dir index after it has been created, so 4442 * we won't find a reference to our own snapshot. We 4443 * still keep the following code for backward 4444 * compatibility. 4445 */ 4446 if (location.type == BTRFS_ROOT_ITEM_KEY && 4447 location.objectid == root->root_key.objectid) { 4448 over = 0; 4449 goto skip; 4450 } 4451 over = filldir(dirent, name_ptr, name_len, 4452 found_key.offset, location.objectid, 4453 d_type); 4454 4455 skip: 4456 if (name_ptr != tmp_name) 4457 kfree(name_ptr); 4458 4459 if (over) 4460 goto nopos; 4461 di_len = btrfs_dir_name_len(leaf, di) + 4462 btrfs_dir_data_len(leaf, di) + sizeof(*di); 4463 di_cur += di_len; 4464 di = (struct btrfs_dir_item *)((char *)di + di_len); 4465 } 4466 next: 4467 path->slots[0]++; 4468 } 4469 4470 if (key_type == BTRFS_DIR_INDEX_KEY) { 4471 if (is_curr) 4472 filp->f_pos++; 4473 ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir, 4474 &ins_list); 4475 if (ret) 4476 goto nopos; 4477 } 4478 4479 /* Reached end of directory/root. Bump pos past the last item. 
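 *
 * (An added note, not original text: the btree search above starts at
 * key.offset = f_pos, so leaving f_pos at the last returned index
 * would hand the final entry back again on the next getdents call.)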
*/ 4480 if (key_type == BTRFS_DIR_INDEX_KEY) 4481 /* 4482 * 32-bit glibc will use getdents64, but then strtol - 4483 * so the last number we can serve is this. 4484 */ 4485 filp->f_pos = 0x7fffffff; 4486 else 4487 filp->f_pos++; 4488 nopos: 4489 ret = 0; 4490 err: 4491 if (key_type == BTRFS_DIR_INDEX_KEY) 4492 btrfs_put_delayed_items(&ins_list, &del_list); 4493 btrfs_free_path(path); 4494 return ret; 4495 } 4496 4497 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) 4498 { 4499 struct btrfs_root *root = BTRFS_I(inode)->root; 4500 struct btrfs_trans_handle *trans; 4501 int ret = 0; 4502 bool nolock = false; 4503 4504 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 4505 return 0; 4506 4507 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode)) 4508 nolock = true; 4509 4510 if (wbc->sync_mode == WB_SYNC_ALL) { 4511 if (nolock) 4512 trans = btrfs_join_transaction_nolock(root); 4513 else 4514 trans = btrfs_join_transaction(root); 4515 if (IS_ERR(trans)) 4516 return PTR_ERR(trans); 4517 ret = btrfs_commit_transaction(trans, root); 4518 } 4519 return ret; 4520 } 4521 4522 /* 4523 * This is somewhat expensive, updating the tree every time the 4524 * inode changes. But, it is most likely to find the inode in cache. 4525 * FIXME, needs more benchmarking...there are no reasons other than performance 4526 * to keep or drop this code. 4527 */ 4528 int btrfs_dirty_inode(struct inode *inode) 4529 { 4530 struct btrfs_root *root = BTRFS_I(inode)->root; 4531 struct btrfs_trans_handle *trans; 4532 int ret; 4533 4534 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 4535 return 0; 4536 4537 trans = btrfs_join_transaction(root); 4538 if (IS_ERR(trans)) 4539 return PTR_ERR(trans); 4540 4541 ret = btrfs_update_inode(trans, root, inode); 4542 if (ret && ret == -ENOSPC) { 4543 /* whoops, lets try again with the full transaction */ 4544 btrfs_end_transaction(trans, root); 4545 trans = btrfs_start_transaction(root, 1); 4546 if (IS_ERR(trans)) 4547 return PTR_ERR(trans); 4548 4549 ret = btrfs_update_inode(trans, root, inode); 4550 } 4551 btrfs_end_transaction(trans, root); 4552 if (BTRFS_I(inode)->delayed_node) 4553 btrfs_balance_delayed_items(root); 4554 4555 return ret; 4556 } 4557 4558 /* 4559 * This is a copy of file_update_time. We need this so we can return error on 4560 * ENOSPC for updating the inode in the case of file write and mmap writes. 
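 *
 * (Assumed rationale, not original text: the generic helper ignores
 * failures from marking the inode dirty, so an ENOSPC hit while
 * reserving metadata for the timestamp update would be lost; routing
 * it through btrfs_dirty_inode() lets the error reach the caller.)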
4561 */ 4562 static int btrfs_update_time(struct inode *inode, struct timespec *now, 4563 int flags) 4564 { 4565 struct btrfs_root *root = BTRFS_I(inode)->root; 4566 4567 if (btrfs_root_readonly(root)) 4568 return -EROFS; 4569 4570 if (flags & S_VERSION) 4571 inode_inc_iversion(inode); 4572 if (flags & S_CTIME) 4573 inode->i_ctime = *now; 4574 if (flags & S_MTIME) 4575 inode->i_mtime = *now; 4576 if (flags & S_ATIME) 4577 inode->i_atime = *now; 4578 return btrfs_dirty_inode(inode); 4579 } 4580 4581 /* 4582 * find the highest existing sequence number in a directory 4583 * and then set the in-memory index_cnt variable to reflect 4584 * free sequence numbers 4585 */ 4586 static int btrfs_set_inode_index_count(struct inode *inode) 4587 { 4588 struct btrfs_root *root = BTRFS_I(inode)->root; 4589 struct btrfs_key key, found_key; 4590 struct btrfs_path *path; 4591 struct extent_buffer *leaf; 4592 int ret; 4593 4594 key.objectid = btrfs_ino(inode); 4595 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 4596 key.offset = (u64)-1; 4597 4598 path = btrfs_alloc_path(); 4599 if (!path) 4600 return -ENOMEM; 4601 4602 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4603 if (ret < 0) 4604 goto out; 4605 /* FIXME: we should be able to handle this */ 4606 if (ret == 0) 4607 goto out; 4608 ret = 0; 4609 4610 /* 4611 * MAGIC NUMBER EXPLANATION: 4612 * since we search a directory based on f_pos we have to start at 2 4613 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody 4614 * else has to start at 2 4615 */ 4616 if (path->slots[0] == 0) { 4617 BTRFS_I(inode)->index_cnt = 2; 4618 goto out; 4619 } 4620 4621 path->slots[0]--; 4622 4623 leaf = path->nodes[0]; 4624 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4625 4626 if (found_key.objectid != btrfs_ino(inode) || 4627 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { 4628 BTRFS_I(inode)->index_cnt = 2; 4629 goto out; 4630 } 4631 4632 BTRFS_I(inode)->index_cnt = found_key.offset + 1; 4633 out: 4634 btrfs_free_path(path); 4635 return ret; 4636 } 4637 4638 /* 4639 * helper to find a free sequence number in a given directory. This current 4640 * code is very simple, later versions will do smarter things in the btree 4641 */ 4642 int btrfs_set_inode_index(struct inode *dir, u64 *index) 4643 { 4644 int ret = 0; 4645 4646 if (BTRFS_I(dir)->index_cnt == (u64)-1) { 4647 ret = btrfs_inode_delayed_dir_index_count(dir); 4648 if (ret) { 4649 ret = btrfs_set_inode_index_count(dir); 4650 if (ret) 4651 return ret; 4652 } 4653 } 4654 4655 *index = BTRFS_I(dir)->index_cnt; 4656 BTRFS_I(dir)->index_cnt++; 4657 4658 return ret; 4659 } 4660 4661 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 4662 struct btrfs_root *root, 4663 struct inode *dir, 4664 const char *name, int name_len, 4665 u64 ref_objectid, u64 objectid, 4666 umode_t mode, u64 *index) 4667 { 4668 struct inode *inode; 4669 struct btrfs_inode_item *inode_item; 4670 struct btrfs_key *location; 4671 struct btrfs_path *path; 4672 struct btrfs_inode_ref *ref; 4673 struct btrfs_key key[2]; 4674 u32 sizes[2]; 4675 unsigned long ptr; 4676 int ret; 4677 int owner; 4678 4679 path = btrfs_alloc_path(); 4680 if (!path) 4681 return ERR_PTR(-ENOMEM); 4682 4683 inode = new_inode(root->fs_info->sb); 4684 if (!inode) { 4685 btrfs_free_path(path); 4686 return ERR_PTR(-ENOMEM); 4687 } 4688 4689 /* 4690 * we have to initialize this early, so we can reclaim the inode 4691 * number if we fail afterwards in this function. 
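 * (An added cross-reference, not original text: see the fail: label
 * at the bottom of this function, where the directory's index_cnt is
 * rolled back and the half-built inode is dropped with iput.)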
4692 */ 4693 inode->i_ino = objectid; 4694 4695 if (dir) { 4696 trace_btrfs_inode_request(dir); 4697 4698 ret = btrfs_set_inode_index(dir, index); 4699 if (ret) { 4700 btrfs_free_path(path); 4701 iput(inode); 4702 return ERR_PTR(ret); 4703 } 4704 } 4705 /* 4706 * index_cnt is ignored for everything but a dir, 4707 * btrfs_set_inode_index_count has an explanation for the magic 4708 * number 4709 */ 4710 BTRFS_I(inode)->index_cnt = 2; 4711 BTRFS_I(inode)->root = root; 4712 BTRFS_I(inode)->generation = trans->transid; 4713 inode->i_generation = BTRFS_I(inode)->generation; 4714 4715 /* 4716 * We could have gotten an inode number from somebody who was fsynced 4717 * and then removed in this same transaction, so let's just set full 4718 * sync since it will be a full sync anyway and this will blow away the 4719 * old info in the log. 4720 */ 4721 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 4722 4723 if (S_ISDIR(mode)) 4724 owner = 0; 4725 else 4726 owner = 1; 4727 4728 key[0].objectid = objectid; 4729 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 4730 key[0].offset = 0; 4731 4732 /* 4733 * Start new inodes with an inode_ref. This is slightly more 4734 * efficient for small numbers of hard links since they will 4735 * be packed into one item. Extended refs will kick in if we 4736 * add more hard links than can fit in the ref item. 4737 */ 4738 key[1].objectid = objectid; 4739 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 4740 key[1].offset = ref_objectid; 4741 4742 sizes[0] = sizeof(struct btrfs_inode_item); 4743 sizes[1] = name_len + sizeof(*ref); 4744 4745 path->leave_spinning = 1; 4746 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 4747 if (ret != 0) 4748 goto fail; 4749 4750 inode_init_owner(inode, dir, mode); 4751 inode_set_bytes(inode, 0); 4752 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4753 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4754 struct btrfs_inode_item); 4755 memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, 4756 sizeof(*inode_item)); 4757 fill_inode_item(trans, path->nodes[0], inode_item, inode); 4758 4759 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 4760 struct btrfs_inode_ref); 4761 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 4762 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 4763 ptr = (unsigned long)(ref + 1); 4764 write_extent_buffer(path->nodes[0], name, ptr, name_len); 4765 4766 btrfs_mark_buffer_dirty(path->nodes[0]); 4767 btrfs_free_path(path); 4768 4769 location = &BTRFS_I(inode)->location; 4770 location->objectid = objectid; 4771 location->offset = 0; 4772 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 4773 4774 btrfs_inherit_iflags(inode, dir); 4775 4776 if (S_ISREG(mode)) { 4777 if (btrfs_test_opt(root, NODATASUM)) 4778 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4779 if (btrfs_test_opt(root, NODATACOW) || 4780 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) 4781 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4782 } 4783 4784 insert_inode_hash(inode); 4785 inode_tree_add(inode); 4786 4787 trace_btrfs_inode_new(inode); 4788 btrfs_set_inode_last_trans(trans, inode); 4789 4790 btrfs_update_root_times(trans, root); 4791 4792 return inode; 4793 fail: 4794 if (dir) 4795 BTRFS_I(dir)->index_cnt--; 4796 btrfs_free_path(path); 4797 iput(inode); 4798 return ERR_PTR(ret); 4799 } 4800 4801 static inline u8 btrfs_inode_type(struct inode *inode) 4802 { 4803 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; 4804
} 4805 4806 /* 4807 * utility function to add 'inode' into 'parent_inode' with 4808 * a given name and a given sequence number. 4809 * if 'add_backref' is true, also insert a backref from the 4810 * inode to the parent directory. 4811 */ 4812 int btrfs_add_link(struct btrfs_trans_handle *trans, 4813 struct inode *parent_inode, struct inode *inode, 4814 const char *name, int name_len, int add_backref, u64 index) 4815 { 4816 int ret = 0; 4817 struct btrfs_key key; 4818 struct btrfs_root *root = BTRFS_I(parent_inode)->root; 4819 u64 ino = btrfs_ino(inode); 4820 u64 parent_ino = btrfs_ino(parent_inode); 4821 4822 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 4823 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); 4824 } else { 4825 key.objectid = ino; 4826 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 4827 key.offset = 0; 4828 } 4829 4830 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 4831 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 4832 key.objectid, root->root_key.objectid, 4833 parent_ino, index, name, name_len); 4834 } else if (add_backref) { 4835 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, 4836 parent_ino, index); 4837 } 4838 4839 /* Nothing to clean up yet */ 4840 if (ret) 4841 return ret; 4842 4843 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4844 parent_inode, &key, 4845 btrfs_inode_type(inode), index); 4846 if (ret == -EEXIST) 4847 goto fail_dir_item; 4848 else if (ret) { 4849 btrfs_abort_transaction(trans, root, ret); 4850 return ret; 4851 } 4852 4853 btrfs_i_size_write(parent_inode, parent_inode->i_size + 4854 name_len * 2); 4855 inode_inc_iversion(parent_inode); 4856 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 4857 ret = btrfs_update_inode(trans, root, parent_inode); 4858 if (ret) 4859 btrfs_abort_transaction(trans, root, ret); 4860 return ret; 4861 4862 fail_dir_item: 4863 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 4864 u64 local_index; 4865 int err; 4866 err = btrfs_del_root_ref(trans, root->fs_info->tree_root, 4867 key.objectid, root->root_key.objectid, 4868 parent_ino, &local_index, name, name_len); 4869 4870 } else if (add_backref) { 4871 u64 local_index; 4872 int err; 4873 4874 err = btrfs_del_inode_ref(trans, root, name, name_len, 4875 ino, parent_ino, &local_index); 4876 } 4877 return ret; 4878 } 4879 4880 static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 4881 struct inode *dir, struct dentry *dentry, 4882 struct inode *inode, int backref, u64 index) 4883 { 4884 int err = btrfs_add_link(trans, dir, inode, 4885 dentry->d_name.name, dentry->d_name.len, 4886 backref, index); 4887 if (err > 0) 4888 err = -EEXIST; 4889 return err; 4890 } 4891 4892 static int btrfs_mknod(struct inode *dir, struct dentry *dentry, 4893 umode_t mode, dev_t rdev) 4894 { 4895 struct btrfs_trans_handle *trans; 4896 struct btrfs_root *root = BTRFS_I(dir)->root; 4897 struct inode *inode = NULL; 4898 int err; 4899 int drop_inode = 0; 4900 u64 objectid; 4901 unsigned long nr = 0; 4902 u64 index = 0; 4903 4904 if (!new_valid_dev(rdev)) 4905 return -EINVAL; 4906 4907 /* 4908 * 2 for inode item and ref 4909 * 2 for dir items 4910 * 1 for xattr if selinux is on 4911 */ 4912 trans = btrfs_start_transaction(root, 5); 4913 if (IS_ERR(trans)) 4914 return PTR_ERR(trans); 4915 4916 err = btrfs_find_free_ino(root, &objectid); 4917 if (err) 4918 goto out_unlock; 4919 4920 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4921 dentry->d_name.len, btrfs_ino(dir), objectid, 4922 mode, &index); 4923 if
(IS_ERR(inode)) { 4924 err = PTR_ERR(inode); 4925 goto out_unlock; 4926 } 4927 4928 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 4929 if (err) { 4930 drop_inode = 1; 4931 goto out_unlock; 4932 } 4933 4934 /* 4935 * If the active LSM wants to access the inode during 4936 * d_instantiate it needs these. Smack checks to see 4937 * if the filesystem supports xattrs by looking at the 4938 * ops vector. 4939 */ 4940 4941 inode->i_op = &btrfs_special_inode_operations; 4942 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4943 if (err) 4944 drop_inode = 1; 4945 else { 4946 init_special_inode(inode, inode->i_mode, rdev); 4947 btrfs_update_inode(trans, root, inode); 4948 d_instantiate(dentry, inode); 4949 } 4950 out_unlock: 4951 nr = trans->blocks_used; 4952 btrfs_end_transaction(trans, root); 4953 btrfs_btree_balance_dirty(root, nr); 4954 if (drop_inode) { 4955 inode_dec_link_count(inode); 4956 iput(inode); 4957 } 4958 return err; 4959 } 4960 4961 static int btrfs_create(struct inode *dir, struct dentry *dentry, 4962 umode_t mode, bool excl) 4963 { 4964 struct btrfs_trans_handle *trans; 4965 struct btrfs_root *root = BTRFS_I(dir)->root; 4966 struct inode *inode = NULL; 4967 int drop_inode = 0; 4968 int err; 4969 unsigned long nr = 0; 4970 u64 objectid; 4971 u64 index = 0; 4972 4973 /* 4974 * 2 for inode item and ref 4975 * 2 for dir items 4976 * 1 for xattr if selinux is on 4977 */ 4978 trans = btrfs_start_transaction(root, 5); 4979 if (IS_ERR(trans)) 4980 return PTR_ERR(trans); 4981 4982 err = btrfs_find_free_ino(root, &objectid); 4983 if (err) 4984 goto out_unlock; 4985 4986 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4987 dentry->d_name.len, btrfs_ino(dir), objectid, 4988 mode, &index); 4989 if (IS_ERR(inode)) { 4990 err = PTR_ERR(inode); 4991 goto out_unlock; 4992 } 4993 4994 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 4995 if (err) { 4996 drop_inode = 1; 4997 goto out_unlock; 4998 } 4999 5000 /* 5001 * If the active LSM wants to access the inode during 5002 * d_instantiate it needs these. Smack checks to see 5003 * if the filesystem supports xattrs by looking at the 5004 * ops vector. 
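 * (An added note, not original text: this is why i_op and i_fop are
 * assigned immediately below, before d_instantiate() runs in the
 * success branch.)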
5005 */ 5006 inode->i_fop = &btrfs_file_operations; 5007 inode->i_op = &btrfs_file_inode_operations; 5008 5009 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 5010 if (err) 5011 drop_inode = 1; 5012 else { 5013 inode->i_mapping->a_ops = &btrfs_aops; 5014 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5015 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 5016 d_instantiate(dentry, inode); 5017 } 5018 out_unlock: 5019 nr = trans->blocks_used; 5020 btrfs_end_transaction(trans, root); 5021 if (drop_inode) { 5022 inode_dec_link_count(inode); 5023 iput(inode); 5024 } 5025 btrfs_btree_balance_dirty(root, nr); 5026 return err; 5027 } 5028 5029 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 5030 struct dentry *dentry) 5031 { 5032 struct btrfs_trans_handle *trans; 5033 struct btrfs_root *root = BTRFS_I(dir)->root; 5034 struct inode *inode = old_dentry->d_inode; 5035 u64 index; 5036 unsigned long nr = 0; 5037 int err; 5038 int drop_inode = 0; 5039 5040 /* do not allow sys_link's with other subvols of the same device */ 5041 if (root->objectid != BTRFS_I(inode)->root->objectid) 5042 return -EXDEV; 5043 5044 if (inode->i_nlink >= BTRFS_LINK_MAX) 5045 return -EMLINK; 5046 5047 err = btrfs_set_inode_index(dir, &index); 5048 if (err) 5049 goto fail; 5050 5051 /* 5052 * 2 items for inode and inode ref 5053 * 2 items for dir items 5054 * 1 item for parent inode 5055 */ 5056 trans = btrfs_start_transaction(root, 5); 5057 if (IS_ERR(trans)) { 5058 err = PTR_ERR(trans); 5059 goto fail; 5060 } 5061 5062 btrfs_inc_nlink(inode); 5063 inode_inc_iversion(inode); 5064 inode->i_ctime = CURRENT_TIME; 5065 ihold(inode); 5066 5067 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 5068 5069 if (err) { 5070 drop_inode = 1; 5071 } else { 5072 struct dentry *parent = dentry->d_parent; 5073 err = btrfs_update_inode(trans, root, inode); 5074 if (err) 5075 goto fail; 5076 d_instantiate(dentry, inode); 5077 btrfs_log_new_name(trans, inode, NULL, parent); 5078 } 5079 5080 nr = trans->blocks_used; 5081 btrfs_end_transaction(trans, root); 5082 fail: 5083 if (drop_inode) { 5084 inode_dec_link_count(inode); 5085 iput(inode); 5086 } 5087 btrfs_btree_balance_dirty(root, nr); 5088 return err; 5089 } 5090 5091 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 5092 { 5093 struct inode *inode = NULL; 5094 struct btrfs_trans_handle *trans; 5095 struct btrfs_root *root = BTRFS_I(dir)->root; 5096 int err = 0; 5097 int drop_on_err = 0; 5098 u64 objectid = 0; 5099 u64 index = 0; 5100 unsigned long nr = 1; 5101 5102 /* 5103 * 2 items for inode and ref 5104 * 2 items for dir items 5105 * 1 for xattr if selinux is on 5106 */ 5107 trans = btrfs_start_transaction(root, 5); 5108 if (IS_ERR(trans)) 5109 return PTR_ERR(trans); 5110 5111 err = btrfs_find_free_ino(root, &objectid); 5112 if (err) 5113 goto out_fail; 5114 5115 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 5116 dentry->d_name.len, btrfs_ino(dir), objectid, 5117 S_IFDIR | mode, &index); 5118 if (IS_ERR(inode)) { 5119 err = PTR_ERR(inode); 5120 goto out_fail; 5121 } 5122 5123 drop_on_err = 1; 5124 5125 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5126 if (err) 5127 goto out_fail; 5128 5129 inode->i_op = &btrfs_dir_inode_operations; 5130 inode->i_fop = &btrfs_dir_file_operations; 5131 5132 btrfs_i_size_write(inode, 0); 5133 err = btrfs_update_inode(trans, root, inode); 5134 if (err) 5135 goto out_fail; 5136 5137 err = btrfs_add_link(trans, dir, inode, 
dentry->d_name.name, 5138 dentry->d_name.len, 0, index); 5139 if (err) 5140 goto out_fail; 5141 5142 d_instantiate(dentry, inode); 5143 drop_on_err = 0; 5144 5145 out_fail: 5146 nr = trans->blocks_used; 5147 btrfs_end_transaction(trans, root); 5148 if (drop_on_err) 5149 iput(inode); 5150 btrfs_btree_balance_dirty(root, nr); 5151 return err; 5152 } 5153 5154 /* helper for btrfs_get_extent. Given an existing extent in the tree, 5155 * and an extent that you want to insert, deal with overlap and insert 5156 * the new extent into the tree. 5157 */ 5158 static int merge_extent_mapping(struct extent_map_tree *em_tree, 5159 struct extent_map *existing, 5160 struct extent_map *em, 5161 u64 map_start, u64 map_len) 5162 { 5163 u64 start_diff; 5164 5165 BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); 5166 start_diff = map_start - em->start; 5167 em->start = map_start; 5168 em->len = map_len; 5169 if (em->block_start < EXTENT_MAP_LAST_BYTE && 5170 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 5171 em->block_start += start_diff; 5172 em->block_len -= start_diff; 5173 } 5174 return add_extent_mapping(em_tree, em); 5175 } 5176 5177 static noinline int uncompress_inline(struct btrfs_path *path, 5178 struct inode *inode, struct page *page, 5179 size_t pg_offset, u64 extent_offset, 5180 struct btrfs_file_extent_item *item) 5181 { 5182 int ret; 5183 struct extent_buffer *leaf = path->nodes[0]; 5184 char *tmp; 5185 size_t max_size; 5186 unsigned long inline_size; 5187 unsigned long ptr; 5188 int compress_type; 5189 5190 WARN_ON(pg_offset != 0); 5191 compress_type = btrfs_file_extent_compression(leaf, item); 5192 max_size = btrfs_file_extent_ram_bytes(leaf, item); 5193 inline_size = btrfs_file_extent_inline_item_len(leaf, 5194 btrfs_item_nr(leaf, path->slots[0])); 5195 tmp = kmalloc(inline_size, GFP_NOFS); 5196 if (!tmp) 5197 return -ENOMEM; 5198 ptr = btrfs_file_extent_inline_start(item); 5199 5200 read_extent_buffer(leaf, tmp, ptr, inline_size); 5201 5202 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 5203 ret = btrfs_decompress(compress_type, tmp, page, 5204 extent_offset, inline_size, max_size); 5205 if (ret) { 5206 char *kaddr = kmap_atomic(page); 5207 unsigned long copy_size = min_t(u64, 5208 PAGE_CACHE_SIZE - pg_offset, 5209 max_size - extent_offset); 5210 memset(kaddr + pg_offset, 0, copy_size); 5211 kunmap_atomic(kaddr); 5212 } 5213 kfree(tmp); 5214 return 0; 5215 } 5216 5217 /* 5218 * a bit scary, this does extent mapping from logical file offset to the disk. 5219 * the ugly parts come from merging extents from the disk with the in-ram 5220 * representation. This gets more complex because of the data=ordered code, 5221 * where the in-ram extents might be locked pending data=ordered completion. 5222 * 5223 * This also copies inline extents directly into the page.
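 *
 * An illustrative read-side call (a sketch under assumed context, not
 * original source text):
 *
 *	em = btrfs_get_extent(inode, page, 0, start, len, 0);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	(em->block_start is either a disk bytenr or one of the special
 *	 values such as EXTENT_MAP_HOLE or EXTENT_MAP_INLINE)
 *	free_extent_map(em);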
5224 */ 5225 5226 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 5227 size_t pg_offset, u64 start, u64 len, 5228 int create) 5229 { 5230 int ret; 5231 int err = 0; 5232 u64 bytenr; 5233 u64 extent_start = 0; 5234 u64 extent_end = 0; 5235 u64 objectid = btrfs_ino(inode); 5236 u32 found_type; 5237 struct btrfs_path *path = NULL; 5238 struct btrfs_root *root = BTRFS_I(inode)->root; 5239 struct btrfs_file_extent_item *item; 5240 struct extent_buffer *leaf; 5241 struct btrfs_key found_key; 5242 struct extent_map *em = NULL; 5243 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5244 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5245 struct btrfs_trans_handle *trans = NULL; 5246 int compress_type; 5247 5248 again: 5249 read_lock(&em_tree->lock); 5250 em = lookup_extent_mapping(em_tree, start, len); 5251 if (em) 5252 em->bdev = root->fs_info->fs_devices->latest_bdev; 5253 read_unlock(&em_tree->lock); 5254 5255 if (em) { 5256 if (em->start > start || em->start + em->len <= start) 5257 free_extent_map(em); 5258 else if (em->block_start == EXTENT_MAP_INLINE && page) 5259 free_extent_map(em); 5260 else 5261 goto out; 5262 } 5263 em = alloc_extent_map(); 5264 if (!em) { 5265 err = -ENOMEM; 5266 goto out; 5267 } 5268 em->bdev = root->fs_info->fs_devices->latest_bdev; 5269 em->start = EXTENT_MAP_HOLE; 5270 em->orig_start = EXTENT_MAP_HOLE; 5271 em->len = (u64)-1; 5272 em->block_len = (u64)-1; 5273 5274 if (!path) { 5275 path = btrfs_alloc_path(); 5276 if (!path) { 5277 err = -ENOMEM; 5278 goto out; 5279 } 5280 /* 5281 * Chances are we'll be called again, so go ahead and do 5282 * readahead 5283 */ 5284 path->reada = 1; 5285 } 5286 5287 ret = btrfs_lookup_file_extent(trans, root, path, 5288 objectid, start, trans != NULL); 5289 if (ret < 0) { 5290 err = ret; 5291 goto out; 5292 } 5293 5294 if (ret != 0) { 5295 if (path->slots[0] == 0) 5296 goto not_found; 5297 path->slots[0]--; 5298 } 5299 5300 leaf = path->nodes[0]; 5301 item = btrfs_item_ptr(leaf, path->slots[0], 5302 struct btrfs_file_extent_item); 5303 /* are we inside the extent that was found? 
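 *
 * (Added clarification, not original text: found_key comes from the
 * slot btrfs_lookup_file_extent() left us on; the checks below reject
 * keys from another inode or of a non EXTENT_DATA type before any
 * offset math is trusted.)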
*/ 5304 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5305 found_type = btrfs_key_type(&found_key); 5306 if (found_key.objectid != objectid || 5307 found_type != BTRFS_EXTENT_DATA_KEY) { 5308 goto not_found; 5309 } 5310 5311 found_type = btrfs_file_extent_type(leaf, item); 5312 extent_start = found_key.offset; 5313 compress_type = btrfs_file_extent_compression(leaf, item); 5314 if (found_type == BTRFS_FILE_EXTENT_REG || 5315 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5316 extent_end = extent_start + 5317 btrfs_file_extent_num_bytes(leaf, item); 5318 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 5319 size_t size; 5320 size = btrfs_file_extent_inline_len(leaf, item); 5321 extent_end = (extent_start + size + root->sectorsize - 1) & 5322 ~((u64)root->sectorsize - 1); 5323 } 5324 5325 if (start >= extent_end) { 5326 path->slots[0]++; 5327 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 5328 ret = btrfs_next_leaf(root, path); 5329 if (ret < 0) { 5330 err = ret; 5331 goto out; 5332 } 5333 if (ret > 0) 5334 goto not_found; 5335 leaf = path->nodes[0]; 5336 } 5337 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5338 if (found_key.objectid != objectid || 5339 found_key.type != BTRFS_EXTENT_DATA_KEY) 5340 goto not_found; 5341 if (start + len <= found_key.offset) 5342 goto not_found; 5343 em->start = start; 5344 em->len = found_key.offset - start; 5345 goto not_found_em; 5346 } 5347 5348 if (found_type == BTRFS_FILE_EXTENT_REG || 5349 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5350 em->start = extent_start; 5351 em->len = extent_end - extent_start; 5352 em->orig_start = extent_start - 5353 btrfs_file_extent_offset(leaf, item); 5354 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 5355 if (bytenr == 0) { 5356 em->block_start = EXTENT_MAP_HOLE; 5357 goto insert; 5358 } 5359 if (compress_type != BTRFS_COMPRESS_NONE) { 5360 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5361 em->compress_type = compress_type; 5362 em->block_start = bytenr; 5363 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5364 item); 5365 } else { 5366 bytenr += btrfs_file_extent_offset(leaf, item); 5367 em->block_start = bytenr; 5368 em->block_len = em->len; 5369 if (found_type == BTRFS_FILE_EXTENT_PREALLOC) 5370 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 5371 } 5372 goto insert; 5373 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 5374 unsigned long ptr; 5375 char *map; 5376 size_t size; 5377 size_t extent_offset; 5378 size_t copy_size; 5379 5380 em->block_start = EXTENT_MAP_INLINE; 5381 if (!page || create) { 5382 em->start = extent_start; 5383 em->len = extent_end - extent_start; 5384 goto out; 5385 } 5386 5387 size = btrfs_file_extent_inline_len(leaf, item); 5388 extent_offset = page_offset(page) + pg_offset - extent_start; 5389 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 5390 size - extent_offset); 5391 em->start = extent_start + extent_offset; 5392 em->len = (copy_size + root->sectorsize - 1) & 5393 ~((u64)root->sectorsize - 1); 5394 em->orig_start = EXTENT_MAP_INLINE; 5395 if (compress_type) { 5396 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5397 em->compress_type = compress_type; 5398 } 5399 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 5400 if (create == 0 && !PageUptodate(page)) { 5401 if (btrfs_file_extent_compression(leaf, item) != 5402 BTRFS_COMPRESS_NONE) { 5403 ret = uncompress_inline(path, inode, page, 5404 pg_offset, 5405 extent_offset, item); 5406 BUG_ON(ret); /* -ENOMEM */ 5407 } else { 5408 map = kmap(page); 5409 read_extent_buffer(leaf, map + 
pg_offset, ptr, 5410 copy_size); 5411 if (pg_offset + copy_size < PAGE_CACHE_SIZE) { 5412 memset(map + pg_offset + copy_size, 0, 5413 PAGE_CACHE_SIZE - pg_offset - 5414 copy_size); 5415 } 5416 kunmap(page); 5417 } 5418 flush_dcache_page(page); 5419 } else if (create && PageUptodate(page)) { 5420 BUG(); 5421 if (!trans) { 5422 kunmap(page); 5423 free_extent_map(em); 5424 em = NULL; 5425 5426 btrfs_release_path(path); 5427 trans = btrfs_join_transaction(root); 5428 5429 if (IS_ERR(trans)) 5430 return ERR_CAST(trans); 5431 goto again; 5432 } 5433 map = kmap(page); 5434 write_extent_buffer(leaf, map + pg_offset, ptr, 5435 copy_size); 5436 kunmap(page); 5437 btrfs_mark_buffer_dirty(leaf); 5438 } 5439 set_extent_uptodate(io_tree, em->start, 5440 extent_map_end(em) - 1, NULL, GFP_NOFS); 5441 goto insert; 5442 } else { 5443 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5444 WARN_ON(1); 5445 } 5446 not_found: 5447 em->start = start; 5448 em->len = len; 5449 not_found_em: 5450 em->block_start = EXTENT_MAP_HOLE; 5451 set_bit(EXTENT_FLAG_VACANCY, &em->flags); 5452 insert: 5453 btrfs_release_path(path); 5454 if (em->start > start || extent_map_end(em) <= start) { 5455 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " 5456 "[%llu %llu]\n", (unsigned long long)em->start, 5457 (unsigned long long)em->len, 5458 (unsigned long long)start, 5459 (unsigned long long)len); 5460 err = -EIO; 5461 goto out; 5462 } 5463 5464 err = 0; 5465 write_lock(&em_tree->lock); 5466 ret = add_extent_mapping(em_tree, em); 5467 /* it is possible that someone inserted the extent into the tree 5468 * while we had the lock dropped. It is also possible that 5469 * an overlapping map exists in the tree 5470 */ 5471 if (ret == -EEXIST) { 5472 struct extent_map *existing; 5473 5474 ret = 0; 5475 5476 existing = lookup_extent_mapping(em_tree, start, len); 5477 if (existing && (existing->start > start || 5478 existing->start + existing->len <= start)) { 5479 free_extent_map(existing); 5480 existing = NULL; 5481 } 5482 if (!existing) { 5483 existing = lookup_extent_mapping(em_tree, em->start, 5484 em->len); 5485 if (existing) { 5486 err = merge_extent_mapping(em_tree, existing, 5487 em, start, 5488 root->sectorsize); 5489 free_extent_map(existing); 5490 if (err) { 5491 free_extent_map(em); 5492 em = NULL; 5493 } 5494 } else { 5495 err = -EIO; 5496 free_extent_map(em); 5497 em = NULL; 5498 } 5499 } else { 5500 free_extent_map(em); 5501 em = existing; 5502 err = 0; 5503 } 5504 } 5505 write_unlock(&em_tree->lock); 5506 out: 5507 5508 if (em) 5509 trace_btrfs_get_extent(root, em); 5510 5511 if (path) 5512 btrfs_free_path(path); 5513 if (trans) { 5514 ret = btrfs_end_transaction(trans, root); 5515 if (!err) 5516 err = ret; 5517 } 5518 if (err) { 5519 free_extent_map(em); 5520 return ERR_PTR(err); 5521 } 5522 BUG_ON(!em); /* Error is always set */ 5523 return em; 5524 } 5525 5526 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 5527 size_t pg_offset, u64 start, u64 len, 5528 int create) 5529 { 5530 struct extent_map *em; 5531 struct extent_map *hole_em = NULL; 5532 u64 range_start = start; 5533 u64 end; 5534 u64 found; 5535 u64 found_end; 5536 int err = 0; 5537 5538 em = btrfs_get_extent(inode, page, pg_offset, start, len, create); 5539 if (IS_ERR(em)) 5540 return em; 5541 if (em) { 5542 /* 5543 * if our em maps to a hole, there might 5544 * actually be delalloc bytes behind it 5545 */ 5546 if (em->block_start != EXTENT_MAP_HOLE) 5547 return em; 5548 else 5549 hole_em = em; 5550 } 
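	/*
	 * (Added summary, not in the original: from here on, em is
	 * either NULL or a hole kept in hole_em; the io_tree is probed
	 * for delalloc bytes that may be hiding behind that hole.)
	 */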
5551 5552 /* check to see if we've wrapped (len == -1 or similar) */ 5553 end = start + len; 5554 if (end < start) 5555 end = (u64)-1; 5556 else 5557 end -= 1; 5558 5559 em = NULL; 5560 5561 /* ok, we didn't find anything, let's look for delalloc */ 5562 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, 5563 end, len, EXTENT_DELALLOC, 1); 5564 found_end = range_start + found; 5565 if (found_end < range_start) 5566 found_end = (u64)-1; 5567 5568 /* 5569 * we didn't find anything useful, return 5570 * the original results from get_extent() 5571 */ 5572 if (range_start > end || found_end <= start) { 5573 em = hole_em; 5574 hole_em = NULL; 5575 goto out; 5576 } 5577 5578 /* adjust the range_start to make sure it doesn't 5579 * go backwards from the start they passed in 5580 */ 5581 range_start = max(start, range_start); 5582 found = found_end - range_start; 5583 5584 if (found > 0) { 5585 u64 hole_start = start; 5586 u64 hole_len = len; 5587 5588 em = alloc_extent_map(); 5589 if (!em) { 5590 err = -ENOMEM; 5591 goto out; 5592 } 5593 /* 5594 * when btrfs_get_extent can't find anything it 5595 * returns one huge hole 5596 * 5597 * make sure what it found really fits our range, and 5598 * adjust to make sure it is based on the start from 5599 * the caller 5600 */ 5601 if (hole_em) { 5602 u64 calc_end = extent_map_end(hole_em); 5603 5604 if (calc_end <= start || (hole_em->start > end)) { 5605 free_extent_map(hole_em); 5606 hole_em = NULL; 5607 } else { 5608 hole_start = max(hole_em->start, start); 5609 hole_len = calc_end - hole_start; 5610 } 5611 } 5612 em->bdev = NULL; 5613 if (hole_em && range_start > hole_start) { 5614 /* our hole starts before our delalloc, so we 5615 * have to return just the parts of the hole 5616 * that go until the delalloc starts 5617 */ 5618 em->len = min(hole_len, 5619 range_start - hole_start); 5620 em->start = hole_start; 5621 em->orig_start = hole_start; 5622 /* 5623 * don't adjust block start at all, 5624 * it is fixed at EXTENT_MAP_HOLE 5625 */ 5626 em->block_start = hole_em->block_start; 5627 em->block_len = hole_len; 5628 } else { 5629 em->start = range_start; 5630 em->len = found; 5631 em->orig_start = range_start; 5632 em->block_start = EXTENT_MAP_DELALLOC; 5633 em->block_len = found; 5634 } 5635 } else if (hole_em) { 5636 return hole_em; 5637 } 5638 out: 5639 5640 free_extent_map(hole_em); 5641 if (err) { 5642 free_extent_map(em); 5643 return ERR_PTR(err); 5644 } 5645 return em; 5646 } 5647 5648 static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5649 struct extent_map *em, 5650 u64 start, u64 len) 5651 { 5652 struct btrfs_root *root = BTRFS_I(inode)->root; 5653 struct btrfs_trans_handle *trans; 5654 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5655 struct btrfs_key ins; 5656 u64 alloc_hint; 5657 int ret; 5658 bool insert = false; 5659 5660 /* 5661 * Ok if the extent map we looked up is a hole and is for the exact 5662 * range we want, there is no reason to allocate a new one; however, if 5663 * it is not right then we need to free this one and drop the cache for 5664 * our range.
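 * ('not right' meaning not a hole at all, or a hole that does not
 * cover exactly [start, start + len); compare the three-way test just
 * below. An added gloss, not original text.)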
5665 */ 5666 if (em->block_start != EXTENT_MAP_HOLE || em->start != start || 5667 em->len != len) { 5668 free_extent_map(em); 5669 em = NULL; 5670 insert = true; 5671 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5672 } 5673 5674 trans = btrfs_join_transaction(root); 5675 if (IS_ERR(trans)) 5676 return ERR_CAST(trans); 5677 5678 if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) 5679 btrfs_add_inode_defrag(trans, inode); 5680 5681 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5682 5683 alloc_hint = get_extent_allocation_hint(inode, start, len); 5684 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, 5685 alloc_hint, &ins, 1); 5686 if (ret) { 5687 em = ERR_PTR(ret); 5688 goto out; 5689 } 5690 5691 if (!em) { 5692 em = alloc_extent_map(); 5693 if (!em) { 5694 em = ERR_PTR(-ENOMEM); 5695 goto out; 5696 } 5697 } 5698 5699 em->start = start; 5700 em->orig_start = em->start; 5701 em->len = ins.offset; 5702 5703 em->block_start = ins.objectid; 5704 em->block_len = ins.offset; 5705 em->bdev = root->fs_info->fs_devices->latest_bdev; 5706 5707 /* 5708 * We need to do this because if we're using the original em we searched 5709 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. 5710 */ 5711 em->flags = 0; 5712 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5713 5714 while (insert) { 5715 write_lock(&em_tree->lock); 5716 ret = add_extent_mapping(em_tree, em); 5717 write_unlock(&em_tree->lock); 5718 if (ret != -EEXIST) 5719 break; 5720 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); 5721 } 5722 5723 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 5724 ins.offset, ins.offset, 0); 5725 if (ret) { 5726 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 5727 em = ERR_PTR(ret); 5728 } 5729 out: 5730 btrfs_end_transaction(trans, root); 5731 return em; 5732 } 5733 5734 /* 5735 * returns 1 when the nocow is safe, < 1 on error, 0 if the 5736 * block must be cow'd 5737 */ 5738 static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, 5739 struct inode *inode, u64 offset, u64 len) 5740 { 5741 struct btrfs_path *path; 5742 int ret; 5743 struct extent_buffer *leaf; 5744 struct btrfs_root *root = BTRFS_I(inode)->root; 5745 struct btrfs_file_extent_item *fi; 5746 struct btrfs_key key; 5747 u64 disk_bytenr; 5748 u64 backref_offset; 5749 u64 extent_end; 5750 u64 num_bytes; 5751 int slot; 5752 int found_type; 5753 5754 path = btrfs_alloc_path(); 5755 if (!path) 5756 return -ENOMEM; 5757 5758 ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), 5759 offset, 0); 5760 if (ret < 0) 5761 goto out; 5762 5763 slot = path->slots[0]; 5764 if (ret == 1) { 5765 if (slot == 0) { 5766 /* can't find the item, must cow */ 5767 ret = 0; 5768 goto out; 5769 } 5770 slot--; 5771 } 5772 ret = 0; 5773 leaf = path->nodes[0]; 5774 btrfs_item_key_to_cpu(leaf, &key, slot); 5775 if (key.objectid != btrfs_ino(inode) || 5776 key.type != BTRFS_EXTENT_DATA_KEY) { 5777 /* not our file or wrong item type, must cow */ 5778 goto out; 5779 } 5780 5781 if (key.offset > offset) { 5782 /* Wrong offset, must cow */ 5783 goto out; 5784 } 5785 5786 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 5787 found_type = btrfs_file_extent_type(leaf, fi); 5788 if (found_type != BTRFS_FILE_EXTENT_REG && 5789 found_type != BTRFS_FILE_EXTENT_PREALLOC) { 5790 /* not a regular extent, must cow */ 5791 goto out; 5792 } 5793 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 5794 backref_offset = btrfs_file_extent_offset(leaf, 
fi); 5795 5796 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 5797 if (extent_end < offset + len) { 5798 /* extent doesn't include our full range, must cow */ 5799 goto out; 5800 } 5801 5802 if (btrfs_extent_readonly(root, disk_bytenr)) 5803 goto out; 5804 5805 /* 5806 * look for other files referencing this extent, if we 5807 * find any we must cow 5808 */ 5809 if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), 5810 key.offset - backref_offset, disk_bytenr)) 5811 goto out; 5812 5813 /* 5814 * adjust disk_bytenr and num_bytes to cover just the bytes 5815 * in this extent we are about to write. If there 5816 * are any csums in that range we have to cow in order 5817 * to keep the csums correct 5818 */ 5819 disk_bytenr += backref_offset; 5820 disk_bytenr += offset - key.offset; 5821 num_bytes = min(offset + len, extent_end) - offset; 5822 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 5823 goto out; 5824 /* 5825 * all of the above have passed, it is safe to overwrite this extent 5826 * without cow 5827 */ 5828 ret = 1; 5829 out: 5830 btrfs_free_path(path); 5831 return ret; 5832 } 5833 5834 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, 5835 struct extent_state **cached_state, int writing) 5836 { 5837 struct btrfs_ordered_extent *ordered; 5838 int ret = 0; 5839 5840 while (1) { 5841 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 5842 0, cached_state); 5843 /* 5844 * We're concerned with the entire range that we're going to be 5845 * doing DIO to, so we need to make sure there are no ordered 5846 * extents in this range. 5847 */ 5848 ordered = btrfs_lookup_ordered_range(inode, lockstart, 5849 lockend - lockstart + 1); 5850 5851 /* 5852 * We need to make sure there are no buffered pages in this 5853 * range either, we could have raced between the invalidate in 5854 * generic_file_direct_write and locking the extent. The 5855 * invalidate needs to happen so that reads after a write do not 5856 * get stale data. 5857 */ 5858 if (!ordered && (!writing || 5859 !test_range_bit(&BTRFS_I(inode)->io_tree, 5860 lockstart, lockend, EXTENT_UPTODATE, 0, 5861 *cached_state))) 5862 break; 5863 5864 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 5865 cached_state, GFP_NOFS); 5866 5867 if (ordered) { 5868 btrfs_start_ordered_extent(inode, ordered, 1); 5869 btrfs_put_ordered_extent(ordered); 5870 } else { 5871 /* Screw you mmap */ 5872 ret = filemap_write_and_wait_range(inode->i_mapping, 5873 lockstart, 5874 lockend); 5875 if (ret) 5876 break; 5877 5878 /* 5879 * If we found a page that couldn't be invalidated just 5880 * fall back to buffered.
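 * (The caller, btrfs_get_blocks_direct(), turns any error from this
 * function into -ENOTBLK, which the generic direct I/O code takes as
 * its cue to retry through the page cache; added note, not original
 * text.)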
5881 */ 5882 ret = invalidate_inode_pages2_range(inode->i_mapping, 5883 lockstart >> PAGE_CACHE_SHIFT, 5884 lockend >> PAGE_CACHE_SHIFT); 5885 if (ret) 5886 break; 5887 } 5888 5889 cond_resched(); 5890 } 5891 5892 return ret; 5893 } 5894 5895 static struct extent_map *create_pinned_em(struct inode *inode, u64 start, 5896 u64 len, u64 orig_start, 5897 u64 block_start, u64 block_len, 5898 int type) 5899 { 5900 struct extent_map_tree *em_tree; 5901 struct extent_map *em; 5902 struct btrfs_root *root = BTRFS_I(inode)->root; 5903 int ret; 5904 5905 em_tree = &BTRFS_I(inode)->extent_tree; 5906 em = alloc_extent_map(); 5907 if (!em) 5908 return ERR_PTR(-ENOMEM); 5909 5910 em->start = start; 5911 em->orig_start = orig_start; 5912 em->len = len; 5913 em->block_len = block_len; 5914 em->block_start = block_start; 5915 em->bdev = root->fs_info->fs_devices->latest_bdev; 5916 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5917 if (type == BTRFS_ORDERED_PREALLOC) 5918 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 5919 5920 do { 5921 btrfs_drop_extent_cache(inode, em->start, 5922 em->start + em->len - 1, 0); 5923 write_lock(&em_tree->lock); 5924 ret = add_extent_mapping(em_tree, em); 5925 write_unlock(&em_tree->lock); 5926 } while (ret == -EEXIST); 5927 5928 if (ret) { 5929 free_extent_map(em); 5930 return ERR_PTR(ret); 5931 } 5932 5933 return em; 5934 } 5935 5936 5937 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 5938 struct buffer_head *bh_result, int create) 5939 { 5940 struct extent_map *em; 5941 struct btrfs_root *root = BTRFS_I(inode)->root; 5942 struct extent_state *cached_state = NULL; 5943 u64 start = iblock << inode->i_blkbits; 5944 u64 lockstart, lockend; 5945 u64 len = bh_result->b_size; 5946 struct btrfs_trans_handle *trans; 5947 int unlock_bits = EXTENT_LOCKED; 5948 int ret; 5949 5950 if (create) { 5951 ret = btrfs_delalloc_reserve_space(inode, len); 5952 if (ret) 5953 return ret; 5954 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; 5955 } else { 5956 len = min_t(u64, len, root->sectorsize); 5957 } 5958 5959 lockstart = start; 5960 lockend = start + len - 1; 5961 5962 /* 5963 * If this errors out it's because we couldn't invalidate pagecache for 5964 * this range and we need to fall back to buffered. 5965 */ 5966 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) 5967 return -ENOTBLK; 5968 5969 if (create) { 5970 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 5971 lockend, EXTENT_DELALLOC, NULL, 5972 &cached_state, GFP_NOFS); 5973 if (ret) 5974 goto unlock_err; 5975 } 5976 5977 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 5978 if (IS_ERR(em)) { 5979 ret = PTR_ERR(em); 5980 goto unlock_err; 5981 } 5982 5983 /* 5984 * Ok for INLINE and COMPRESSED extents we need to fall back on buffered 5985 * IO. INLINE is special, and we could probably kludge it in here, but 5986 * it's still buffered so for safety let's just fall back to the generic 5987 * buffered path. 5988 * 5989 * For COMPRESSED we _have_ to read the entire extent in so we can 5990 * decompress it, so there will be buffering required no matter what we 5991 * do, so go ahead and fall back to buffered. 5992 * 5993 * We return -ENOTBLK because that's what makes DIO go ahead and go back 5994 * to buffered IO. Don't blame me, this is the price we pay for using 5995 * the generic code.
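 *
 * (Editor's note, not in the original: the generic direct-IO code treats
 * -ENOTBLK from the get_block callback as "cannot do DIO on this range",
 * and the btrfs read/write paths then service the request through the
 * page cache instead.)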
5996 */ 5997 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 5998 em->block_start == EXTENT_MAP_INLINE) { 5999 free_extent_map(em); 6000 ret = -ENOTBLK; 6001 goto unlock_err; 6002 } 6003 6004 /* Just a good old fashioned hole, return */ 6005 if (!create && (em->block_start == EXTENT_MAP_HOLE || 6006 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 6007 free_extent_map(em); 6008 ret = 0; 6009 goto unlock_err; 6010 } 6011 6012 /* 6013 * We don't allocate a new extent in the following cases 6014 * 6015 * 1) The inode is marked as NODATACOW. In this case we'll just use the 6016 * existing extent. 6017 * 2) The extent is marked as PREALLOC. We're good to go here and can 6018 * just use the extent. 6019 * 6020 */ 6021 if (!create) { 6022 len = min(len, em->len - (start - em->start)); 6023 lockstart = start + len; 6024 goto unlock; 6025 } 6026 6027 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 6028 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 6029 em->block_start != EXTENT_MAP_HOLE)) { 6030 int type; 6031 int ret; 6032 u64 block_start; 6033 6034 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 6035 type = BTRFS_ORDERED_PREALLOC; 6036 else 6037 type = BTRFS_ORDERED_NOCOW; 6038 len = min(len, em->len - (start - em->start)); 6039 block_start = em->block_start + (start - em->start); 6040 6041 /* 6042 * we're not going to log anything, but we do need 6043 * to make sure the current transaction stays open 6044 * while we look for nocow cross refs 6045 */ 6046 trans = btrfs_join_transaction(root); 6047 if (IS_ERR(trans)) 6048 goto must_cow; 6049 6050 if (can_nocow_odirect(trans, inode, start, len) == 1) { 6051 u64 orig_start = em->start; 6052 6053 if (type == BTRFS_ORDERED_PREALLOC) { 6054 free_extent_map(em); 6055 em = create_pinned_em(inode, start, len, 6056 orig_start, 6057 block_start, len, type); 6058 if (IS_ERR(em)) { 6059 btrfs_end_transaction(trans, root); 6060 goto unlock_err; 6061 } 6062 } 6063 6064 ret = btrfs_add_ordered_extent_dio(inode, start, 6065 block_start, len, len, type); 6066 btrfs_end_transaction(trans, root); 6067 if (ret) { 6068 free_extent_map(em); 6069 goto unlock_err; 6070 } 6071 goto unlock; 6072 } 6073 btrfs_end_transaction(trans, root); 6074 } 6075 must_cow: 6076 /* 6077 * this will cow the extent, reset the len in case we changed 6078 * it above 6079 */ 6080 len = bh_result->b_size; 6081 em = btrfs_new_extent_direct(inode, em, start, len); 6082 if (IS_ERR(em)) { 6083 ret = PTR_ERR(em); 6084 goto unlock_err; 6085 } 6086 len = min(len, em->len - (start - em->start)); 6087 unlock: 6088 bh_result->b_blocknr = (em->block_start + (start - em->start)) >> 6089 inode->i_blkbits; 6090 bh_result->b_size = len; 6091 bh_result->b_bdev = em->bdev; 6092 set_buffer_mapped(bh_result); 6093 if (create) { 6094 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 6095 set_buffer_new(bh_result); 6096 6097 /* 6098 * Need to update the i_size under the extent lock so buffered 6099 * readers will get the updated i_size when we unlock. 6100 */ 6101 if (start + len > i_size_read(inode)) 6102 i_size_write(inode, start + len); 6103 } 6104 6105 /* 6106 * In the case of write we need to clear and unlock the entire range, 6107 * in the case of read we need to unlock only the end area that we 6108 * aren't using if there is any left over space. 
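 *
 * (Editor's sketch, not in the original, of the two write-case ranges
 * handled just below:
 *
 *	[lockstart, lockstart + len - 1]   clear unlock_bits; IO will happen
 *	[lockstart + len, lockend]         also clear EXTENT_DO_ACCOUNTING,
 *	                                   returning the unused reservation)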
6109 */ 6110 if (lockstart < lockend) { 6111 if (create && len < lockend - lockstart) { 6112 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6113 lockstart + len - 1, 6114 unlock_bits | EXTENT_DEFRAG, 1, 0, 6115 &cached_state, GFP_NOFS); 6116 /* 6117 * Besides unlocking, we also need to clean up reserved space 6118 * for the leftover range by attaching EXTENT_DO_ACCOUNTING. 6119 */ 6120 clear_extent_bit(&BTRFS_I(inode)->io_tree, 6121 lockstart + len, lockend, 6122 unlock_bits | EXTENT_DO_ACCOUNTING | 6123 EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS); 6124 } else { 6125 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6126 lockend, unlock_bits, 1, 0, 6127 &cached_state, GFP_NOFS); 6128 } 6129 } else { 6130 free_extent_state(cached_state); 6131 } 6132 6133 free_extent_map(em); 6134 6135 return 0; 6136 6137 unlock_err: 6138 if (create) 6139 unlock_bits |= EXTENT_DO_ACCOUNTING; 6140 6141 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6142 unlock_bits, 1, 0, &cached_state, GFP_NOFS); 6143 return ret; 6144 } 6145 6146 struct btrfs_dio_private { 6147 struct inode *inode; 6148 u64 logical_offset; 6149 u64 disk_bytenr; 6150 u64 bytes; 6151 void *private; 6152 6153 /* number of bios pending for this dio */ 6154 atomic_t pending_bios; 6155 6156 /* IO errors */ 6157 int errors; 6158 6159 struct bio *orig_bio; 6160 }; 6161 6162 static void btrfs_endio_direct_read(struct bio *bio, int err) 6163 { 6164 struct btrfs_dio_private *dip = bio->bi_private; 6165 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 6166 struct bio_vec *bvec = bio->bi_io_vec; 6167 struct inode *inode = dip->inode; 6168 struct btrfs_root *root = BTRFS_I(inode)->root; 6169 u64 start; 6170 6171 start = dip->logical_offset; 6172 do { 6173 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 6174 struct page *page = bvec->bv_page; 6175 char *kaddr; 6176 u32 csum = ~(u32)0; 6177 u64 private = ~(u32)0; 6178 unsigned long flags; 6179 6180 if (get_state_private(&BTRFS_I(inode)->io_tree, 6181 start, &private)) 6182 goto failed; 6183 local_irq_save(flags); 6184 kaddr = kmap_atomic(page); 6185 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, 6186 csum, bvec->bv_len); 6187 btrfs_csum_final(csum, (char *)&csum); 6188 kunmap_atomic(kaddr); 6189 local_irq_restore(flags); 6190 6191 flush_dcache_page(bvec->bv_page); 6192 if (csum != private) { 6193 failed: 6194 printk(KERN_ERR "btrfs csum failed ino %llu off" 6195 " %llu csum %u private %u\n", 6196 (unsigned long long)btrfs_ino(inode), 6197 (unsigned long long)start, 6198 csum, (unsigned)private); 6199 err = -EIO; 6200 } 6201 } 6202 6203 start += bvec->bv_len; 6204 bvec++; 6205 } while (bvec <= bvec_end); 6206 6207 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, 6208 dip->logical_offset + dip->bytes - 1); 6209 bio->bi_private = dip->private; 6210 6211 kfree(dip); 6212 6213 /* If we had a csum failure make sure to clear the uptodate flag */ 6214 if (err) 6215 clear_bit(BIO_UPTODATE, &bio->bi_flags); 6216 dio_end_io(bio, err); 6217 } 6218 6219 static void btrfs_endio_direct_write(struct bio *bio, int err) 6220 { 6221 struct btrfs_dio_private *dip = bio->bi_private; 6222 struct inode *inode = dip->inode; 6223 struct btrfs_root *root = BTRFS_I(inode)->root; 6224 struct btrfs_ordered_extent *ordered = NULL; 6225 u64 ordered_offset = dip->logical_offset; 6226 u64 ordered_bytes = dip->bytes; 6227 int ret; 6228 6229 if (err) 6230 goto out_done; 6231 again: 6232 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, 6233 &ordered_offset, 6234 ordered_bytes,
!err); 6235 if (!ret) 6236 goto out_test; 6237 6238 ordered->work.func = finish_ordered_fn; 6239 ordered->work.flags = 0; 6240 btrfs_queue_worker(&root->fs_info->endio_write_workers, 6241 &ordered->work); 6242 out_test: 6243 /* 6244 * our bio might span multiple ordered extents. If we haven't 6245 * completed the accounting for the whole dio, go back and try again 6246 */ 6247 if (ordered_offset < dip->logical_offset + dip->bytes) { 6248 ordered_bytes = dip->logical_offset + dip->bytes - 6249 ordered_offset; 6250 ordered = NULL; 6251 goto again; 6252 } 6253 out_done: 6254 bio->bi_private = dip->private; 6255 6256 kfree(dip); 6257 6258 /* If we had an error make sure to clear the uptodate flag */ 6259 if (err) 6260 clear_bit(BIO_UPTODATE, &bio->bi_flags); 6261 dio_end_io(bio, err); 6262 } 6263 6264 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, 6265 struct bio *bio, int mirror_num, 6266 unsigned long bio_flags, u64 offset) 6267 { 6268 int ret; 6269 struct btrfs_root *root = BTRFS_I(inode)->root; 6270 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); 6271 BUG_ON(ret); /* -ENOMEM */ 6272 return 0; 6273 } 6274 6275 static void btrfs_end_dio_bio(struct bio *bio, int err) 6276 { 6277 struct btrfs_dio_private *dip = bio->bi_private; 6278 6279 if (err) { 6280 printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " 6281 "sector %#Lx len %u err no %d\n", 6282 (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw, 6283 (unsigned long long)bio->bi_sector, bio->bi_size, err); 6284 dip->errors = 1; 6285 6286 /* 6287 * before the atomic variable goes to zero, we must make sure 6288 * dip->errors is perceived to be set. 6289 */ 6290 smp_mb__before_atomic_dec(); 6291 } 6292 6293 /* if there are more bios still pending for this dio, just exit */ 6294 if (!atomic_dec_and_test(&dip->pending_bios)) 6295 goto out; 6296 6297 if (dip->errors) 6298 bio_io_error(dip->orig_bio); 6299 else { 6300 set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); 6301 bio_endio(dip->orig_bio, 0); 6302 } 6303 out: 6304 bio_put(bio); 6305 } 6306 6307 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, 6308 u64 first_sector, gfp_t gfp_flags) 6309 { 6310 int nr_vecs = bio_get_nr_vecs(bdev); 6311 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); 6312 } 6313 6314 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 6315 int rw, u64 file_offset, int skip_sum, 6316 int async_submit) 6317 { 6318 int write = rw & REQ_WRITE; 6319 struct btrfs_root *root = BTRFS_I(inode)->root; 6320 int ret; 6321 6322 bio_get(bio); 6323 6324 if (!write) { 6325 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 6326 if (ret) 6327 goto err; 6328 } 6329 6330 if (skip_sum) 6331 goto map; 6332 6333 if (write && async_submit) { 6334 ret = btrfs_wq_submit_bio(root->fs_info, 6335 inode, rw, bio, 0, 0, 6336 file_offset, 6337 __btrfs_submit_bio_start_direct_io, 6338 __btrfs_submit_bio_done); 6339 goto err; 6340 } else if (write) { 6341 /* 6342 * If we aren't doing async submit, calculate the csum of the 6343 * bio now.
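 *
 * (Editor's note, not in the original: this synchronous branch mirrors
 * what __btrfs_submit_bio_start_direct_io() does in the async case;
 * reads instead fetch the stored csums via btrfs_lookup_bio_sums_dio()
 * below so that btrfs_endio_direct_read() can verify each page.)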
6344 */ 6345 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); 6346 if (ret) 6347 goto err; 6348 } else if (!skip_sum) { 6349 ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset); 6350 if (ret) 6351 goto err; 6352 } 6353 6354 map: 6355 ret = btrfs_map_bio(root, rw, bio, 0, async_submit); 6356 err: 6357 bio_put(bio); 6358 return ret; 6359 } 6360 6361 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, 6362 int skip_sum) 6363 { 6364 struct inode *inode = dip->inode; 6365 struct btrfs_root *root = BTRFS_I(inode)->root; 6366 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 6367 struct bio *bio; 6368 struct bio *orig_bio = dip->orig_bio; 6369 struct bio_vec *bvec = orig_bio->bi_io_vec; 6370 u64 start_sector = orig_bio->bi_sector; 6371 u64 file_offset = dip->logical_offset; 6372 u64 submit_len = 0; 6373 u64 map_length; 6374 int nr_pages = 0; 6375 int ret = 0; 6376 int async_submit = 0; 6377 6378 map_length = orig_bio->bi_size; 6379 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6380 &map_length, NULL, 0); 6381 if (ret) { 6382 bio_put(orig_bio); 6383 return -EIO; 6384 } 6385 6386 if (map_length >= orig_bio->bi_size) { 6387 bio = orig_bio; 6388 goto submit; 6389 } 6390 6391 async_submit = 1; 6392 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 6393 if (!bio) 6394 return -ENOMEM; 6395 bio->bi_private = dip; 6396 bio->bi_end_io = btrfs_end_dio_bio; 6397 atomic_inc(&dip->pending_bios); 6398 6399 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 6400 if (unlikely(map_length < submit_len + bvec->bv_len || 6401 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 6402 bvec->bv_offset) < bvec->bv_len)) { 6403 /* 6404 * inc the count before we submit the bio so 6405 * we know the end IO handler won't happen before 6406 * we inc the count. Otherwise, the dip might get freed 6407 * before we're done setting it up 6408 */ 6409 atomic_inc(&dip->pending_bios); 6410 ret = __btrfs_submit_dio_bio(bio, inode, rw, 6411 file_offset, skip_sum, 6412 async_submit); 6413 if (ret) { 6414 bio_put(bio); 6415 atomic_dec(&dip->pending_bios); 6416 goto out_err; 6417 } 6418 6419 start_sector += submit_len >> 9; 6420 file_offset += submit_len; 6421 6422 submit_len = 0; 6423 nr_pages = 0; 6424 6425 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, 6426 start_sector, GFP_NOFS); 6427 if (!bio) 6428 goto out_err; 6429 bio->bi_private = dip; 6430 bio->bi_end_io = btrfs_end_dio_bio; 6431 6432 map_length = orig_bio->bi_size; 6433 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6434 &map_length, NULL, 0); 6435 if (ret) { 6436 bio_put(bio); 6437 goto out_err; 6438 } 6439 } else { 6440 submit_len += bvec->bv_len; 6441 nr_pages++; 6442 bvec++; 6443 } 6444 } 6445 6446 submit: 6447 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, 6448 async_submit); 6449 if (!ret) 6450 return 0; 6451 6452 bio_put(bio); 6453 out_err: 6454 dip->errors = 1; 6455 /* 6456 * before the atomic variable goes to zero, we must 6457 * make sure dip->errors is perceived to be set.
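 *
 * (Editor's note, not in the original: this pairs with the
 * atomic_dec_and_test() below and in btrfs_end_dio_bio(); the
 * dip->errors = 1 store must be visible before pending_bios can reach
 * zero on another CPU, so whichever path drops the last reference
 * observes the error.)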
6458 */ 6459 smp_mb__before_atomic_dec(); 6460 if (atomic_dec_and_test(&dip->pending_bios)) 6461 bio_io_error(dip->orig_bio); 6462 6463 /* bio_end_io() will handle error, so we needn't return it */ 6464 return 0; 6465 } 6466 6467 static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, 6468 loff_t file_offset) 6469 { 6470 struct btrfs_root *root = BTRFS_I(inode)->root; 6471 struct btrfs_dio_private *dip; 6472 struct bio_vec *bvec = bio->bi_io_vec; 6473 int skip_sum; 6474 int write = rw & REQ_WRITE; 6475 int ret = 0; 6476 6477 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 6478 6479 dip = kmalloc(sizeof(*dip), GFP_NOFS); 6480 if (!dip) { 6481 ret = -ENOMEM; 6482 goto free_ordered; 6483 } 6484 6485 dip->private = bio->bi_private; 6486 dip->inode = inode; 6487 dip->logical_offset = file_offset; 6488 6489 dip->bytes = 0; 6490 do { 6491 dip->bytes += bvec->bv_len; 6492 bvec++; 6493 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); 6494 6495 dip->disk_bytenr = (u64)bio->bi_sector << 9; 6496 bio->bi_private = dip; 6497 dip->errors = 0; 6498 dip->orig_bio = bio; 6499 atomic_set(&dip->pending_bios, 0); 6500 6501 if (write) 6502 bio->bi_end_io = btrfs_endio_direct_write; 6503 else 6504 bio->bi_end_io = btrfs_endio_direct_read; 6505 6506 ret = btrfs_submit_direct_hook(rw, dip, skip_sum); 6507 if (!ret) 6508 return; 6509 free_ordered: 6510 /* 6511 * If this is a write, we need to clean up the reserved space and kill 6512 * the ordered extent. 6513 */ 6514 if (write) { 6515 struct btrfs_ordered_extent *ordered; 6516 ordered = btrfs_lookup_ordered_extent(inode, file_offset); 6517 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && 6518 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) 6519 btrfs_free_reserved_extent(root, ordered->start, 6520 ordered->disk_len); 6521 btrfs_put_ordered_extent(ordered); 6522 btrfs_put_ordered_extent(ordered); 6523 } 6524 bio_endio(bio, ret); 6525 } 6526 6527 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, 6528 const struct iovec *iov, loff_t offset, 6529 unsigned long nr_segs) 6530 { 6531 int seg; 6532 int i; 6533 size_t size; 6534 unsigned long addr; 6535 unsigned blocksize_mask = root->sectorsize - 1; 6536 ssize_t retval = -EINVAL; 6537 loff_t end = offset; 6538 6539 if (offset & blocksize_mask) 6540 goto out; 6541 6542 /* Check the memory alignment. Blocks cannot straddle pages */ 6543 for (seg = 0; seg < nr_segs; seg++) { 6544 addr = (unsigned long)iov[seg].iov_base; 6545 size = iov[seg].iov_len; 6546 end += size; 6547 if ((addr & blocksize_mask) || (size & blocksize_mask)) 6548 goto out; 6549 6550 /* If this is a write we don't need to check anymore */ 6551 if (rw & WRITE) 6552 continue; 6553 6554 /* 6555 * Check to make sure we don't have duplicate iov_base's in this 6556 * iovec, if so return EINVAL, otherwise we'll get csum errors 6557 * when reading back. 
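 *
 * (Editor's sketch, not in the original: a vector such as
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = buf, .iov_len = 4096 },
 *		{ .iov_base = buf, .iov_len = 4096 },	duplicate base
 *	};
 *
 * would let the second segment's DMA land while the first is still being
 * checksummed, so such vectors are rejected and the request falls back
 * through the page cache, since btrfs_direct_IO() returns 0 for them.)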
6558 */ 6559 for (i = seg + 1; i < nr_segs; i++) { 6560 if (iov[seg].iov_base == iov[i].iov_base) 6561 goto out; 6562 } 6563 } 6564 retval = 0; 6565 out: 6566 return retval; 6567 } 6568 6569 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 6570 const struct iovec *iov, loff_t offset, 6571 unsigned long nr_segs) 6572 { 6573 struct file *file = iocb->ki_filp; 6574 struct inode *inode = file->f_mapping->host; 6575 6576 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, 6577 offset, nr_segs)) 6578 return 0; 6579 6580 return __blockdev_direct_IO(rw, iocb, inode, 6581 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 6582 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, 6583 btrfs_submit_direct, 0); 6584 } 6585 6586 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6587 __u64 start, __u64 len) 6588 { 6589 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 6590 } 6591 6592 int btrfs_readpage(struct file *file, struct page *page) 6593 { 6594 struct extent_io_tree *tree; 6595 tree = &BTRFS_I(page->mapping->host)->io_tree; 6596 return extent_read_full_page(tree, page, btrfs_get_extent, 0); 6597 } 6598 6599 static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 6600 { 6601 struct extent_io_tree *tree; 6602 6603 6604 if (current->flags & PF_MEMALLOC) { 6605 redirty_page_for_writepage(wbc, page); 6606 unlock_page(page); 6607 return 0; 6608 } 6609 tree = &BTRFS_I(page->mapping->host)->io_tree; 6610 return extent_write_full_page(tree, page, btrfs_get_extent, wbc); 6611 } 6612 6613 int btrfs_writepages(struct address_space *mapping, 6614 struct writeback_control *wbc) 6615 { 6616 struct extent_io_tree *tree; 6617 6618 tree = &BTRFS_I(mapping->host)->io_tree; 6619 return extent_writepages(tree, mapping, btrfs_get_extent, wbc); 6620 } 6621 6622 static int 6623 btrfs_readpages(struct file *file, struct address_space *mapping, 6624 struct list_head *pages, unsigned nr_pages) 6625 { 6626 struct extent_io_tree *tree; 6627 tree = &BTRFS_I(mapping->host)->io_tree; 6628 return extent_readpages(tree, mapping, pages, nr_pages, 6629 btrfs_get_extent); 6630 } 6631 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) 6632 { 6633 struct extent_io_tree *tree; 6634 struct extent_map_tree *map; 6635 int ret; 6636 6637 tree = &BTRFS_I(page->mapping->host)->io_tree; 6638 map = &BTRFS_I(page->mapping->host)->extent_tree; 6639 ret = try_release_extent_mapping(map, tree, page, gfp_flags); 6640 if (ret == 1) { 6641 ClearPagePrivate(page); 6642 set_page_private(page, 0); 6643 page_cache_release(page); 6644 } 6645 return ret; 6646 } 6647 6648 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) 6649 { 6650 if (PageWriteback(page) || PageDirty(page)) 6651 return 0; 6652 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); 6653 } 6654 6655 static void btrfs_invalidatepage(struct page *page, unsigned long offset) 6656 { 6657 struct inode *inode = page->mapping->host; 6658 struct extent_io_tree *tree; 6659 struct btrfs_ordered_extent *ordered; 6660 struct extent_state *cached_state = NULL; 6661 u64 page_start = page_offset(page); 6662 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 6663 6664 /* 6665 * we have the page locked, so new writeback can't start, 6666 * and the dirty bit won't be cleared while we are here. 
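 * (Editor's note, not in the original: compare btrfs_releasepage() above,
 * which simply refuses dirty or writeback pages; invalidatepage instead
 * relies on the page lock and waits for writeback below.)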
6667 * 6668 * Wait for IO on this page so that we can safely clear 6669 * the PagePrivate2 bit and do ordered accounting 6670 */ 6671 wait_on_page_writeback(page); 6672 6673 tree = &BTRFS_I(inode)->io_tree; 6674 if (offset) { 6675 btrfs_releasepage(page, GFP_NOFS); 6676 return; 6677 } 6678 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 6679 ordered = btrfs_lookup_ordered_extent(inode, 6680 page_offset(page)); 6681 if (ordered) { 6682 /* 6683 * IO on this page will never be started, so we need 6684 * to account for any ordered extents now 6685 */ 6686 clear_extent_bit(tree, page_start, page_end, 6687 EXTENT_DIRTY | EXTENT_DELALLOC | 6688 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 6689 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); 6690 /* 6691 * whoever cleared the private bit is responsible 6692 * for the finish_ordered_io 6693 */ 6694 if (TestClearPagePrivate2(page) && 6695 btrfs_dec_test_ordered_pending(inode, &ordered, page_start, 6696 PAGE_CACHE_SIZE, 1)) { 6697 btrfs_finish_ordered_io(ordered); 6698 } 6699 btrfs_put_ordered_extent(ordered); 6700 cached_state = NULL; 6701 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 6702 } 6703 clear_extent_bit(tree, page_start, page_end, 6704 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 6705 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, 6706 &cached_state, GFP_NOFS); 6707 __btrfs_releasepage(page, GFP_NOFS); 6708 6709 ClearPageChecked(page); 6710 if (PagePrivate(page)) { 6711 ClearPagePrivate(page); 6712 set_page_private(page, 0); 6713 page_cache_release(page); 6714 } 6715 } 6716 6717 /* 6718 * btrfs_page_mkwrite() is not allowed to change the file size as it gets 6719 * called from a page fault handler when a page is first dirtied. Hence we must 6720 * be careful to check for EOF conditions here. We set the page up correctly 6721 * for a written page which means we get ENOSPC checking when writing into 6722 * holes and correct delalloc and unwritten extent mapping on filesystems that 6723 * support these features. 6724 * 6725 * We are not allowed to take the i_mutex here so we have to play games to 6726 * protect against truncate races as the page could now be beyond EOF. Because 6727 * vmtruncate() writes the inode size before removing pages, once we have the 6728 * page lock we can determine safely if the page is beyond EOF. If it is not 6729 * beyond EOF, then the page is guaranteed safe against truncation until we 6730 * unlock the page. 
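 *
 * (Editor's sketch, not in the original, of what the EOF check below
 * boils down to:
 *
 *	lock_page(page);
 *	if (page->mapping != inode->i_mapping ||
 *	    page_offset(page) >= i_size_read(inode))
 *		goto out_unlock;	 raced with truncate, bail
 * )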
6731 */ 6732 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 6733 { 6734 struct page *page = vmf->page; 6735 struct inode *inode = fdentry(vma->vm_file)->d_inode; 6736 struct btrfs_root *root = BTRFS_I(inode)->root; 6737 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 6738 struct btrfs_ordered_extent *ordered; 6739 struct extent_state *cached_state = NULL; 6740 char *kaddr; 6741 unsigned long zero_start; 6742 loff_t size; 6743 int ret; 6744 int reserved = 0; 6745 u64 page_start; 6746 u64 page_end; 6747 6748 sb_start_pagefault(inode->i_sb); 6749 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6750 if (!ret) { 6751 ret = file_update_time(vma->vm_file); 6752 reserved = 1; 6753 } 6754 if (ret) { 6755 if (ret == -ENOMEM) 6756 ret = VM_FAULT_OOM; 6757 else /* -ENOSPC, -EIO, etc */ 6758 ret = VM_FAULT_SIGBUS; 6759 if (reserved) 6760 goto out; 6761 goto out_noreserve; 6762 } 6763 6764 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 6765 again: 6766 lock_page(page); 6767 size = i_size_read(inode); 6768 page_start = page_offset(page); 6769 page_end = page_start + PAGE_CACHE_SIZE - 1; 6770 6771 if ((page->mapping != inode->i_mapping) || 6772 (page_start >= size)) { 6773 /* page got truncated out from underneath us */ 6774 goto out_unlock; 6775 } 6776 wait_on_page_writeback(page); 6777 6778 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); 6779 set_page_extent_mapped(page); 6780 6781 /* 6782 * we can't set the delalloc bits if there are pending ordered 6783 * extents. Drop our locks and wait for them to finish 6784 */ 6785 ordered = btrfs_lookup_ordered_extent(inode, page_start); 6786 if (ordered) { 6787 unlock_extent_cached(io_tree, page_start, page_end, 6788 &cached_state, GFP_NOFS); 6789 unlock_page(page); 6790 btrfs_start_ordered_extent(inode, ordered, 1); 6791 btrfs_put_ordered_extent(ordered); 6792 goto again; 6793 } 6794 6795 /* 6796 * XXX - page_mkwrite gets called every time the page is dirtied, even 6797 * if it was already dirty, so for space accounting reasons we need to 6798 * clear any delalloc bits for the range we are fixing to save. There 6799 * is probably a better way to do this, but for now keep consistent with 6800 * prepare_pages in the normal write path. 
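 *
 * (Editor's note, not in the original: without the clear_extent_bit()
 * below, re-dirtying an already-delalloc page would account space for
 * the same range twice; clearing first and then re-setting delalloc
 * keeps the accounting identical to the buffered write path.)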
6801 */ 6802 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 6803 EXTENT_DIRTY | EXTENT_DELALLOC | 6804 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 6805 0, 0, &cached_state, GFP_NOFS); 6806 6807 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 6808 &cached_state); 6809 if (ret) { 6810 unlock_extent_cached(io_tree, page_start, page_end, 6811 &cached_state, GFP_NOFS); 6812 ret = VM_FAULT_SIGBUS; 6813 goto out_unlock; 6814 } 6815 ret = 0; 6816 6817 /* page is wholly or partially inside EOF */ 6818 if (page_start + PAGE_CACHE_SIZE > size) 6819 zero_start = size & ~PAGE_CACHE_MASK; 6820 else 6821 zero_start = PAGE_CACHE_SIZE; 6822 6823 if (zero_start != PAGE_CACHE_SIZE) { 6824 kaddr = kmap(page); 6825 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); 6826 flush_dcache_page(page); 6827 kunmap(page); 6828 } 6829 ClearPageChecked(page); 6830 set_page_dirty(page); 6831 SetPageUptodate(page); 6832 6833 BTRFS_I(inode)->last_trans = root->fs_info->generation; 6834 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 6835 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; 6836 6837 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6838 6839 out_unlock: 6840 if (!ret) { 6841 sb_end_pagefault(inode->i_sb); 6842 return VM_FAULT_LOCKED; 6843 } 6844 unlock_page(page); 6845 out: 6846 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 6847 out_noreserve: 6848 sb_end_pagefault(inode->i_sb); 6849 return ret; 6850 } 6851 6852 static int btrfs_truncate(struct inode *inode) 6853 { 6854 struct btrfs_root *root = BTRFS_I(inode)->root; 6855 struct btrfs_block_rsv *rsv; 6856 int ret; 6857 int err = 0; 6858 struct btrfs_trans_handle *trans; 6859 unsigned long nr; 6860 u64 mask = root->sectorsize - 1; 6861 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6862 6863 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); 6864 if (ret) 6865 return ret; 6866 6867 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6868 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6869 6870 /* 6871 * Yes ladies and gentlemen, this is indeed ugly. The fact is we have 6872 * 3 things going on here 6873 * 6874 * 1) We need to reserve space for our orphan item and the space to 6875 * delete our orphan item. Lord knows we don't want to have a dangling 6876 * orphan item because we didn't reserve space to remove it. 6877 * 6878 * 2) We need to reserve space to update our inode. 6879 * 6880 * 3) We need to have something to cache all the space that is going to 6881 * be freed up by the truncate operation, but also have some slack 6882 * space reserved in case it uses space during the truncate (thank you 6883 * very much snapshotting). 6884 * 6885 * And we need these to all be separate. The fact is we can use a lot of 6886 * space doing the truncate, and we have no earthly idea how much space 6887 * we will use, so we need the truncate reservation to be separate so it 6888 * doesn't end up using space reserved for updating the inode or 6889 * removing the orphan item. We also need to be able to stop the 6890 * transaction and start a new one, which means we need to be able to 6891 * update the inode several times, and we have no way of knowing how 6892 * many times that will be, so we can't just reserve 1 item for the 6893 * entirety of the operation, so that has to be done separately as well.
Then there is the orphan item, which does indeed need to be held on 6895 * to for the whole operation, and we need nobody to touch this reserved 6896 * space except the orphan code. 6897 * 6898 * So that leaves us with 6899 * 6900 * 1) root->orphan_block_rsv - for the orphan deletion. 6901 * 2) rsv - for the truncate reservation, which we will steal from the 6902 * transaction reservation. 6903 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for 6904 * updating the inode. 6905 */ 6906 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 6907 if (!rsv) 6908 return -ENOMEM; 6909 rsv->size = min_size; 6910 rsv->failfast = 1; 6911 6912 /* 6913 * 1 for the truncate slack space 6914 * 1 for the orphan item we're going to add 6915 * 1 for the orphan item deletion 6916 * 1 for updating the inode. 6917 */ 6918 trans = btrfs_start_transaction(root, 4); 6919 if (IS_ERR(trans)) { 6920 err = PTR_ERR(trans); 6921 goto out; 6922 } 6923 6924 /* Migrate the slack space for the truncate to our reserve */ 6925 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, 6926 min_size); 6927 BUG_ON(ret); 6928 6929 ret = btrfs_orphan_add(trans, inode); 6930 if (ret) { 6931 btrfs_end_transaction(trans, root); 6932 goto out; 6933 } 6934 6935 /* 6936 * setattr is responsible for setting the ordered_data_close flag, 6937 * but that is only tested during the last file release. That 6938 * could happen well after the next commit, leaving a great big 6939 * window where new writes may get lost if someone chooses to write 6940 * to this file after truncating to zero 6941 * 6942 * The inode doesn't have any dirty data here, and so if we commit 6943 * this is a noop. If someone immediately starts writing to the inode 6944 * it is very likely we'll catch some of their writes in this 6945 * transaction, and the commit will find this file on the ordered 6946 * data list with good things to send down. 6947 * 6948 * This is a best effort solution, there is still a window where 6949 * using truncate to replace the contents of the file will 6950 * end up with a zero length file after a crash. 6951 */ 6952 if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 6953 &BTRFS_I(inode)->runtime_flags)) 6954 btrfs_add_ordered_operation(trans, root, inode); 6955 6956 /* 6957 * So if we truncate and then write and fsync we normally would just 6958 * write the extents that changed, which is a problem if we need to 6959 * first truncate that entire inode. So set this flag so we write out 6960 * all of the extents in the inode to the sync log so we're completely 6961 * safe.
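 *
 * (Editor's note, not in the original: BTRFS_INODE_NEEDS_FULL_SYNC makes
 * the tree-log code copy all of the inode's items on the next fsync
 * instead of only the extents recorded in the current transaction.)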
6962 */ 6963 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 6964 trans->block_rsv = rsv; 6965 6966 while (1) { 6967 ret = btrfs_truncate_inode_items(trans, root, inode, 6968 inode->i_size, 6969 BTRFS_EXTENT_DATA_KEY); 6970 if (ret != -ENOSPC) { 6971 err = ret; 6972 break; 6973 } 6974 6975 trans->block_rsv = &root->fs_info->trans_block_rsv; 6976 ret = btrfs_update_inode(trans, root, inode); 6977 if (ret) { 6978 err = ret; 6979 break; 6980 } 6981 6982 nr = trans->blocks_used; 6983 btrfs_end_transaction(trans, root); 6984 btrfs_btree_balance_dirty(root, nr); 6985 6986 trans = btrfs_start_transaction(root, 2); 6987 if (IS_ERR(trans)) { 6988 ret = err = PTR_ERR(trans); 6989 trans = NULL; 6990 break; 6991 } 6992 6993 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, 6994 rsv, min_size); 6995 BUG_ON(ret); /* shouldn't happen */ 6996 trans->block_rsv = rsv; 6997 } 6998 6999 if (ret == 0 && inode->i_nlink > 0) { 7000 trans->block_rsv = root->orphan_block_rsv; 7001 ret = btrfs_orphan_del(trans, inode); 7002 if (ret) 7003 err = ret; 7004 } else if (ret && inode->i_nlink > 0) { 7005 /* 7006 * Failed to do the truncate, remove us from the in memory 7007 * orphan list. 7008 */ 7009 ret = btrfs_orphan_del(NULL, inode); 7010 } 7011 7012 if (trans) { 7013 trans->block_rsv = &root->fs_info->trans_block_rsv; 7014 ret = btrfs_update_inode(trans, root, inode); 7015 if (ret && !err) 7016 err = ret; 7017 7018 nr = trans->blocks_used; 7019 ret = btrfs_end_transaction(trans, root); 7020 btrfs_btree_balance_dirty(root, nr); 7021 } 7022 7023 out: 7024 btrfs_free_block_rsv(root, rsv); 7025 7026 if (ret && !err) 7027 err = ret; 7028 7029 return err; 7030 } 7031 7032 /* 7033 * create a new subvolume directory/inode (helper for the ioctl). 7034 */ 7035 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 7036 struct btrfs_root *new_root, u64 new_dirid) 7037 { 7038 struct inode *inode; 7039 int err; 7040 u64 index = 0; 7041 7042 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, 7043 new_dirid, new_dirid, 7044 S_IFDIR | (~current_umask() & S_IRWXUGO), 7045 &index); 7046 if (IS_ERR(inode)) 7047 return PTR_ERR(inode); 7048 inode->i_op = &btrfs_dir_inode_operations; 7049 inode->i_fop = &btrfs_dir_file_operations; 7050 7051 set_nlink(inode, 1); 7052 btrfs_i_size_write(inode, 0); 7053 7054 err = btrfs_update_inode(trans, new_root, inode); 7055 7056 iput(inode); 7057 return err; 7058 } 7059 7060 struct inode *btrfs_alloc_inode(struct super_block *sb) 7061 { 7062 struct btrfs_inode *ei; 7063 struct inode *inode; 7064 7065 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 7066 if (!ei) 7067 return NULL; 7068 7069 ei->root = NULL; 7070 ei->generation = 0; 7071 ei->last_trans = 0; 7072 ei->last_sub_trans = 0; 7073 ei->logged_trans = 0; 7074 ei->delalloc_bytes = 0; 7075 ei->disk_i_size = 0; 7076 ei->flags = 0; 7077 ei->csum_bytes = 0; 7078 ei->index_cnt = (u64)-1; 7079 ei->last_unlink_trans = 0; 7080 ei->last_log_commit = 0; 7081 7082 spin_lock_init(&ei->lock); 7083 ei->outstanding_extents = 0; 7084 ei->reserved_extents = 0; 7085 7086 ei->runtime_flags = 0; 7087 ei->force_compress = BTRFS_COMPRESS_NONE; 7088 7089 ei->delayed_node = NULL; 7090 7091 inode = &ei->vfs_inode; 7092 extent_map_tree_init(&ei->extent_tree); 7093 extent_io_tree_init(&ei->io_tree, &inode->i_data); 7094 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 7095 ei->io_tree.track_uptodate = 1; 7096 ei->io_failure_tree.track_uptodate = 1; 7097 mutex_init(&ei->log_mutex); 7098 
mutex_init(&ei->delalloc_mutex); 7099 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 7100 INIT_LIST_HEAD(&ei->delalloc_inodes); 7101 INIT_LIST_HEAD(&ei->ordered_operations); 7102 RB_CLEAR_NODE(&ei->rb_node); 7103 7104 return inode; 7105 } 7106 7107 static void btrfs_i_callback(struct rcu_head *head) 7108 { 7109 struct inode *inode = container_of(head, struct inode, i_rcu); 7110 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 7111 } 7112 7113 void btrfs_destroy_inode(struct inode *inode) 7114 { 7115 struct btrfs_ordered_extent *ordered; 7116 struct btrfs_root *root = BTRFS_I(inode)->root; 7117 7118 WARN_ON(!hlist_empty(&inode->i_dentry)); 7119 WARN_ON(inode->i_data.nrpages); 7120 WARN_ON(BTRFS_I(inode)->outstanding_extents); 7121 WARN_ON(BTRFS_I(inode)->reserved_extents); 7122 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 7123 WARN_ON(BTRFS_I(inode)->csum_bytes); 7124 7125 /* 7126 * This can happen where we create an inode, but somebody else also 7127 * created the same inode and we need to destroy the one we already 7128 * created. 7129 */ 7130 if (!root) 7131 goto free; 7132 7133 /* 7134 * Make sure we're properly removed from the ordered operation 7135 * lists. 7136 */ 7137 smp_mb(); 7138 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { 7139 spin_lock(&root->fs_info->ordered_extent_lock); 7140 list_del_init(&BTRFS_I(inode)->ordered_operations); 7141 spin_unlock(&root->fs_info->ordered_extent_lock); 7142 } 7143 7144 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 7145 &BTRFS_I(inode)->runtime_flags)) { 7146 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", 7147 (unsigned long long)btrfs_ino(inode)); 7148 atomic_dec(&root->orphan_inodes); 7149 } 7150 7151 while (1) { 7152 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 7153 if (!ordered) 7154 break; 7155 else { 7156 printk(KERN_ERR "btrfs found ordered " 7157 "extent %llu %llu on inode cleanup\n", 7158 (unsigned long long)ordered->file_offset, 7159 (unsigned long long)ordered->len); 7160 btrfs_remove_ordered_extent(inode, ordered); 7161 btrfs_put_ordered_extent(ordered); 7162 btrfs_put_ordered_extent(ordered); 7163 } 7164 } 7165 inode_tree_del(inode); 7166 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 7167 free: 7168 btrfs_remove_delayed_node(inode); 7169 call_rcu(&inode->i_rcu, btrfs_i_callback); 7170 } 7171 7172 int btrfs_drop_inode(struct inode *inode) 7173 { 7174 struct btrfs_root *root = BTRFS_I(inode)->root; 7175 7176 if (btrfs_root_refs(&root->root_item) == 0 && 7177 !btrfs_is_free_space_inode(inode)) 7178 return 1; 7179 else 7180 return generic_drop_inode(inode); 7181 } 7182 7183 static void init_once(void *foo) 7184 { 7185 struct btrfs_inode *ei = (struct btrfs_inode *) foo; 7186 7187 inode_init_once(&ei->vfs_inode); 7188 } 7189 7190 void btrfs_destroy_cachep(void) 7191 { 7192 /* 7193 * Make sure all delayed rcu free inodes are flushed before we 7194 * destroy cache. 
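 * (Editor's note, not in the original: btrfs_destroy_inode() frees inodes
 * through call_rcu(&inode->i_rcu, btrfs_i_callback), so the rcu_barrier()
 * below waits for every pending callback before kmem_cache_destroy()
 * tears the slab down.)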
7195 */ 7196 rcu_barrier(); 7197 if (btrfs_inode_cachep) 7198 kmem_cache_destroy(btrfs_inode_cachep); 7199 if (btrfs_trans_handle_cachep) 7200 kmem_cache_destroy(btrfs_trans_handle_cachep); 7201 if (btrfs_transaction_cachep) 7202 kmem_cache_destroy(btrfs_transaction_cachep); 7203 if (btrfs_path_cachep) 7204 kmem_cache_destroy(btrfs_path_cachep); 7205 if (btrfs_free_space_cachep) 7206 kmem_cache_destroy(btrfs_free_space_cachep); 7207 } 7208 7209 int btrfs_init_cachep(void) 7210 { 7211 btrfs_inode_cachep = kmem_cache_create("btrfs_inode", 7212 sizeof(struct btrfs_inode), 0, 7213 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); 7214 if (!btrfs_inode_cachep) 7215 goto fail; 7216 7217 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", 7218 sizeof(struct btrfs_trans_handle), 0, 7219 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7220 if (!btrfs_trans_handle_cachep) 7221 goto fail; 7222 7223 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", 7224 sizeof(struct btrfs_transaction), 0, 7225 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7226 if (!btrfs_transaction_cachep) 7227 goto fail; 7228 7229 btrfs_path_cachep = kmem_cache_create("btrfs_path", 7230 sizeof(struct btrfs_path), 0, 7231 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7232 if (!btrfs_path_cachep) 7233 goto fail; 7234 7235 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", 7236 sizeof(struct btrfs_free_space), 0, 7237 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7238 if (!btrfs_free_space_cachep) 7239 goto fail; 7240 7241 return 0; 7242 fail: 7243 btrfs_destroy_cachep(); 7244 return -ENOMEM; 7245 } 7246 7247 static int btrfs_getattr(struct vfsmount *mnt, 7248 struct dentry *dentry, struct kstat *stat) 7249 { 7250 struct inode *inode = dentry->d_inode; 7251 u32 blocksize = inode->i_sb->s_blocksize; 7252 7253 generic_fillattr(inode, stat); 7254 stat->dev = BTRFS_I(inode)->root->anon_dev; 7255 stat->blksize = PAGE_CACHE_SIZE; 7256 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + 7257 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; 7258 return 0; 7259 } 7260 7261 /* 7262 * If a file is moved, it will inherit the cow and compression flags of the new 7263 * directory. 
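 *
 * (Editor's sketch, not in the original: after something like
 *
 *	mv file nodatacow_dir/
 *
 * the inode picks up BTRFS_INODE_NODATACOW from its new parent, as
 * implemented by fixup_inode_flags() below.)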
7264 */ 7265 static void fixup_inode_flags(struct inode *dir, struct inode *inode) 7266 { 7267 struct btrfs_inode *b_dir = BTRFS_I(dir); 7268 struct btrfs_inode *b_inode = BTRFS_I(inode); 7269 7270 if (b_dir->flags & BTRFS_INODE_NODATACOW) 7271 b_inode->flags |= BTRFS_INODE_NODATACOW; 7272 else 7273 b_inode->flags &= ~BTRFS_INODE_NODATACOW; 7274 7275 if (b_dir->flags & BTRFS_INODE_COMPRESS) { 7276 b_inode->flags |= BTRFS_INODE_COMPRESS; 7277 b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; 7278 } else { 7279 b_inode->flags &= ~(BTRFS_INODE_COMPRESS | 7280 BTRFS_INODE_NOCOMPRESS); 7281 } 7282 } 7283 7284 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 7285 struct inode *new_dir, struct dentry *new_dentry) 7286 { 7287 struct btrfs_trans_handle *trans; 7288 struct btrfs_root *root = BTRFS_I(old_dir)->root; 7289 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 7290 struct inode *new_inode = new_dentry->d_inode; 7291 struct inode *old_inode = old_dentry->d_inode; 7292 struct timespec ctime = CURRENT_TIME; 7293 u64 index = 0; 7294 u64 root_objectid; 7295 int ret; 7296 u64 old_ino = btrfs_ino(old_inode); 7297 7298 if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 7299 return -EPERM; 7300 7301 /* we only allow rename subvolume link between subvolumes */ 7302 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 7303 return -EXDEV; 7304 7305 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || 7306 (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID)) 7307 return -ENOTEMPTY; 7308 7309 if (S_ISDIR(old_inode->i_mode) && new_inode && 7310 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 7311 return -ENOTEMPTY; 7312 /* 7313 * we're using rename to replace one file with another. 7314 * and the replacement file is large. Start IO on it now so 7315 * we don't add too much work to the end of the transaction 7316 */ 7317 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && 7318 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 7319 filemap_flush(old_inode->i_mapping); 7320 7321 /* close the racy window with snapshot create/destroy ioctl */ 7322 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 7323 down_read(&root->fs_info->subvol_sem); 7324 /* 7325 * We want to reserve the absolute worst case amount of items. So if 7326 * both inodes are subvols and we need to unlink them then that would 7327 * require 4 item modifications, but if they are both normal inodes it 7328 * would require 5 item modifications, so we'll assume they're normal 7329 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items 7330 * should cover the worst case number of items we'll modify. 7331 */ 7332 trans = btrfs_start_transaction(root, 20); 7333 if (IS_ERR(trans)) { 7334 ret = PTR_ERR(trans); 7335 goto out_notrans; 7336 } 7337 7338 if (dest != root) 7339 btrfs_record_root_in_trans(trans, dest); 7340 7341 ret = btrfs_set_inode_index(new_dir, &index); 7342 if (ret) 7343 goto out_fail; 7344 7345 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 7346 /* force full log commit if subvolume involved. */ 7347 root->fs_info->last_trans_log_full_commit = trans->transid; 7348 } else { 7349 ret = btrfs_insert_inode_ref(trans, dest, 7350 new_dentry->d_name.name, 7351 new_dentry->d_name.len, 7352 old_ino, 7353 btrfs_ino(new_dir), index); 7354 if (ret) 7355 goto out_fail; 7356 /* 7357 * this is an ugly little race, but the rename is required 7358 * to make sure that if we crash, the inode is either at the 7359 * old name or the new one.
pinning the log transaction lets 7360 * us make sure we don't allow a log commit to come in after 7361 * we unlink the name but before we add the new name back in. 7362 */ 7363 btrfs_pin_log_trans(root); 7364 } 7365 /* 7366 * make sure the inode gets flushed if it is replacing 7367 * something. 7368 */ 7369 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) 7370 btrfs_add_ordered_operation(trans, root, old_inode); 7371 7372 inode_inc_iversion(old_dir); 7373 inode_inc_iversion(new_dir); 7374 inode_inc_iversion(old_inode); 7375 old_dir->i_ctime = old_dir->i_mtime = ctime; 7376 new_dir->i_ctime = new_dir->i_mtime = ctime; 7377 old_inode->i_ctime = ctime; 7378 7379 if (old_dentry->d_parent != new_dentry->d_parent) 7380 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 7381 7382 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 7383 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; 7384 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, 7385 old_dentry->d_name.name, 7386 old_dentry->d_name.len); 7387 } else { 7388 ret = __btrfs_unlink_inode(trans, root, old_dir, 7389 old_dentry->d_inode, 7390 old_dentry->d_name.name, 7391 old_dentry->d_name.len); 7392 if (!ret) 7393 ret = btrfs_update_inode(trans, root, old_inode); 7394 } 7395 if (ret) { 7396 btrfs_abort_transaction(trans, root, ret); 7397 goto out_fail; 7398 } 7399 7400 if (new_inode) { 7401 inode_inc_iversion(new_inode); 7402 new_inode->i_ctime = CURRENT_TIME; 7403 if (unlikely(btrfs_ino(new_inode) == 7404 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 7405 root_objectid = BTRFS_I(new_inode)->location.objectid; 7406 ret = btrfs_unlink_subvol(trans, dest, new_dir, 7407 root_objectid, 7408 new_dentry->d_name.name, 7409 new_dentry->d_name.len); 7410 BUG_ON(new_inode->i_nlink == 0); 7411 } else { 7412 ret = btrfs_unlink_inode(trans, dest, new_dir, 7413 new_dentry->d_inode, 7414 new_dentry->d_name.name, 7415 new_dentry->d_name.len); 7416 } 7417 if (!ret && new_inode->i_nlink == 0) { 7418 ret = btrfs_orphan_add(trans, new_dentry->d_inode); 7419 BUG_ON(ret); 7420 } 7421 if (ret) { 7422 btrfs_abort_transaction(trans, root, ret); 7423 goto out_fail; 7424 } 7425 } 7426 7427 fixup_inode_flags(new_dir, old_inode); 7428 7429 ret = btrfs_add_link(trans, new_dir, old_inode, 7430 new_dentry->d_name.name, 7431 new_dentry->d_name.len, 0, index); 7432 if (ret) { 7433 btrfs_abort_transaction(trans, root, ret); 7434 goto out_fail; 7435 } 7436 7437 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { 7438 struct dentry *parent = new_dentry->d_parent; 7439 btrfs_log_new_name(trans, old_inode, old_dir, parent); 7440 btrfs_end_log_trans(root); 7441 } 7442 out_fail: 7443 btrfs_end_transaction(trans, root); 7444 out_notrans: 7445 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 7446 up_read(&root->fs_info->subvol_sem); 7447 7448 return ret; 7449 } 7450 7451 /* 7452 * some fairly slow code that needs optimization. This walks the list 7453 * of all the inodes with pending delalloc and forces them to disk. 
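 *
 * (Editor's note, not in the original: the loop below drops
 * delalloc_lock around each filemap_flush() and re-takes it, pinning
 * the inode with igrab() so it cannot be evicted mid-flush; delay_iput
 * lets callers that should not do a final iput() here, e.g. while
 * holding transaction state, defer it via btrfs_add_delayed_iput().)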
7454 */ 7455 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 7456 { 7457 struct list_head *head = &root->fs_info->delalloc_inodes; 7458 struct btrfs_inode *binode; 7459 struct inode *inode; 7460 7461 if (root->fs_info->sb->s_flags & MS_RDONLY) 7462 return -EROFS; 7463 7464 spin_lock(&root->fs_info->delalloc_lock); 7465 while (!list_empty(head)) { 7466 binode = list_entry(head->next, struct btrfs_inode, 7467 delalloc_inodes); 7468 inode = igrab(&binode->vfs_inode); 7469 if (!inode) 7470 list_del_init(&binode->delalloc_inodes); 7471 spin_unlock(&root->fs_info->delalloc_lock); 7472 if (inode) { 7473 filemap_flush(inode->i_mapping); 7474 if (delay_iput) 7475 btrfs_add_delayed_iput(inode); 7476 else 7477 iput(inode); 7478 } 7479 cond_resched(); 7480 spin_lock(&root->fs_info->delalloc_lock); 7481 } 7482 spin_unlock(&root->fs_info->delalloc_lock); 7483 7484 /* the filemap_flush will queue IO into the worker threads, but 7485 * we have to make sure the IO is actually started and that 7486 * ordered extents get created before we return 7487 */ 7488 atomic_inc(&root->fs_info->async_submit_draining); 7489 while (atomic_read(&root->fs_info->nr_async_submits) || 7490 atomic_read(&root->fs_info->async_delalloc_pages)) { 7491 wait_event(root->fs_info->async_submit_wait, 7492 (atomic_read(&root->fs_info->nr_async_submits) == 0 && 7493 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 7494 } 7495 atomic_dec(&root->fs_info->async_submit_draining); 7496 return 0; 7497 } 7498 7499 static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 7500 const char *symname) 7501 { 7502 struct btrfs_trans_handle *trans; 7503 struct btrfs_root *root = BTRFS_I(dir)->root; 7504 struct btrfs_path *path; 7505 struct btrfs_key key; 7506 struct inode *inode = NULL; 7507 int err; 7508 int drop_inode = 0; 7509 u64 objectid; 7510 u64 index = 0 ; 7511 int name_len; 7512 int datasize; 7513 unsigned long ptr; 7514 struct btrfs_file_extent_item *ei; 7515 struct extent_buffer *leaf; 7516 unsigned long nr = 0; 7517 7518 name_len = strlen(symname) + 1; 7519 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 7520 return -ENAMETOOLONG; 7521 7522 /* 7523 * 2 items for inode item and ref 7524 * 2 items for dir items 7525 * 1 item for xattr if selinux is on 7526 */ 7527 trans = btrfs_start_transaction(root, 5); 7528 if (IS_ERR(trans)) 7529 return PTR_ERR(trans); 7530 7531 err = btrfs_find_free_ino(root, &objectid); 7532 if (err) 7533 goto out_unlock; 7534 7535 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 7536 dentry->d_name.len, btrfs_ino(dir), objectid, 7537 S_IFLNK|S_IRWXUGO, &index); 7538 if (IS_ERR(inode)) { 7539 err = PTR_ERR(inode); 7540 goto out_unlock; 7541 } 7542 7543 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 7544 if (err) { 7545 drop_inode = 1; 7546 goto out_unlock; 7547 } 7548 7549 /* 7550 * If the active LSM wants to access the inode during 7551 * d_instantiate it needs these. Smack checks to see 7552 * if the filesystem supports xattrs by looking at the 7553 * ops vector. 
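 *
 * (Editor's note, not in the original: that is why i_op/i_fop are set to
 * the regular-file operations before btrfs_add_nondir() instantiates the
 * dentry, and only re-pointed at the symlink operations once the inline
 * name has been written out below.)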
7554 */ 7555 inode->i_fop = &btrfs_file_operations; 7556 inode->i_op = &btrfs_file_inode_operations; 7557 7558 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 7559 if (err) 7560 drop_inode = 1; 7561 else { 7562 inode->i_mapping->a_ops = &btrfs_aops; 7563 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 7564 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 7565 } 7566 if (drop_inode) 7567 goto out_unlock; 7568 7569 path = btrfs_alloc_path(); 7570 if (!path) { 7571 err = -ENOMEM; 7572 drop_inode = 1; 7573 goto out_unlock; 7574 } 7575 key.objectid = btrfs_ino(inode); 7576 key.offset = 0; 7577 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 7578 datasize = btrfs_file_extent_calc_inline_size(name_len); 7579 err = btrfs_insert_empty_item(trans, root, path, &key, 7580 datasize); 7581 if (err) { 7582 drop_inode = 1; 7583 btrfs_free_path(path); 7584 goto out_unlock; 7585 } 7586 leaf = path->nodes[0]; 7587 ei = btrfs_item_ptr(leaf, path->slots[0], 7588 struct btrfs_file_extent_item); 7589 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 7590 btrfs_set_file_extent_type(leaf, ei, 7591 BTRFS_FILE_EXTENT_INLINE); 7592 btrfs_set_file_extent_encryption(leaf, ei, 0); 7593 btrfs_set_file_extent_compression(leaf, ei, 0); 7594 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 7595 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); 7596 7597 ptr = btrfs_file_extent_inline_start(ei); 7598 write_extent_buffer(leaf, symname, ptr, name_len); 7599 btrfs_mark_buffer_dirty(leaf); 7600 btrfs_free_path(path); 7601 7602 inode->i_op = &btrfs_symlink_inode_operations; 7603 inode->i_mapping->a_ops = &btrfs_symlink_aops; 7604 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 7605 inode_set_bytes(inode, name_len); 7606 btrfs_i_size_write(inode, name_len - 1); 7607 err = btrfs_update_inode(trans, root, inode); 7608 if (err) 7609 drop_inode = 1; 7610 7611 out_unlock: 7612 if (!err) 7613 d_instantiate(dentry, inode); 7614 nr = trans->blocks_used; 7615 btrfs_end_transaction(trans, root); 7616 if (drop_inode) { 7617 inode_dec_link_count(inode); 7618 iput(inode); 7619 } 7620 btrfs_btree_balance_dirty(root, nr); 7621 return err; 7622 } 7623 7624 static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 7625 u64 start, u64 num_bytes, u64 min_size, 7626 loff_t actual_len, u64 *alloc_hint, 7627 struct btrfs_trans_handle *trans) 7628 { 7629 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 7630 struct extent_map *em; 7631 struct btrfs_root *root = BTRFS_I(inode)->root; 7632 struct btrfs_key ins; 7633 u64 cur_offset = start; 7634 u64 i_size; 7635 int ret = 0; 7636 bool own_trans = true; 7637 7638 if (trans) 7639 own_trans = false; 7640 while (num_bytes > 0) { 7641 if (own_trans) { 7642 trans = btrfs_start_transaction(root, 3); 7643 if (IS_ERR(trans)) { 7644 ret = PTR_ERR(trans); 7645 break; 7646 } 7647 } 7648 7649 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, 7650 0, *alloc_hint, &ins, 1); 7651 if (ret) { 7652 if (own_trans) 7653 btrfs_end_transaction(trans, root); 7654 break; 7655 } 7656 7657 ret = insert_reserved_file_extent(trans, inode, 7658 cur_offset, ins.objectid, 7659 ins.offset, ins.offset, 7660 ins.offset, 0, 0, 0, 7661 BTRFS_FILE_EXTENT_PREALLOC); 7662 if (ret) { 7663 btrfs_abort_transaction(trans, root, ret); 7664 if (own_trans) 7665 btrfs_end_transaction(trans, root); 7666 break; 7667 } 7668 btrfs_drop_extent_cache(inode, cur_offset, 7669 cur_offset + ins.offset -1, 0); 7670 7671 em = alloc_extent_map(); 7672 if (!em) { 7673 
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 7674 &BTRFS_I(inode)->runtime_flags); 7675 goto next; 7676 } 7677 7678 em->start = cur_offset; 7679 em->orig_start = cur_offset; 7680 em->len = ins.offset; 7681 em->block_start = ins.objectid; 7682 em->block_len = ins.offset; 7683 em->bdev = root->fs_info->fs_devices->latest_bdev; 7684 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 7685 em->generation = trans->transid; 7686 7687 while (1) { 7688 write_lock(&em_tree->lock); 7689 ret = add_extent_mapping(em_tree, em); 7690 if (!ret) 7691 list_move(&em->list, 7692 &em_tree->modified_extents); 7693 write_unlock(&em_tree->lock); 7694 if (ret != -EEXIST) 7695 break; 7696 btrfs_drop_extent_cache(inode, cur_offset, 7697 cur_offset + ins.offset - 1, 7698 0); 7699 } 7700 free_extent_map(em); 7701 next: 7702 num_bytes -= ins.offset; 7703 cur_offset += ins.offset; 7704 *alloc_hint = ins.objectid + ins.offset; 7705 7706 inode_inc_iversion(inode); 7707 inode->i_ctime = CURRENT_TIME; 7708 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 7709 if (!(mode & FALLOC_FL_KEEP_SIZE) && 7710 (actual_len > inode->i_size) && 7711 (cur_offset > inode->i_size)) { 7712 if (cur_offset > actual_len) 7713 i_size = actual_len; 7714 else 7715 i_size = cur_offset; 7716 i_size_write(inode, i_size); 7717 btrfs_ordered_update_i_size(inode, i_size, NULL); 7718 } 7719 7720 ret = btrfs_update_inode(trans, root, inode); 7721 7722 if (ret) { 7723 btrfs_abort_transaction(trans, root, ret); 7724 if (own_trans) 7725 btrfs_end_transaction(trans, root); 7726 break; 7727 } 7728 7729 if (own_trans) 7730 btrfs_end_transaction(trans, root); 7731 } 7732 return ret; 7733 } 7734 7735 int btrfs_prealloc_file_range(struct inode *inode, int mode, 7736 u64 start, u64 num_bytes, u64 min_size, 7737 loff_t actual_len, u64 *alloc_hint) 7738 { 7739 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 7740 min_size, actual_len, alloc_hint, 7741 NULL); 7742 } 7743 7744 int btrfs_prealloc_file_range_trans(struct inode *inode, 7745 struct btrfs_trans_handle *trans, int mode, 7746 u64 start, u64 num_bytes, u64 min_size, 7747 loff_t actual_len, u64 *alloc_hint) 7748 { 7749 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 7750 min_size, actual_len, alloc_hint, trans); 7751 } 7752 7753 static int btrfs_set_page_dirty(struct page *page) 7754 { 7755 return __set_page_dirty_nobuffers(page); 7756 } 7757 7758 static int btrfs_permission(struct inode *inode, int mask) 7759 { 7760 struct btrfs_root *root = BTRFS_I(inode)->root; 7761 umode_t mode = inode->i_mode; 7762 7763 if (mask & MAY_WRITE && 7764 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { 7765 if (btrfs_root_readonly(root)) 7766 return -EROFS; 7767 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) 7768 return -EACCES; 7769 } 7770 return generic_permission(inode, mask); 7771 } 7772 7773 static const struct inode_operations btrfs_dir_inode_operations = { 7774 .getattr = btrfs_getattr, 7775 .lookup = btrfs_lookup, 7776 .create = btrfs_create, 7777 .unlink = btrfs_unlink, 7778 .link = btrfs_link, 7779 .mkdir = btrfs_mkdir, 7780 .rmdir = btrfs_rmdir, 7781 .rename = btrfs_rename, 7782 .symlink = btrfs_symlink, 7783 .setattr = btrfs_setattr, 7784 .mknod = btrfs_mknod, 7785 .setxattr = btrfs_setxattr, 7786 .getxattr = btrfs_getxattr, 7787 .listxattr = btrfs_listxattr, 7788 .removexattr = btrfs_removexattr, 7789 .permission = btrfs_permission, 7790 .get_acl = btrfs_get_acl, 7791 }; 7792 static const struct inode_operations btrfs_dir_ro_inode_operations = { 7793 .lookup = btrfs_lookup, 7794 .permission 

static int btrfs_set_page_dirty(struct page *page)
{
	return __set_page_dirty_nobuffers(page);
}

static int btrfs_permission(struct inode *inode, int mask)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	umode_t mode = inode->i_mode;

	if (mask & MAY_WRITE &&
	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
		if (btrfs_root_readonly(root))
			return -EROFS;
		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
			return -EACCES;
	}
	return generic_permission(inode, mask);
}

static const struct inode_operations btrfs_dir_inode_operations = {
	.getattr	= btrfs_getattr,
	.lookup		= btrfs_lookup,
	.create		= btrfs_create,
	.unlink		= btrfs_unlink,
	.link		= btrfs_link,
	.mkdir		= btrfs_mkdir,
	.rmdir		= btrfs_rmdir,
	.rename		= btrfs_rename,
	.symlink	= btrfs_symlink,
	.setattr	= btrfs_setattr,
	.mknod		= btrfs_mknod,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
	.permission	= btrfs_permission,
	.get_acl	= btrfs_get_acl,
};
static const struct inode_operations btrfs_dir_ro_inode_operations = {
	.lookup		= btrfs_lookup,
	.permission	= btrfs_permission,
	.get_acl	= btrfs_get_acl,
};

static const struct file_operations btrfs_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.readdir	= btrfs_real_readdir,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_ioctl,
#endif
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
};

static struct extent_io_ops btrfs_extent_io_ops = {
	.fill_delalloc = run_delalloc_range,
	.submit_bio_hook = btrfs_submit_bio_hook,
	.merge_bio_hook = btrfs_merge_bio_hook,
	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
	.writepage_start_hook = btrfs_writepage_start_hook,
	.set_bit_hook = btrfs_set_bit_hook,
	.clear_bit_hook = btrfs_clear_bit_hook,
	.merge_extent_hook = btrfs_merge_extent_hook,
	.split_extent_hook = btrfs_split_extent_hook,
};

/*
 * btrfs doesn't support the bmap operation because swapfiles
 * use bmap to make a mapping of extents in the file.  They assume
 * these extents won't change over the life of the file and they
 * use the bmap result to do IO directly to the drive.
 *
 * the btrfs bmap call would return logical addresses that aren't
 * suitable for IO and they also will change frequently as COW
 * operations happen.  So, swapfile + btrfs == corruption.
 *
 * For now we're avoiding this by dropping bmap.
 */
static const struct address_space_operations btrfs_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.writepages	= btrfs_writepages,
	.readpages	= btrfs_readpages,
	.direct_IO	= btrfs_direct_IO,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
	.set_page_dirty	= btrfs_set_page_dirty,
	.error_remove_page = generic_error_remove_page,
};

static const struct address_space_operations btrfs_symlink_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
};

static const struct inode_operations btrfs_file_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
	.permission	= btrfs_permission,
	.fiemap		= btrfs_fiemap,
	.get_acl	= btrfs_get_acl,
	.update_time	= btrfs_update_time,
};
static const struct inode_operations btrfs_special_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
	.get_acl	= btrfs_get_acl,
	.update_time	= btrfs_update_time,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
	.readlink	= generic_readlink,
	.follow_link	= page_follow_link_light,
	.put_link	= page_put_link,
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
	.get_acl	= btrfs_get_acl,
	.update_time	= btrfs_update_time,
};
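
/*
 * For orientation (illustrative, condensed): btrfs_read_locked_inode()
 * and the create paths earlier in this file pick between the tables
 * above based on the inode type, roughly:
 *
 *	switch (inode->i_mode & S_IFMT) {
 *	case S_IFREG:
 *		inode->i_op = &btrfs_file_inode_operations;
 *		inode->i_mapping->a_ops = &btrfs_aops;
 *		break;
 *	case S_IFDIR:
 *		inode->i_op = &btrfs_dir_inode_operations;
 *		inode->i_fop = &btrfs_dir_file_operations;
 *		break;
 *	case S_IFLNK:
 *		inode->i_op = &btrfs_symlink_inode_operations;
 *		inode->i_mapping->a_ops = &btrfs_symlink_aops;
 *		break;
 *	default:
 *		inode->i_op = &btrfs_special_inode_operations;
 *		init_special_inode(inode, inode->i_mode, rdev);
 *		break;
 *	}
 */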

const struct dentry_operations btrfs_dentry_operations = {
	.d_delete	= btrfs_dentry_delete,
	.d_release	= btrfs_dentry_release,
};
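
/*
 * Illustrative note (not part of the original file): these are
 * installed as the filesystem-wide default dentry operations from
 * super.c, roughly:
 *
 *	sb->s_d_op = &btrfs_dentry_operations;
 *
 * so btrfs_dentry_delete() can tell the dcache to drop dentries that
 * point into deleted subvolumes instead of keeping them cached.
 */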