1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/kernel.h> 20 #include <linux/bio.h> 21 #include <linux/buffer_head.h> 22 #include <linux/file.h> 23 #include <linux/fs.h> 24 #include <linux/pagemap.h> 25 #include <linux/highmem.h> 26 #include <linux/time.h> 27 #include <linux/init.h> 28 #include <linux/string.h> 29 #include <linux/backing-dev.h> 30 #include <linux/mpage.h> 31 #include <linux/swap.h> 32 #include <linux/writeback.h> 33 #include <linux/statfs.h> 34 #include <linux/compat.h> 35 #include <linux/bit_spinlock.h> 36 #include <linux/xattr.h> 37 #include <linux/posix_acl.h> 38 #include <linux/falloc.h> 39 #include <linux/slab.h> 40 #include <linux/ratelimit.h> 41 #include <linux/mount.h> 42 #include "compat.h" 43 #include "ctree.h" 44 #include "disk-io.h" 45 #include "transaction.h" 46 #include "btrfs_inode.h" 47 #include "ioctl.h" 48 #include "print-tree.h" 49 #include "ordered-data.h" 50 #include "xattr.h" 51 #include "tree-log.h" 52 #include "volumes.h" 53 #include "compression.h" 54 #include "locking.h" 55 #include "free-space-cache.h" 56 #include "inode-map.h" 57 58 struct btrfs_iget_args { 59 u64 ino; 60 struct btrfs_root *root; 61 }; 62 63 static const struct inode_operations btrfs_dir_inode_operations; 64 static const struct inode_operations btrfs_symlink_inode_operations; 65 static const struct inode_operations btrfs_dir_ro_inode_operations; 66 static const struct inode_operations btrfs_special_inode_operations; 67 static const struct inode_operations btrfs_file_inode_operations; 68 static const struct address_space_operations btrfs_aops; 69 static const struct address_space_operations btrfs_symlink_aops; 70 static const struct file_operations btrfs_dir_file_operations; 71 static struct extent_io_ops btrfs_extent_io_ops; 72 73 static struct kmem_cache *btrfs_inode_cachep; 74 struct kmem_cache *btrfs_trans_handle_cachep; 75 struct kmem_cache *btrfs_transaction_cachep; 76 struct kmem_cache *btrfs_path_cachep; 77 struct kmem_cache *btrfs_free_space_cachep; 78 79 #define S_SHIFT 12 80 static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { 81 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, 82 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, 83 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, 84 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, 85 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, 86 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, 87 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, 88 }; 89 90 static int btrfs_setsize(struct inode *inode, loff_t newsize); 91 static int btrfs_truncate(struct inode *inode); 92 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); 93 static noinline int cow_file_range(struct inode *inode, 94 struct page *locked_page, 95 u64 start, u64 end, int *page_started, 96 unsigned long *nr_written, int unlock); 97 static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 98 struct btrfs_root *root, struct inode *inode); 99 100 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 101 struct inode *inode, struct inode *dir, 102 const struct qstr *qstr) 103 { 104 int err; 105 106 err = btrfs_init_acl(trans, inode, dir); 107 if (!err) 108 err = btrfs_xattr_security_init(trans, inode, dir, qstr); 109 return err; 110 } 111 112 /* 113 * this does all the hard work for inserting an inline extent into 114 * the btree. The caller should have done a btrfs_drop_extents so that 115 * no overlapping inline items exist in the btree 116 */ 117 static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, 118 struct btrfs_root *root, struct inode *inode, 119 u64 start, size_t size, size_t compressed_size, 120 int compress_type, 121 struct page **compressed_pages) 122 { 123 struct btrfs_key key; 124 struct btrfs_path *path; 125 struct extent_buffer *leaf; 126 struct page *page = NULL; 127 char *kaddr; 128 unsigned long ptr; 129 struct btrfs_file_extent_item *ei; 130 int err = 0; 131 int ret; 132 size_t cur_size = size; 133 size_t datasize; 134 unsigned long offset; 135 136 if (compressed_size && compressed_pages) 137 cur_size = compressed_size; 138 139 path = btrfs_alloc_path(); 140 if (!path) 141 return -ENOMEM; 142 143 path->leave_spinning = 1; 144 145 key.objectid = btrfs_ino(inode); 146 key.offset = start; 147 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 148 datasize = btrfs_file_extent_calc_inline_size(cur_size); 149 150 inode_add_bytes(inode, size); 151 ret = btrfs_insert_empty_item(trans, root, path, &key, 152 datasize); 153 if (ret) { 154 err = ret; 155 goto fail; 156 } 157 leaf = path->nodes[0]; 158 ei = btrfs_item_ptr(leaf, path->slots[0], 159 struct btrfs_file_extent_item); 160 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 161 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); 162 btrfs_set_file_extent_encryption(leaf, ei, 0); 163 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 164 btrfs_set_file_extent_ram_bytes(leaf, ei, size); 165 ptr = btrfs_file_extent_inline_start(ei); 166 167 if (compress_type != BTRFS_COMPRESS_NONE) { 168 struct page *cpage; 169 int i = 0; 170 while (compressed_size > 0) { 171 cpage = compressed_pages[i]; 172 cur_size = min_t(unsigned long, compressed_size, 173 PAGE_CACHE_SIZE); 174 175 kaddr = kmap_atomic(cpage); 176 write_extent_buffer(leaf, kaddr, ptr, cur_size); 177 kunmap_atomic(kaddr); 178 179 i++; 180 ptr += cur_size; 181 compressed_size -= cur_size; 182 } 183 btrfs_set_file_extent_compression(leaf, ei, 184 compress_type); 185 } else { 186 page = find_get_page(inode->i_mapping, 187 start >> PAGE_CACHE_SHIFT); 188 btrfs_set_file_extent_compression(leaf, ei, 0); 189 kaddr = kmap_atomic(page); 190 offset = start & (PAGE_CACHE_SIZE - 1); 191 write_extent_buffer(leaf, kaddr + offset, ptr, size); 192 kunmap_atomic(kaddr); 193 page_cache_release(page); 194 } 195 btrfs_mark_buffer_dirty(leaf); 196 btrfs_free_path(path); 197 198 /* 199 * we're an inline extent, so nobody can 200 * extend the file past i_size without locking 201 * a page we already have locked. 202 * 203 * We must do any isize and inode updates 204 * before we unlock the pages. Otherwise we 205 * could end up racing with unlink. 206 */ 207 BTRFS_I(inode)->disk_i_size = inode->i_size; 208 ret = btrfs_update_inode(trans, root, inode); 209 210 return ret; 211 fail: 212 btrfs_free_path(path); 213 return err; 214 } 215 216 217 /* 218 * conditionally insert an inline extent into the file. This 219 * does the checks required to make sure the data is small enough 220 * to fit as an inline extent. 221 */ 222 static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, 223 struct btrfs_root *root, 224 struct inode *inode, u64 start, u64 end, 225 size_t compressed_size, int compress_type, 226 struct page **compressed_pages) 227 { 228 u64 isize = i_size_read(inode); 229 u64 actual_end = min(end + 1, isize); 230 u64 inline_len = actual_end - start; 231 u64 aligned_end = (end + root->sectorsize - 1) & 232 ~((u64)root->sectorsize - 1); 233 u64 hint_byte; 234 u64 data_len = inline_len; 235 int ret; 236 237 if (compressed_size) 238 data_len = compressed_size; 239 240 if (start > 0 || 241 actual_end >= PAGE_CACHE_SIZE || 242 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || 243 (!compressed_size && 244 (actual_end & (root->sectorsize - 1)) == 0) || 245 end + 1 < isize || 246 data_len > root->fs_info->max_inline) { 247 return 1; 248 } 249 250 ret = btrfs_drop_extents(trans, inode, start, aligned_end, 251 &hint_byte, 1); 252 if (ret) 253 return ret; 254 255 if (isize > actual_end) 256 inline_len = min_t(u64, isize, actual_end); 257 ret = insert_inline_extent(trans, root, inode, start, 258 inline_len, compressed_size, 259 compress_type, compressed_pages); 260 if (ret && ret != -ENOSPC) { 261 btrfs_abort_transaction(trans, root, ret); 262 return ret; 263 } else if (ret == -ENOSPC) { 264 return 1; 265 } 266 267 btrfs_delalloc_release_metadata(inode, end + 1 - start); 268 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 269 return 0; 270 } 271 272 struct async_extent { 273 u64 start; 274 u64 ram_size; 275 u64 compressed_size; 276 struct page **pages; 277 unsigned long nr_pages; 278 int compress_type; 279 struct list_head list; 280 }; 281 282 struct async_cow { 283 struct inode *inode; 284 struct btrfs_root *root; 285 struct page *locked_page; 286 u64 start; 287 u64 end; 288 struct list_head extents; 289 struct btrfs_work work; 290 }; 291 292 static noinline int add_async_extent(struct async_cow *cow, 293 u64 start, u64 ram_size, 294 u64 compressed_size, 295 struct page **pages, 296 unsigned long nr_pages, 297 int compress_type) 298 { 299 struct async_extent *async_extent; 300 301 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); 302 BUG_ON(!async_extent); /* -ENOMEM */ 303 async_extent->start = start; 304 async_extent->ram_size = ram_size; 305 async_extent->compressed_size = compressed_size; 306 async_extent->pages = pages; 307 async_extent->nr_pages = nr_pages; 308 async_extent->compress_type = compress_type; 309 list_add_tail(&async_extent->list, &cow->extents); 310 return 0; 311 } 312 313 /* 314 * we create compressed extents in two phases. The first 315 * phase compresses a range of pages that have already been 316 * locked (both pages and state bits are locked). 317 * 318 * This is done inside an ordered work queue, and the compression 319 * is spread across many cpus. The actual IO submission is step 320 * two, and the ordered work queue takes care of making sure that 321 * happens in the same order things were put onto the queue by 322 * writepages and friends. 323 * 324 * If this code finds it can't get good compression, it puts an 325 * entry onto the work queue to write the uncompressed bytes. This 326 * makes sure that both compressed inodes and uncompressed inodes 327 * are written in the same order that the flusher thread sent them 328 * down. 329 */ 330 static noinline int compress_file_range(struct inode *inode, 331 struct page *locked_page, 332 u64 start, u64 end, 333 struct async_cow *async_cow, 334 int *num_added) 335 { 336 struct btrfs_root *root = BTRFS_I(inode)->root; 337 struct btrfs_trans_handle *trans; 338 u64 num_bytes; 339 u64 blocksize = root->sectorsize; 340 u64 actual_end; 341 u64 isize = i_size_read(inode); 342 int ret = 0; 343 struct page **pages = NULL; 344 unsigned long nr_pages; 345 unsigned long nr_pages_ret = 0; 346 unsigned long total_compressed = 0; 347 unsigned long total_in = 0; 348 unsigned long max_compressed = 128 * 1024; 349 unsigned long max_uncompressed = 128 * 1024; 350 int i; 351 int will_compress; 352 int compress_type = root->fs_info->compress_type; 353 354 /* if this is a small write inside eof, kick off a defrag */ 355 if ((end - start + 1) < 16 * 1024 && 356 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) 357 btrfs_add_inode_defrag(NULL, inode); 358 359 actual_end = min_t(u64, isize, end + 1); 360 again: 361 will_compress = 0; 362 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; 363 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); 364 365 /* 366 * we don't want to send crud past the end of i_size through 367 * compression, that's just a waste of CPU time. So, if the 368 * end of the file is before the start of our current 369 * requested range of bytes, we bail out to the uncompressed 370 * cleanup code that can deal with all of this. 371 * 372 * It isn't really the fastest way to fix things, but this is a 373 * very uncommon corner. 374 */ 375 if (actual_end <= start) 376 goto cleanup_and_bail_uncompressed; 377 378 total_compressed = actual_end - start; 379 380 /* we want to make sure that amount of ram required to uncompress 381 * an extent is reasonable, so we limit the total size in ram 382 * of a compressed extent to 128k. This is a crucial number 383 * because it also controls how easily we can spread reads across 384 * cpus for decompression. 385 * 386 * We also want to make sure the amount of IO required to do 387 * a random read is reasonably small, so we limit the size of 388 * a compressed extent to 128k. 389 */ 390 total_compressed = min(total_compressed, max_uncompressed); 391 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 392 num_bytes = max(blocksize, num_bytes); 393 total_in = 0; 394 ret = 0; 395 396 /* 397 * we do compression for mount -o compress and when the 398 * inode has not been flagged as nocompress. This flag can 399 * change at any time if we discover bad compression ratios. 400 */ 401 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 402 (btrfs_test_opt(root, COMPRESS) || 403 (BTRFS_I(inode)->force_compress) || 404 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { 405 WARN_ON(pages); 406 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 407 if (!pages) { 408 /* just bail out to the uncompressed code */ 409 goto cont; 410 } 411 412 if (BTRFS_I(inode)->force_compress) 413 compress_type = BTRFS_I(inode)->force_compress; 414 415 ret = btrfs_compress_pages(compress_type, 416 inode->i_mapping, start, 417 total_compressed, pages, 418 nr_pages, &nr_pages_ret, 419 &total_in, 420 &total_compressed, 421 max_compressed); 422 423 if (!ret) { 424 unsigned long offset = total_compressed & 425 (PAGE_CACHE_SIZE - 1); 426 struct page *page = pages[nr_pages_ret - 1]; 427 char *kaddr; 428 429 /* zero the tail end of the last page, we might be 430 * sending it down to disk 431 */ 432 if (offset) { 433 kaddr = kmap_atomic(page); 434 memset(kaddr + offset, 0, 435 PAGE_CACHE_SIZE - offset); 436 kunmap_atomic(kaddr); 437 } 438 will_compress = 1; 439 } 440 } 441 cont: 442 if (start == 0) { 443 trans = btrfs_join_transaction(root); 444 if (IS_ERR(trans)) { 445 ret = PTR_ERR(trans); 446 trans = NULL; 447 goto cleanup_and_out; 448 } 449 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 450 451 /* lets try to make an inline extent */ 452 if (ret || total_in < (actual_end - start)) { 453 /* we didn't compress the entire range, try 454 * to make an uncompressed inline extent. 455 */ 456 ret = cow_file_range_inline(trans, root, inode, 457 start, end, 0, 0, NULL); 458 } else { 459 /* try making a compressed inline extent */ 460 ret = cow_file_range_inline(trans, root, inode, 461 start, end, 462 total_compressed, 463 compress_type, pages); 464 } 465 if (ret <= 0) { 466 /* 467 * inline extent creation worked or returned error, 468 * we don't need to create any more async work items. 469 * Unlock and free up our temp pages. 470 */ 471 extent_clear_unlock_delalloc(inode, 472 &BTRFS_I(inode)->io_tree, 473 start, end, NULL, 474 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 475 EXTENT_CLEAR_DELALLOC | 476 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); 477 478 btrfs_end_transaction(trans, root); 479 goto free_pages_out; 480 } 481 btrfs_end_transaction(trans, root); 482 } 483 484 if (will_compress) { 485 /* 486 * we aren't doing an inline extent round the compressed size 487 * up to a block size boundary so the allocator does sane 488 * things 489 */ 490 total_compressed = (total_compressed + blocksize - 1) & 491 ~(blocksize - 1); 492 493 /* 494 * one last check to make sure the compression is really a 495 * win, compare the page count read with the blocks on disk 496 */ 497 total_in = (total_in + PAGE_CACHE_SIZE - 1) & 498 ~(PAGE_CACHE_SIZE - 1); 499 if (total_compressed >= total_in) { 500 will_compress = 0; 501 } else { 502 num_bytes = total_in; 503 } 504 } 505 if (!will_compress && pages) { 506 /* 507 * the compression code ran but failed to make things smaller, 508 * free any pages it allocated and our page pointer array 509 */ 510 for (i = 0; i < nr_pages_ret; i++) { 511 WARN_ON(pages[i]->mapping); 512 page_cache_release(pages[i]); 513 } 514 kfree(pages); 515 pages = NULL; 516 total_compressed = 0; 517 nr_pages_ret = 0; 518 519 /* flag the file so we don't compress in the future */ 520 if (!btrfs_test_opt(root, FORCE_COMPRESS) && 521 !(BTRFS_I(inode)->force_compress)) { 522 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 523 } 524 } 525 if (will_compress) { 526 *num_added += 1; 527 528 /* the async work queues will take care of doing actual 529 * allocation on disk for these compressed pages, 530 * and will submit them to the elevator. 531 */ 532 add_async_extent(async_cow, start, num_bytes, 533 total_compressed, pages, nr_pages_ret, 534 compress_type); 535 536 if (start + num_bytes < end) { 537 start += num_bytes; 538 pages = NULL; 539 cond_resched(); 540 goto again; 541 } 542 } else { 543 cleanup_and_bail_uncompressed: 544 /* 545 * No compression, but we still need to write the pages in 546 * the file we've been given so far. redirty the locked 547 * page if it corresponds to our extent and set things up 548 * for the async work queue to run cow_file_range to do 549 * the normal delalloc dance 550 */ 551 if (page_offset(locked_page) >= start && 552 page_offset(locked_page) <= end) { 553 __set_page_dirty_nobuffers(locked_page); 554 /* unlocked later on in the async handlers */ 555 } 556 add_async_extent(async_cow, start, end - start + 1, 557 0, NULL, 0, BTRFS_COMPRESS_NONE); 558 *num_added += 1; 559 } 560 561 out: 562 return ret; 563 564 free_pages_out: 565 for (i = 0; i < nr_pages_ret; i++) { 566 WARN_ON(pages[i]->mapping); 567 page_cache_release(pages[i]); 568 } 569 kfree(pages); 570 571 goto out; 572 573 cleanup_and_out: 574 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 575 start, end, NULL, 576 EXTENT_CLEAR_UNLOCK_PAGE | 577 EXTENT_CLEAR_DIRTY | 578 EXTENT_CLEAR_DELALLOC | 579 EXTENT_SET_WRITEBACK | 580 EXTENT_END_WRITEBACK); 581 if (!trans || IS_ERR(trans)) 582 btrfs_error(root->fs_info, ret, "Failed to join transaction"); 583 else 584 btrfs_abort_transaction(trans, root, ret); 585 goto free_pages_out; 586 } 587 588 /* 589 * phase two of compressed writeback. This is the ordered portion 590 * of the code, which only gets called in the order the work was 591 * queued. We walk all the async extents created by compress_file_range 592 * and send them down to the disk. 593 */ 594 static noinline int submit_compressed_extents(struct inode *inode, 595 struct async_cow *async_cow) 596 { 597 struct async_extent *async_extent; 598 u64 alloc_hint = 0; 599 struct btrfs_trans_handle *trans; 600 struct btrfs_key ins; 601 struct extent_map *em; 602 struct btrfs_root *root = BTRFS_I(inode)->root; 603 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 604 struct extent_io_tree *io_tree; 605 int ret = 0; 606 607 if (list_empty(&async_cow->extents)) 608 return 0; 609 610 611 while (!list_empty(&async_cow->extents)) { 612 async_extent = list_entry(async_cow->extents.next, 613 struct async_extent, list); 614 list_del(&async_extent->list); 615 616 io_tree = &BTRFS_I(inode)->io_tree; 617 618 retry: 619 /* did the compression code fall back to uncompressed IO? */ 620 if (!async_extent->pages) { 621 int page_started = 0; 622 unsigned long nr_written = 0; 623 624 lock_extent(io_tree, async_extent->start, 625 async_extent->start + 626 async_extent->ram_size - 1); 627 628 /* allocate blocks */ 629 ret = cow_file_range(inode, async_cow->locked_page, 630 async_extent->start, 631 async_extent->start + 632 async_extent->ram_size - 1, 633 &page_started, &nr_written, 0); 634 635 /* JDM XXX */ 636 637 /* 638 * if page_started, cow_file_range inserted an 639 * inline extent and took care of all the unlocking 640 * and IO for us. Otherwise, we need to submit 641 * all those pages down to the drive. 642 */ 643 if (!page_started && !ret) 644 extent_write_locked_range(io_tree, 645 inode, async_extent->start, 646 async_extent->start + 647 async_extent->ram_size - 1, 648 btrfs_get_extent, 649 WB_SYNC_ALL); 650 kfree(async_extent); 651 cond_resched(); 652 continue; 653 } 654 655 lock_extent(io_tree, async_extent->start, 656 async_extent->start + async_extent->ram_size - 1); 657 658 trans = btrfs_join_transaction(root); 659 if (IS_ERR(trans)) { 660 ret = PTR_ERR(trans); 661 } else { 662 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 663 ret = btrfs_reserve_extent(trans, root, 664 async_extent->compressed_size, 665 async_extent->compressed_size, 666 0, alloc_hint, &ins, 1); 667 if (ret) 668 btrfs_abort_transaction(trans, root, ret); 669 btrfs_end_transaction(trans, root); 670 } 671 672 if (ret) { 673 int i; 674 for (i = 0; i < async_extent->nr_pages; i++) { 675 WARN_ON(async_extent->pages[i]->mapping); 676 page_cache_release(async_extent->pages[i]); 677 } 678 kfree(async_extent->pages); 679 async_extent->nr_pages = 0; 680 async_extent->pages = NULL; 681 unlock_extent(io_tree, async_extent->start, 682 async_extent->start + 683 async_extent->ram_size - 1); 684 if (ret == -ENOSPC) 685 goto retry; 686 goto out_free; /* JDM: Requeue? */ 687 } 688 689 /* 690 * here we're doing allocation and writeback of the 691 * compressed pages 692 */ 693 btrfs_drop_extent_cache(inode, async_extent->start, 694 async_extent->start + 695 async_extent->ram_size - 1, 0); 696 697 em = alloc_extent_map(); 698 BUG_ON(!em); /* -ENOMEM */ 699 em->start = async_extent->start; 700 em->len = async_extent->ram_size; 701 em->orig_start = em->start; 702 703 em->block_start = ins.objectid; 704 em->block_len = ins.offset; 705 em->bdev = root->fs_info->fs_devices->latest_bdev; 706 em->compress_type = async_extent->compress_type; 707 set_bit(EXTENT_FLAG_PINNED, &em->flags); 708 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 709 710 while (1) { 711 write_lock(&em_tree->lock); 712 ret = add_extent_mapping(em_tree, em); 713 write_unlock(&em_tree->lock); 714 if (ret != -EEXIST) { 715 free_extent_map(em); 716 break; 717 } 718 btrfs_drop_extent_cache(inode, async_extent->start, 719 async_extent->start + 720 async_extent->ram_size - 1, 0); 721 } 722 723 ret = btrfs_add_ordered_extent_compress(inode, 724 async_extent->start, 725 ins.objectid, 726 async_extent->ram_size, 727 ins.offset, 728 BTRFS_ORDERED_COMPRESSED, 729 async_extent->compress_type); 730 BUG_ON(ret); /* -ENOMEM */ 731 732 /* 733 * clear dirty, set writeback and unlock the pages. 734 */ 735 extent_clear_unlock_delalloc(inode, 736 &BTRFS_I(inode)->io_tree, 737 async_extent->start, 738 async_extent->start + 739 async_extent->ram_size - 1, 740 NULL, EXTENT_CLEAR_UNLOCK_PAGE | 741 EXTENT_CLEAR_UNLOCK | 742 EXTENT_CLEAR_DELALLOC | 743 EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); 744 745 ret = btrfs_submit_compressed_write(inode, 746 async_extent->start, 747 async_extent->ram_size, 748 ins.objectid, 749 ins.offset, async_extent->pages, 750 async_extent->nr_pages); 751 752 BUG_ON(ret); /* -ENOMEM */ 753 alloc_hint = ins.objectid + ins.offset; 754 kfree(async_extent); 755 cond_resched(); 756 } 757 ret = 0; 758 out: 759 return ret; 760 out_free: 761 kfree(async_extent); 762 goto out; 763 } 764 765 static u64 get_extent_allocation_hint(struct inode *inode, u64 start, 766 u64 num_bytes) 767 { 768 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 769 struct extent_map *em; 770 u64 alloc_hint = 0; 771 772 read_lock(&em_tree->lock); 773 em = search_extent_mapping(em_tree, start, num_bytes); 774 if (em) { 775 /* 776 * if block start isn't an actual block number then find the 777 * first block in this inode and use that as a hint. If that 778 * block is also bogus then just don't worry about it. 779 */ 780 if (em->block_start >= EXTENT_MAP_LAST_BYTE) { 781 free_extent_map(em); 782 em = search_extent_mapping(em_tree, 0, 0); 783 if (em && em->block_start < EXTENT_MAP_LAST_BYTE) 784 alloc_hint = em->block_start; 785 if (em) 786 free_extent_map(em); 787 } else { 788 alloc_hint = em->block_start; 789 free_extent_map(em); 790 } 791 } 792 read_unlock(&em_tree->lock); 793 794 return alloc_hint; 795 } 796 797 /* 798 * when extent_io.c finds a delayed allocation range in the file, 799 * the call backs end up in this code. The basic idea is to 800 * allocate extents on disk for the range, and create ordered data structs 801 * in ram to track those extents. 802 * 803 * locked_page is the page that writepage had locked already. We use 804 * it to make sure we don't do extra locks or unlocks. 805 * 806 * *page_started is set to one if we unlock locked_page and do everything 807 * required to start IO on it. It may be clean and already done with 808 * IO when we return. 809 */ 810 static noinline int cow_file_range(struct inode *inode, 811 struct page *locked_page, 812 u64 start, u64 end, int *page_started, 813 unsigned long *nr_written, 814 int unlock) 815 { 816 struct btrfs_root *root = BTRFS_I(inode)->root; 817 struct btrfs_trans_handle *trans; 818 u64 alloc_hint = 0; 819 u64 num_bytes; 820 unsigned long ram_size; 821 u64 disk_num_bytes; 822 u64 cur_alloc_size; 823 u64 blocksize = root->sectorsize; 824 struct btrfs_key ins; 825 struct extent_map *em; 826 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 827 int ret = 0; 828 829 BUG_ON(btrfs_is_free_space_inode(inode)); 830 trans = btrfs_join_transaction(root); 831 if (IS_ERR(trans)) { 832 extent_clear_unlock_delalloc(inode, 833 &BTRFS_I(inode)->io_tree, 834 start, end, locked_page, 835 EXTENT_CLEAR_UNLOCK_PAGE | 836 EXTENT_CLEAR_UNLOCK | 837 EXTENT_CLEAR_DELALLOC | 838 EXTENT_CLEAR_DIRTY | 839 EXTENT_SET_WRITEBACK | 840 EXTENT_END_WRITEBACK); 841 return PTR_ERR(trans); 842 } 843 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 844 845 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 846 num_bytes = max(blocksize, num_bytes); 847 disk_num_bytes = num_bytes; 848 ret = 0; 849 850 /* if this is a small write inside eof, kick off defrag */ 851 if (num_bytes < 64 * 1024 && 852 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) 853 btrfs_add_inode_defrag(trans, inode); 854 855 if (start == 0) { 856 /* lets try to make an inline extent */ 857 ret = cow_file_range_inline(trans, root, inode, 858 start, end, 0, 0, NULL); 859 if (ret == 0) { 860 extent_clear_unlock_delalloc(inode, 861 &BTRFS_I(inode)->io_tree, 862 start, end, NULL, 863 EXTENT_CLEAR_UNLOCK_PAGE | 864 EXTENT_CLEAR_UNLOCK | 865 EXTENT_CLEAR_DELALLOC | 866 EXTENT_CLEAR_DIRTY | 867 EXTENT_SET_WRITEBACK | 868 EXTENT_END_WRITEBACK); 869 870 *nr_written = *nr_written + 871 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; 872 *page_started = 1; 873 goto out; 874 } else if (ret < 0) { 875 btrfs_abort_transaction(trans, root, ret); 876 goto out_unlock; 877 } 878 } 879 880 BUG_ON(disk_num_bytes > 881 btrfs_super_total_bytes(root->fs_info->super_copy)); 882 883 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 884 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 885 886 while (disk_num_bytes > 0) { 887 unsigned long op; 888 889 cur_alloc_size = disk_num_bytes; 890 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 891 root->sectorsize, 0, alloc_hint, 892 &ins, 1); 893 if (ret < 0) { 894 btrfs_abort_transaction(trans, root, ret); 895 goto out_unlock; 896 } 897 898 em = alloc_extent_map(); 899 BUG_ON(!em); /* -ENOMEM */ 900 em->start = start; 901 em->orig_start = em->start; 902 ram_size = ins.offset; 903 em->len = ins.offset; 904 905 em->block_start = ins.objectid; 906 em->block_len = ins.offset; 907 em->bdev = root->fs_info->fs_devices->latest_bdev; 908 set_bit(EXTENT_FLAG_PINNED, &em->flags); 909 910 while (1) { 911 write_lock(&em_tree->lock); 912 ret = add_extent_mapping(em_tree, em); 913 write_unlock(&em_tree->lock); 914 if (ret != -EEXIST) { 915 free_extent_map(em); 916 break; 917 } 918 btrfs_drop_extent_cache(inode, start, 919 start + ram_size - 1, 0); 920 } 921 922 cur_alloc_size = ins.offset; 923 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 924 ram_size, cur_alloc_size, 0); 925 BUG_ON(ret); /* -ENOMEM */ 926 927 if (root->root_key.objectid == 928 BTRFS_DATA_RELOC_TREE_OBJECTID) { 929 ret = btrfs_reloc_clone_csums(inode, start, 930 cur_alloc_size); 931 if (ret) { 932 btrfs_abort_transaction(trans, root, ret); 933 goto out_unlock; 934 } 935 } 936 937 if (disk_num_bytes < cur_alloc_size) 938 break; 939 940 /* we're not doing compressed IO, don't unlock the first 941 * page (which the caller expects to stay locked), don't 942 * clear any dirty bits and don't set any writeback bits 943 * 944 * Do set the Private2 bit so we know this page was properly 945 * setup for writepage 946 */ 947 op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0; 948 op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | 949 EXTENT_SET_PRIVATE2; 950 951 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 952 start, start + ram_size - 1, 953 locked_page, op); 954 disk_num_bytes -= cur_alloc_size; 955 num_bytes -= cur_alloc_size; 956 alloc_hint = ins.objectid + ins.offset; 957 start += cur_alloc_size; 958 } 959 ret = 0; 960 out: 961 btrfs_end_transaction(trans, root); 962 963 return ret; 964 out_unlock: 965 extent_clear_unlock_delalloc(inode, 966 &BTRFS_I(inode)->io_tree, 967 start, end, locked_page, 968 EXTENT_CLEAR_UNLOCK_PAGE | 969 EXTENT_CLEAR_UNLOCK | 970 EXTENT_CLEAR_DELALLOC | 971 EXTENT_CLEAR_DIRTY | 972 EXTENT_SET_WRITEBACK | 973 EXTENT_END_WRITEBACK); 974 975 goto out; 976 } 977 978 /* 979 * work queue call back to started compression on a file and pages 980 */ 981 static noinline void async_cow_start(struct btrfs_work *work) 982 { 983 struct async_cow *async_cow; 984 int num_added = 0; 985 async_cow = container_of(work, struct async_cow, work); 986 987 compress_file_range(async_cow->inode, async_cow->locked_page, 988 async_cow->start, async_cow->end, async_cow, 989 &num_added); 990 if (num_added == 0) { 991 btrfs_add_delayed_iput(async_cow->inode); 992 async_cow->inode = NULL; 993 } 994 } 995 996 /* 997 * work queue call back to submit previously compressed pages 998 */ 999 static noinline void async_cow_submit(struct btrfs_work *work) 1000 { 1001 struct async_cow *async_cow; 1002 struct btrfs_root *root; 1003 unsigned long nr_pages; 1004 1005 async_cow = container_of(work, struct async_cow, work); 1006 1007 root = async_cow->root; 1008 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> 1009 PAGE_CACHE_SHIFT; 1010 1011 if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < 1012 5 * 1024 * 1024 && 1013 waitqueue_active(&root->fs_info->async_submit_wait)) 1014 wake_up(&root->fs_info->async_submit_wait); 1015 1016 if (async_cow->inode) 1017 submit_compressed_extents(async_cow->inode, async_cow); 1018 } 1019 1020 static noinline void async_cow_free(struct btrfs_work *work) 1021 { 1022 struct async_cow *async_cow; 1023 async_cow = container_of(work, struct async_cow, work); 1024 if (async_cow->inode) 1025 btrfs_add_delayed_iput(async_cow->inode); 1026 kfree(async_cow); 1027 } 1028 1029 static int cow_file_range_async(struct inode *inode, struct page *locked_page, 1030 u64 start, u64 end, int *page_started, 1031 unsigned long *nr_written) 1032 { 1033 struct async_cow *async_cow; 1034 struct btrfs_root *root = BTRFS_I(inode)->root; 1035 unsigned long nr_pages; 1036 u64 cur_end; 1037 int limit = 10 * 1024 * 1024; 1038 1039 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1040 1, 0, NULL, GFP_NOFS); 1041 while (start < end) { 1042 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); 1043 BUG_ON(!async_cow); /* -ENOMEM */ 1044 async_cow->inode = igrab(inode); 1045 async_cow->root = root; 1046 async_cow->locked_page = locked_page; 1047 async_cow->start = start; 1048 1049 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) 1050 cur_end = end; 1051 else 1052 cur_end = min(end, start + 512 * 1024 - 1); 1053 1054 async_cow->end = cur_end; 1055 INIT_LIST_HEAD(&async_cow->extents); 1056 1057 async_cow->work.func = async_cow_start; 1058 async_cow->work.ordered_func = async_cow_submit; 1059 async_cow->work.ordered_free = async_cow_free; 1060 async_cow->work.flags = 0; 1061 1062 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1063 PAGE_CACHE_SHIFT; 1064 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); 1065 1066 btrfs_queue_worker(&root->fs_info->delalloc_workers, 1067 &async_cow->work); 1068 1069 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { 1070 wait_event(root->fs_info->async_submit_wait, 1071 (atomic_read(&root->fs_info->async_delalloc_pages) < 1072 limit)); 1073 } 1074 1075 while (atomic_read(&root->fs_info->async_submit_draining) && 1076 atomic_read(&root->fs_info->async_delalloc_pages)) { 1077 wait_event(root->fs_info->async_submit_wait, 1078 (atomic_read(&root->fs_info->async_delalloc_pages) == 1079 0)); 1080 } 1081 1082 *nr_written += nr_pages; 1083 start = cur_end + 1; 1084 } 1085 *page_started = 1; 1086 return 0; 1087 } 1088 1089 static noinline int csum_exist_in_range(struct btrfs_root *root, 1090 u64 bytenr, u64 num_bytes) 1091 { 1092 int ret; 1093 struct btrfs_ordered_sum *sums; 1094 LIST_HEAD(list); 1095 1096 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, 1097 bytenr + num_bytes - 1, &list, 0); 1098 if (ret == 0 && list_empty(&list)) 1099 return 0; 1100 1101 while (!list_empty(&list)) { 1102 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 1103 list_del(&sums->list); 1104 kfree(sums); 1105 } 1106 return 1; 1107 } 1108 1109 /* 1110 * when nowcow writeback call back. This checks for snapshots or COW copies 1111 * of the extents that exist in the file, and COWs the file as required. 1112 * 1113 * If no cow copies or snapshots exist, we write directly to the existing 1114 * blocks on disk 1115 */ 1116 static noinline int run_delalloc_nocow(struct inode *inode, 1117 struct page *locked_page, 1118 u64 start, u64 end, int *page_started, int force, 1119 unsigned long *nr_written) 1120 { 1121 struct btrfs_root *root = BTRFS_I(inode)->root; 1122 struct btrfs_trans_handle *trans; 1123 struct extent_buffer *leaf; 1124 struct btrfs_path *path; 1125 struct btrfs_file_extent_item *fi; 1126 struct btrfs_key found_key; 1127 u64 cow_start; 1128 u64 cur_offset; 1129 u64 extent_end; 1130 u64 extent_offset; 1131 u64 disk_bytenr; 1132 u64 num_bytes; 1133 int extent_type; 1134 int ret, err; 1135 int type; 1136 int nocow; 1137 int check_prev = 1; 1138 bool nolock; 1139 u64 ino = btrfs_ino(inode); 1140 1141 path = btrfs_alloc_path(); 1142 if (!path) { 1143 extent_clear_unlock_delalloc(inode, 1144 &BTRFS_I(inode)->io_tree, 1145 start, end, locked_page, 1146 EXTENT_CLEAR_UNLOCK_PAGE | 1147 EXTENT_CLEAR_UNLOCK | 1148 EXTENT_CLEAR_DELALLOC | 1149 EXTENT_CLEAR_DIRTY | 1150 EXTENT_SET_WRITEBACK | 1151 EXTENT_END_WRITEBACK); 1152 return -ENOMEM; 1153 } 1154 1155 nolock = btrfs_is_free_space_inode(inode); 1156 1157 if (nolock) 1158 trans = btrfs_join_transaction_nolock(root); 1159 else 1160 trans = btrfs_join_transaction(root); 1161 1162 if (IS_ERR(trans)) { 1163 extent_clear_unlock_delalloc(inode, 1164 &BTRFS_I(inode)->io_tree, 1165 start, end, locked_page, 1166 EXTENT_CLEAR_UNLOCK_PAGE | 1167 EXTENT_CLEAR_UNLOCK | 1168 EXTENT_CLEAR_DELALLOC | 1169 EXTENT_CLEAR_DIRTY | 1170 EXTENT_SET_WRITEBACK | 1171 EXTENT_END_WRITEBACK); 1172 btrfs_free_path(path); 1173 return PTR_ERR(trans); 1174 } 1175 1176 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1177 1178 cow_start = (u64)-1; 1179 cur_offset = start; 1180 while (1) { 1181 ret = btrfs_lookup_file_extent(trans, root, path, ino, 1182 cur_offset, 0); 1183 if (ret < 0) { 1184 btrfs_abort_transaction(trans, root, ret); 1185 goto error; 1186 } 1187 if (ret > 0 && path->slots[0] > 0 && check_prev) { 1188 leaf = path->nodes[0]; 1189 btrfs_item_key_to_cpu(leaf, &found_key, 1190 path->slots[0] - 1); 1191 if (found_key.objectid == ino && 1192 found_key.type == BTRFS_EXTENT_DATA_KEY) 1193 path->slots[0]--; 1194 } 1195 check_prev = 0; 1196 next_slot: 1197 leaf = path->nodes[0]; 1198 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1199 ret = btrfs_next_leaf(root, path); 1200 if (ret < 0) { 1201 btrfs_abort_transaction(trans, root, ret); 1202 goto error; 1203 } 1204 if (ret > 0) 1205 break; 1206 leaf = path->nodes[0]; 1207 } 1208 1209 nocow = 0; 1210 disk_bytenr = 0; 1211 num_bytes = 0; 1212 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1213 1214 if (found_key.objectid > ino || 1215 found_key.type > BTRFS_EXTENT_DATA_KEY || 1216 found_key.offset > end) 1217 break; 1218 1219 if (found_key.offset > cur_offset) { 1220 extent_end = found_key.offset; 1221 extent_type = 0; 1222 goto out_check; 1223 } 1224 1225 fi = btrfs_item_ptr(leaf, path->slots[0], 1226 struct btrfs_file_extent_item); 1227 extent_type = btrfs_file_extent_type(leaf, fi); 1228 1229 if (extent_type == BTRFS_FILE_EXTENT_REG || 1230 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1231 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1232 extent_offset = btrfs_file_extent_offset(leaf, fi); 1233 extent_end = found_key.offset + 1234 btrfs_file_extent_num_bytes(leaf, fi); 1235 if (extent_end <= start) { 1236 path->slots[0]++; 1237 goto next_slot; 1238 } 1239 if (disk_bytenr == 0) 1240 goto out_check; 1241 if (btrfs_file_extent_compression(leaf, fi) || 1242 btrfs_file_extent_encryption(leaf, fi) || 1243 btrfs_file_extent_other_encoding(leaf, fi)) 1244 goto out_check; 1245 if (extent_type == BTRFS_FILE_EXTENT_REG && !force) 1246 goto out_check; 1247 if (btrfs_extent_readonly(root, disk_bytenr)) 1248 goto out_check; 1249 if (btrfs_cross_ref_exist(trans, root, ino, 1250 found_key.offset - 1251 extent_offset, disk_bytenr)) 1252 goto out_check; 1253 disk_bytenr += extent_offset; 1254 disk_bytenr += cur_offset - found_key.offset; 1255 num_bytes = min(end + 1, extent_end) - cur_offset; 1256 /* 1257 * force cow if csum exists in the range. 1258 * this ensure that csum for a given extent are 1259 * either valid or do not exist. 1260 */ 1261 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 1262 goto out_check; 1263 nocow = 1; 1264 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 1265 extent_end = found_key.offset + 1266 btrfs_file_extent_inline_len(leaf, fi); 1267 extent_end = ALIGN(extent_end, root->sectorsize); 1268 } else { 1269 BUG_ON(1); 1270 } 1271 out_check: 1272 if (extent_end <= start) { 1273 path->slots[0]++; 1274 goto next_slot; 1275 } 1276 if (!nocow) { 1277 if (cow_start == (u64)-1) 1278 cow_start = cur_offset; 1279 cur_offset = extent_end; 1280 if (cur_offset > end) 1281 break; 1282 path->slots[0]++; 1283 goto next_slot; 1284 } 1285 1286 btrfs_release_path(path); 1287 if (cow_start != (u64)-1) { 1288 ret = cow_file_range(inode, locked_page, cow_start, 1289 found_key.offset - 1, page_started, 1290 nr_written, 1); 1291 if (ret) { 1292 btrfs_abort_transaction(trans, root, ret); 1293 goto error; 1294 } 1295 cow_start = (u64)-1; 1296 } 1297 1298 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1299 struct extent_map *em; 1300 struct extent_map_tree *em_tree; 1301 em_tree = &BTRFS_I(inode)->extent_tree; 1302 em = alloc_extent_map(); 1303 BUG_ON(!em); /* -ENOMEM */ 1304 em->start = cur_offset; 1305 em->orig_start = em->start; 1306 em->len = num_bytes; 1307 em->block_len = num_bytes; 1308 em->block_start = disk_bytenr; 1309 em->bdev = root->fs_info->fs_devices->latest_bdev; 1310 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1311 while (1) { 1312 write_lock(&em_tree->lock); 1313 ret = add_extent_mapping(em_tree, em); 1314 write_unlock(&em_tree->lock); 1315 if (ret != -EEXIST) { 1316 free_extent_map(em); 1317 break; 1318 } 1319 btrfs_drop_extent_cache(inode, em->start, 1320 em->start + em->len - 1, 0); 1321 } 1322 type = BTRFS_ORDERED_PREALLOC; 1323 } else { 1324 type = BTRFS_ORDERED_NOCOW; 1325 } 1326 1327 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, 1328 num_bytes, num_bytes, type); 1329 BUG_ON(ret); /* -ENOMEM */ 1330 1331 if (root->root_key.objectid == 1332 BTRFS_DATA_RELOC_TREE_OBJECTID) { 1333 ret = btrfs_reloc_clone_csums(inode, cur_offset, 1334 num_bytes); 1335 if (ret) { 1336 btrfs_abort_transaction(trans, root, ret); 1337 goto error; 1338 } 1339 } 1340 1341 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1342 cur_offset, cur_offset + num_bytes - 1, 1343 locked_page, EXTENT_CLEAR_UNLOCK_PAGE | 1344 EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | 1345 EXTENT_SET_PRIVATE2); 1346 cur_offset = extent_end; 1347 if (cur_offset > end) 1348 break; 1349 } 1350 btrfs_release_path(path); 1351 1352 if (cur_offset <= end && cow_start == (u64)-1) { 1353 cow_start = cur_offset; 1354 cur_offset = end; 1355 } 1356 1357 if (cow_start != (u64)-1) { 1358 ret = cow_file_range(inode, locked_page, cow_start, end, 1359 page_started, nr_written, 1); 1360 if (ret) { 1361 btrfs_abort_transaction(trans, root, ret); 1362 goto error; 1363 } 1364 } 1365 1366 error: 1367 if (nolock) { 1368 err = btrfs_end_transaction_nolock(trans, root); 1369 } else { 1370 err = btrfs_end_transaction(trans, root); 1371 } 1372 if (!ret) 1373 ret = err; 1374 1375 if (ret && cur_offset < end) 1376 extent_clear_unlock_delalloc(inode, 1377 &BTRFS_I(inode)->io_tree, 1378 cur_offset, end, locked_page, 1379 EXTENT_CLEAR_UNLOCK_PAGE | 1380 EXTENT_CLEAR_UNLOCK | 1381 EXTENT_CLEAR_DELALLOC | 1382 EXTENT_CLEAR_DIRTY | 1383 EXTENT_SET_WRITEBACK | 1384 EXTENT_END_WRITEBACK); 1385 1386 btrfs_free_path(path); 1387 return ret; 1388 } 1389 1390 /* 1391 * extent_io.c call back to do delayed allocation processing 1392 */ 1393 static int run_delalloc_range(struct inode *inode, struct page *locked_page, 1394 u64 start, u64 end, int *page_started, 1395 unsigned long *nr_written) 1396 { 1397 int ret; 1398 struct btrfs_root *root = BTRFS_I(inode)->root; 1399 1400 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) { 1401 ret = run_delalloc_nocow(inode, locked_page, start, end, 1402 page_started, 1, nr_written); 1403 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) { 1404 ret = run_delalloc_nocow(inode, locked_page, start, end, 1405 page_started, 0, nr_written); 1406 } else if (!btrfs_test_opt(root, COMPRESS) && 1407 !(BTRFS_I(inode)->force_compress) && 1408 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) { 1409 ret = cow_file_range(inode, locked_page, start, end, 1410 page_started, nr_written, 1); 1411 } else { 1412 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 1413 &BTRFS_I(inode)->runtime_flags); 1414 ret = cow_file_range_async(inode, locked_page, start, end, 1415 page_started, nr_written); 1416 } 1417 return ret; 1418 } 1419 1420 static void btrfs_split_extent_hook(struct inode *inode, 1421 struct extent_state *orig, u64 split) 1422 { 1423 /* not delalloc, ignore it */ 1424 if (!(orig->state & EXTENT_DELALLOC)) 1425 return; 1426 1427 spin_lock(&BTRFS_I(inode)->lock); 1428 BTRFS_I(inode)->outstanding_extents++; 1429 spin_unlock(&BTRFS_I(inode)->lock); 1430 } 1431 1432 /* 1433 * extent_io.c merge_extent_hook, used to track merged delayed allocation 1434 * extents so we can keep track of new extents that are just merged onto old 1435 * extents, such as when we are doing sequential writes, so we can properly 1436 * account for the metadata space we'll need. 1437 */ 1438 static void btrfs_merge_extent_hook(struct inode *inode, 1439 struct extent_state *new, 1440 struct extent_state *other) 1441 { 1442 /* not delalloc, ignore it */ 1443 if (!(other->state & EXTENT_DELALLOC)) 1444 return; 1445 1446 spin_lock(&BTRFS_I(inode)->lock); 1447 BTRFS_I(inode)->outstanding_extents--; 1448 spin_unlock(&BTRFS_I(inode)->lock); 1449 } 1450 1451 /* 1452 * extent_io.c set_bit_hook, used to track delayed allocation 1453 * bytes in this file, and to maintain the list of inodes that 1454 * have pending delalloc work to be done. 1455 */ 1456 static void btrfs_set_bit_hook(struct inode *inode, 1457 struct extent_state *state, int *bits) 1458 { 1459 1460 /* 1461 * set_bit and clear bit hooks normally require _irqsave/restore 1462 * but in this case, we are only testing for the DELALLOC 1463 * bit, which is only set or cleared with irqs on 1464 */ 1465 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1466 struct btrfs_root *root = BTRFS_I(inode)->root; 1467 u64 len = state->end + 1 - state->start; 1468 bool do_list = !btrfs_is_free_space_inode(inode); 1469 1470 if (*bits & EXTENT_FIRST_DELALLOC) { 1471 *bits &= ~EXTENT_FIRST_DELALLOC; 1472 } else { 1473 spin_lock(&BTRFS_I(inode)->lock); 1474 BTRFS_I(inode)->outstanding_extents++; 1475 spin_unlock(&BTRFS_I(inode)->lock); 1476 } 1477 1478 spin_lock(&root->fs_info->delalloc_lock); 1479 BTRFS_I(inode)->delalloc_bytes += len; 1480 root->fs_info->delalloc_bytes += len; 1481 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1482 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1483 &root->fs_info->delalloc_inodes); 1484 } 1485 spin_unlock(&root->fs_info->delalloc_lock); 1486 } 1487 } 1488 1489 /* 1490 * extent_io.c clear_bit_hook, see set_bit_hook for why 1491 */ 1492 static void btrfs_clear_bit_hook(struct inode *inode, 1493 struct extent_state *state, int *bits) 1494 { 1495 /* 1496 * set_bit and clear bit hooks normally require _irqsave/restore 1497 * but in this case, we are only testing for the DELALLOC 1498 * bit, which is only set or cleared with irqs on 1499 */ 1500 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1501 struct btrfs_root *root = BTRFS_I(inode)->root; 1502 u64 len = state->end + 1 - state->start; 1503 bool do_list = !btrfs_is_free_space_inode(inode); 1504 1505 if (*bits & EXTENT_FIRST_DELALLOC) { 1506 *bits &= ~EXTENT_FIRST_DELALLOC; 1507 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { 1508 spin_lock(&BTRFS_I(inode)->lock); 1509 BTRFS_I(inode)->outstanding_extents--; 1510 spin_unlock(&BTRFS_I(inode)->lock); 1511 } 1512 1513 if (*bits & EXTENT_DO_ACCOUNTING) 1514 btrfs_delalloc_release_metadata(inode, len); 1515 1516 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 1517 && do_list) 1518 btrfs_free_reserved_data_space(inode, len); 1519 1520 spin_lock(&root->fs_info->delalloc_lock); 1521 root->fs_info->delalloc_bytes -= len; 1522 BTRFS_I(inode)->delalloc_bytes -= len; 1523 1524 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && 1525 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1526 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1527 } 1528 spin_unlock(&root->fs_info->delalloc_lock); 1529 } 1530 } 1531 1532 /* 1533 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure 1534 * we don't create bios that span stripes or chunks 1535 */ 1536 int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 1537 size_t size, struct bio *bio, 1538 unsigned long bio_flags) 1539 { 1540 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 1541 struct btrfs_mapping_tree *map_tree; 1542 u64 logical = (u64)bio->bi_sector << 9; 1543 u64 length = 0; 1544 u64 map_length; 1545 int ret; 1546 1547 if (bio_flags & EXTENT_BIO_COMPRESSED) 1548 return 0; 1549 1550 length = bio->bi_size; 1551 map_tree = &root->fs_info->mapping_tree; 1552 map_length = length; 1553 ret = btrfs_map_block(map_tree, READ, logical, 1554 &map_length, NULL, 0); 1555 /* Will always return 0 or 1 with map_multi == NULL */ 1556 BUG_ON(ret < 0); 1557 if (map_length < length + size) 1558 return 1; 1559 return 0; 1560 } 1561 1562 /* 1563 * in order to insert checksums into the metadata in large chunks, 1564 * we wait until bio submission time. All the pages in the bio are 1565 * checksummed and sums are attached onto the ordered extent record. 1566 * 1567 * At IO completion time the cums attached on the ordered extent record 1568 * are inserted into the btree 1569 */ 1570 static int __btrfs_submit_bio_start(struct inode *inode, int rw, 1571 struct bio *bio, int mirror_num, 1572 unsigned long bio_flags, 1573 u64 bio_offset) 1574 { 1575 struct btrfs_root *root = BTRFS_I(inode)->root; 1576 int ret = 0; 1577 1578 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); 1579 BUG_ON(ret); /* -ENOMEM */ 1580 return 0; 1581 } 1582 1583 /* 1584 * in order to insert checksums into the metadata in large chunks, 1585 * we wait until bio submission time. All the pages in the bio are 1586 * checksummed and sums are attached onto the ordered extent record. 1587 * 1588 * At IO completion time the cums attached on the ordered extent record 1589 * are inserted into the btree 1590 */ 1591 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 1592 int mirror_num, unsigned long bio_flags, 1593 u64 bio_offset) 1594 { 1595 struct btrfs_root *root = BTRFS_I(inode)->root; 1596 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1597 } 1598 1599 /* 1600 * extent_io.c submission hook. This does the right thing for csum calculation 1601 * on write, or reading the csums from the tree before a read 1602 */ 1603 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 1604 int mirror_num, unsigned long bio_flags, 1605 u64 bio_offset) 1606 { 1607 struct btrfs_root *root = BTRFS_I(inode)->root; 1608 int ret = 0; 1609 int skip_sum; 1610 int metadata = 0; 1611 1612 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1613 1614 if (btrfs_is_free_space_inode(inode)) 1615 metadata = 2; 1616 1617 if (!(rw & REQ_WRITE)) { 1618 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); 1619 if (ret) 1620 return ret; 1621 1622 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1623 return btrfs_submit_compressed_read(inode, bio, 1624 mirror_num, bio_flags); 1625 } else if (!skip_sum) { 1626 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); 1627 if (ret) 1628 return ret; 1629 } 1630 goto mapit; 1631 } else if (!skip_sum) { 1632 /* csum items have already been cloned */ 1633 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 1634 goto mapit; 1635 /* we're doing a write, do the async checksumming */ 1636 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1637 inode, rw, bio, mirror_num, 1638 bio_flags, bio_offset, 1639 __btrfs_submit_bio_start, 1640 __btrfs_submit_bio_done); 1641 } 1642 1643 mapit: 1644 return btrfs_map_bio(root, rw, bio, mirror_num, 0); 1645 } 1646 1647 /* 1648 * given a list of ordered sums record them in the inode. This happens 1649 * at IO completion time based on sums calculated at bio submission time. 1650 */ 1651 static noinline int add_pending_csums(struct btrfs_trans_handle *trans, 1652 struct inode *inode, u64 file_offset, 1653 struct list_head *list) 1654 { 1655 struct btrfs_ordered_sum *sum; 1656 1657 list_for_each_entry(sum, list, list) { 1658 btrfs_csum_file_blocks(trans, 1659 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1660 } 1661 return 0; 1662 } 1663 1664 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 1665 struct extent_state **cached_state) 1666 { 1667 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) 1668 WARN_ON(1); 1669 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1670 cached_state, GFP_NOFS); 1671 } 1672 1673 /* see btrfs_writepage_start_hook for details on why this is required */ 1674 struct btrfs_writepage_fixup { 1675 struct page *page; 1676 struct btrfs_work work; 1677 }; 1678 1679 static void btrfs_writepage_fixup_worker(struct btrfs_work *work) 1680 { 1681 struct btrfs_writepage_fixup *fixup; 1682 struct btrfs_ordered_extent *ordered; 1683 struct extent_state *cached_state = NULL; 1684 struct page *page; 1685 struct inode *inode; 1686 u64 page_start; 1687 u64 page_end; 1688 int ret; 1689 1690 fixup = container_of(work, struct btrfs_writepage_fixup, work); 1691 page = fixup->page; 1692 again: 1693 lock_page(page); 1694 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { 1695 ClearPageChecked(page); 1696 goto out_page; 1697 } 1698 1699 inode = page->mapping->host; 1700 page_start = page_offset(page); 1701 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; 1702 1703 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, 1704 &cached_state); 1705 1706 /* already ordered? We're done */ 1707 if (PagePrivate2(page)) 1708 goto out; 1709 1710 ordered = btrfs_lookup_ordered_extent(inode, page_start); 1711 if (ordered) { 1712 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, 1713 page_end, &cached_state, GFP_NOFS); 1714 unlock_page(page); 1715 btrfs_start_ordered_extent(inode, ordered, 1); 1716 btrfs_put_ordered_extent(ordered); 1717 goto again; 1718 } 1719 1720 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 1721 if (ret) { 1722 mapping_set_error(page->mapping, ret); 1723 end_extent_writepage(page, ret, page_start, page_end); 1724 ClearPageChecked(page); 1725 goto out; 1726 } 1727 1728 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); 1729 ClearPageChecked(page); 1730 set_page_dirty(page); 1731 out: 1732 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, 1733 &cached_state, GFP_NOFS); 1734 out_page: 1735 unlock_page(page); 1736 page_cache_release(page); 1737 kfree(fixup); 1738 } 1739 1740 /* 1741 * There are a few paths in the higher layers of the kernel that directly 1742 * set the page dirty bit without asking the filesystem if it is a 1743 * good idea. This causes problems because we want to make sure COW 1744 * properly happens and the data=ordered rules are followed. 1745 * 1746 * In our case any range that doesn't have the ORDERED bit set 1747 * hasn't been properly setup for IO. We kick off an async process 1748 * to fix it up. The async helper will wait for ordered extents, set 1749 * the delalloc bit and make it safe to write the page. 1750 */ 1751 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) 1752 { 1753 struct inode *inode = page->mapping->host; 1754 struct btrfs_writepage_fixup *fixup; 1755 struct btrfs_root *root = BTRFS_I(inode)->root; 1756 1757 /* this page is properly in the ordered list */ 1758 if (TestClearPagePrivate2(page)) 1759 return 0; 1760 1761 if (PageChecked(page)) 1762 return -EAGAIN; 1763 1764 fixup = kzalloc(sizeof(*fixup), GFP_NOFS); 1765 if (!fixup) 1766 return -EAGAIN; 1767 1768 SetPageChecked(page); 1769 page_cache_get(page); 1770 fixup->work.func = btrfs_writepage_fixup_worker; 1771 fixup->page = page; 1772 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); 1773 return -EBUSY; 1774 } 1775 1776 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, 1777 struct inode *inode, u64 file_pos, 1778 u64 disk_bytenr, u64 disk_num_bytes, 1779 u64 num_bytes, u64 ram_bytes, 1780 u8 compression, u8 encryption, 1781 u16 other_encoding, int extent_type) 1782 { 1783 struct btrfs_root *root = BTRFS_I(inode)->root; 1784 struct btrfs_file_extent_item *fi; 1785 struct btrfs_path *path; 1786 struct extent_buffer *leaf; 1787 struct btrfs_key ins; 1788 u64 hint; 1789 int ret; 1790 1791 path = btrfs_alloc_path(); 1792 if (!path) 1793 return -ENOMEM; 1794 1795 path->leave_spinning = 1; 1796 1797 /* 1798 * we may be replacing one extent in the tree with another. 1799 * The new extent is pinned in the extent map, and we don't want 1800 * to drop it from the cache until it is completely in the btree. 1801 * 1802 * So, tell btrfs_drop_extents to leave this extent in the cache. 1803 * the caller is expected to unpin it and allow it to be merged 1804 * with the others. 1805 */ 1806 ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, 1807 &hint, 0); 1808 if (ret) 1809 goto out; 1810 1811 ins.objectid = btrfs_ino(inode); 1812 ins.offset = file_pos; 1813 ins.type = BTRFS_EXTENT_DATA_KEY; 1814 ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); 1815 if (ret) 1816 goto out; 1817 leaf = path->nodes[0]; 1818 fi = btrfs_item_ptr(leaf, path->slots[0], 1819 struct btrfs_file_extent_item); 1820 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1821 btrfs_set_file_extent_type(leaf, fi, extent_type); 1822 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); 1823 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); 1824 btrfs_set_file_extent_offset(leaf, fi, 0); 1825 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 1826 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); 1827 btrfs_set_file_extent_compression(leaf, fi, compression); 1828 btrfs_set_file_extent_encryption(leaf, fi, encryption); 1829 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 1830 1831 btrfs_unlock_up_safe(path, 1); 1832 btrfs_set_lock_blocking(leaf); 1833 1834 btrfs_mark_buffer_dirty(leaf); 1835 1836 inode_add_bytes(inode, num_bytes); 1837 1838 ins.objectid = disk_bytenr; 1839 ins.offset = disk_num_bytes; 1840 ins.type = BTRFS_EXTENT_ITEM_KEY; 1841 ret = btrfs_alloc_reserved_file_extent(trans, root, 1842 root->root_key.objectid, 1843 btrfs_ino(inode), file_pos, &ins); 1844 out: 1845 btrfs_free_path(path); 1846 1847 return ret; 1848 } 1849 1850 /* 1851 * helper function for btrfs_finish_ordered_io, this 1852 * just reads in some of the csum leaves to prime them into ram 1853 * before we start the transaction. It limits the amount of btree 1854 * reads required while inside the transaction. 1855 */ 1856 /* as ordered data IO finishes, this gets called so we can finish 1857 * an ordered extent if the range of bytes in the file it covers are 1858 * fully written. 1859 */ 1860 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 1861 { 1862 struct inode *inode = ordered_extent->inode; 1863 struct btrfs_root *root = BTRFS_I(inode)->root; 1864 struct btrfs_trans_handle *trans = NULL; 1865 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1866 struct extent_state *cached_state = NULL; 1867 int compress_type = 0; 1868 int ret; 1869 bool nolock; 1870 1871 nolock = btrfs_is_free_space_inode(inode); 1872 1873 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 1874 ret = -EIO; 1875 goto out; 1876 } 1877 1878 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1879 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 1880 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1881 if (!ret) { 1882 if (nolock) 1883 trans = btrfs_join_transaction_nolock(root); 1884 else 1885 trans = btrfs_join_transaction(root); 1886 if (IS_ERR(trans)) { 1887 ret = PTR_ERR(trans); 1888 trans = NULL; 1889 goto out; 1890 } 1891 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1892 ret = btrfs_update_inode_fallback(trans, root, inode); 1893 if (ret) /* -ENOMEM or corruption */ 1894 btrfs_abort_transaction(trans, root, ret); 1895 } 1896 goto out; 1897 } 1898 1899 lock_extent_bits(io_tree, ordered_extent->file_offset, 1900 ordered_extent->file_offset + ordered_extent->len - 1, 1901 0, &cached_state); 1902 1903 if (nolock) 1904 trans = btrfs_join_transaction_nolock(root); 1905 else 1906 trans = btrfs_join_transaction(root); 1907 if (IS_ERR(trans)) { 1908 ret = PTR_ERR(trans); 1909 trans = NULL; 1910 goto out_unlock; 1911 } 1912 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1913 1914 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1915 compress_type = ordered_extent->compress_type; 1916 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1917 BUG_ON(compress_type); 1918 ret = btrfs_mark_extent_written(trans, inode, 1919 ordered_extent->file_offset, 1920 ordered_extent->file_offset + 1921 ordered_extent->len); 1922 } else { 1923 BUG_ON(root == root->fs_info->tree_root); 1924 ret = insert_reserved_file_extent(trans, inode, 1925 ordered_extent->file_offset, 1926 ordered_extent->start, 1927 ordered_extent->disk_len, 1928 ordered_extent->len, 1929 ordered_extent->len, 1930 compress_type, 0, 0, 1931 BTRFS_FILE_EXTENT_REG); 1932 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 1933 ordered_extent->file_offset, 1934 ordered_extent->len); 1935 } 1936 1937 if (ret < 0) { 1938 btrfs_abort_transaction(trans, root, ret); 1939 goto out_unlock; 1940 } 1941 1942 add_pending_csums(trans, inode, ordered_extent->file_offset, 1943 &ordered_extent->list); 1944 1945 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1946 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1947 ret = btrfs_update_inode_fallback(trans, root, inode); 1948 if (ret) { /* -ENOMEM or corruption */ 1949 btrfs_abort_transaction(trans, root, ret); 1950 goto out_unlock; 1951 } 1952 } 1953 ret = 0; 1954 out_unlock: 1955 unlock_extent_cached(io_tree, ordered_extent->file_offset, 1956 ordered_extent->file_offset + 1957 ordered_extent->len - 1, &cached_state, GFP_NOFS); 1958 out: 1959 if (root != root->fs_info->tree_root) 1960 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1961 if (trans) { 1962 if (nolock) 1963 btrfs_end_transaction_nolock(trans, root); 1964 else 1965 btrfs_end_transaction(trans, root); 1966 } 1967 1968 if (ret) 1969 clear_extent_uptodate(io_tree, ordered_extent->file_offset, 1970 ordered_extent->file_offset + 1971 ordered_extent->len - 1, NULL, GFP_NOFS); 1972 1973 /* 1974 * This needs to be done to make sure anybody waiting knows we are done 1975 * updating everything for this ordered extent. 1976 */ 1977 btrfs_remove_ordered_extent(inode, ordered_extent); 1978 1979 /* once for us */ 1980 btrfs_put_ordered_extent(ordered_extent); 1981 /* once for the tree */ 1982 btrfs_put_ordered_extent(ordered_extent); 1983 1984 return ret; 1985 } 1986 1987 static void finish_ordered_fn(struct btrfs_work *work) 1988 { 1989 struct btrfs_ordered_extent *ordered_extent; 1990 ordered_extent = container_of(work, struct btrfs_ordered_extent, work); 1991 btrfs_finish_ordered_io(ordered_extent); 1992 } 1993 1994 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1995 struct extent_state *state, int uptodate) 1996 { 1997 struct inode *inode = page->mapping->host; 1998 struct btrfs_root *root = BTRFS_I(inode)->root; 1999 struct btrfs_ordered_extent *ordered_extent = NULL; 2000 struct btrfs_workers *workers; 2001 2002 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2003 2004 ClearPagePrivate2(page); 2005 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 2006 end - start + 1, uptodate)) 2007 return 0; 2008 2009 ordered_extent->work.func = finish_ordered_fn; 2010 ordered_extent->work.flags = 0; 2011 2012 if (btrfs_is_free_space_inode(inode)) 2013 workers = &root->fs_info->endio_freespace_worker; 2014 else 2015 workers = &root->fs_info->endio_write_workers; 2016 btrfs_queue_worker(workers, &ordered_extent->work); 2017 2018 return 0; 2019 } 2020 2021 /* 2022 * when reads are done, we need to check csums to verify the data is correct 2023 * if there's a match, we allow the bio to finish. If not, the code in 2024 * extent_io.c will try to find good copies for us. 2025 */ 2026 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 2027 struct extent_state *state, int mirror) 2028 { 2029 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); 2030 struct inode *inode = page->mapping->host; 2031 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2032 char *kaddr; 2033 u64 private = ~(u32)0; 2034 int ret; 2035 struct btrfs_root *root = BTRFS_I(inode)->root; 2036 u32 csum = ~(u32)0; 2037 2038 if (PageChecked(page)) { 2039 ClearPageChecked(page); 2040 goto good; 2041 } 2042 2043 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 2044 goto good; 2045 2046 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 2047 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 2048 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, 2049 GFP_NOFS); 2050 return 0; 2051 } 2052 2053 if (state && state->start == start) { 2054 private = state->private; 2055 ret = 0; 2056 } else { 2057 ret = get_state_private(io_tree, start, &private); 2058 } 2059 kaddr = kmap_atomic(page); 2060 if (ret) 2061 goto zeroit; 2062 2063 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); 2064 btrfs_csum_final(csum, (char *)&csum); 2065 if (csum != private) 2066 goto zeroit; 2067 2068 kunmap_atomic(kaddr); 2069 good: 2070 return 0; 2071 2072 zeroit: 2073 printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u " 2074 "private %llu\n", 2075 (unsigned long long)btrfs_ino(page->mapping->host), 2076 (unsigned long long)start, csum, 2077 (unsigned long long)private); 2078 memset(kaddr + offset, 1, end - start + 1); 2079 flush_dcache_page(page); 2080 kunmap_atomic(kaddr); 2081 if (private == 0) 2082 return 0; 2083 return -EIO; 2084 } 2085 2086 struct delayed_iput { 2087 struct list_head list; 2088 struct inode *inode; 2089 }; 2090 2091 /* JDM: If this is fs-wide, why can't we add a pointer to 2092 * btrfs_inode instead and avoid the allocation? */ 2093 void btrfs_add_delayed_iput(struct inode *inode) 2094 { 2095 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2096 struct delayed_iput *delayed; 2097 2098 if (atomic_add_unless(&inode->i_count, -1, 1)) 2099 return; 2100 2101 delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); 2102 delayed->inode = inode; 2103 2104 spin_lock(&fs_info->delayed_iput_lock); 2105 list_add_tail(&delayed->list, &fs_info->delayed_iputs); 2106 spin_unlock(&fs_info->delayed_iput_lock); 2107 } 2108 2109 void btrfs_run_delayed_iputs(struct btrfs_root *root) 2110 { 2111 LIST_HEAD(list); 2112 struct btrfs_fs_info *fs_info = root->fs_info; 2113 struct delayed_iput *delayed; 2114 int empty; 2115 2116 spin_lock(&fs_info->delayed_iput_lock); 2117 empty = list_empty(&fs_info->delayed_iputs); 2118 spin_unlock(&fs_info->delayed_iput_lock); 2119 if (empty) 2120 return; 2121 2122 down_read(&root->fs_info->cleanup_work_sem); 2123 spin_lock(&fs_info->delayed_iput_lock); 2124 list_splice_init(&fs_info->delayed_iputs, &list); 2125 spin_unlock(&fs_info->delayed_iput_lock); 2126 2127 while (!list_empty(&list)) { 2128 delayed = list_entry(list.next, struct delayed_iput, list); 2129 list_del(&delayed->list); 2130 iput(delayed->inode); 2131 kfree(delayed); 2132 } 2133 up_read(&root->fs_info->cleanup_work_sem); 2134 } 2135 2136 enum btrfs_orphan_cleanup_state { 2137 ORPHAN_CLEANUP_STARTED = 1, 2138 ORPHAN_CLEANUP_DONE = 2, 2139 }; 2140 2141 /* 2142 * This is called in transaction commit time. If there are no orphan 2143 * files in the subvolume, it removes orphan item and frees block_rsv 2144 * structure. 2145 */ 2146 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2147 struct btrfs_root *root) 2148 { 2149 struct btrfs_block_rsv *block_rsv; 2150 int ret; 2151 2152 if (atomic_read(&root->orphan_inodes) || 2153 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 2154 return; 2155 2156 spin_lock(&root->orphan_lock); 2157 if (atomic_read(&root->orphan_inodes)) { 2158 spin_unlock(&root->orphan_lock); 2159 return; 2160 } 2161 2162 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { 2163 spin_unlock(&root->orphan_lock); 2164 return; 2165 } 2166 2167 block_rsv = root->orphan_block_rsv; 2168 root->orphan_block_rsv = NULL; 2169 spin_unlock(&root->orphan_lock); 2170 2171 if (root->orphan_item_inserted && 2172 btrfs_root_refs(&root->root_item) > 0) { 2173 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 2174 root->root_key.objectid); 2175 BUG_ON(ret); 2176 root->orphan_item_inserted = 0; 2177 } 2178 2179 if (block_rsv) { 2180 WARN_ON(block_rsv->size > 0); 2181 btrfs_free_block_rsv(root, block_rsv); 2182 } 2183 } 2184 2185 /* 2186 * This creates an orphan entry for the given inode in case something goes 2187 * wrong in the middle of an unlink/truncate. 2188 * 2189 * NOTE: caller of this function should reserve 5 units of metadata for 2190 * this function. 2191 */ 2192 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2193 { 2194 struct btrfs_root *root = BTRFS_I(inode)->root; 2195 struct btrfs_block_rsv *block_rsv = NULL; 2196 int reserve = 0; 2197 int insert = 0; 2198 int ret; 2199 2200 if (!root->orphan_block_rsv) { 2201 block_rsv = btrfs_alloc_block_rsv(root); 2202 if (!block_rsv) 2203 return -ENOMEM; 2204 } 2205 2206 spin_lock(&root->orphan_lock); 2207 if (!root->orphan_block_rsv) { 2208 root->orphan_block_rsv = block_rsv; 2209 } else if (block_rsv) { 2210 btrfs_free_block_rsv(root, block_rsv); 2211 block_rsv = NULL; 2212 } 2213 2214 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2215 &BTRFS_I(inode)->runtime_flags)) { 2216 #if 0 2217 /* 2218 * For proper ENOSPC handling, we should do orphan 2219 * cleanup when mounting. But this introduces backward 2220 * compatibility issue. 2221 */ 2222 if (!xchg(&root->orphan_item_inserted, 1)) 2223 insert = 2; 2224 else 2225 insert = 1; 2226 #endif 2227 insert = 1; 2228 atomic_dec(&root->orphan_inodes); 2229 } 2230 2231 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2232 &BTRFS_I(inode)->runtime_flags)) 2233 reserve = 1; 2234 spin_unlock(&root->orphan_lock); 2235 2236 /* grab metadata reservation from transaction handle */ 2237 if (reserve) { 2238 ret = btrfs_orphan_reserve_metadata(trans, inode); 2239 BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */ 2240 } 2241 2242 /* insert an orphan item to track this unlinked/truncated file */ 2243 if (insert >= 1) { 2244 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2245 if (ret && ret != -EEXIST) { 2246 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2247 &BTRFS_I(inode)->runtime_flags); 2248 btrfs_abort_transaction(trans, root, ret); 2249 return ret; 2250 } 2251 ret = 0; 2252 } 2253 2254 /* insert an orphan item to track subvolume contains orphan files */ 2255 if (insert >= 2) { 2256 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, 2257 root->root_key.objectid); 2258 if (ret && ret != -EEXIST) { 2259 btrfs_abort_transaction(trans, root, ret); 2260 return ret; 2261 } 2262 } 2263 return 0; 2264 } 2265 2266 /* 2267 * We have done the truncate/delete so we can go ahead and remove the orphan 2268 * item for this particular inode. 2269 */ 2270 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2271 { 2272 struct btrfs_root *root = BTRFS_I(inode)->root; 2273 int delete_item = 0; 2274 int release_rsv = 0; 2275 int ret = 0; 2276 2277 spin_lock(&root->orphan_lock); 2278 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2279 &BTRFS_I(inode)->runtime_flags)) 2280 delete_item = 1; 2281 2282 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2283 &BTRFS_I(inode)->runtime_flags)) 2284 release_rsv = 1; 2285 spin_unlock(&root->orphan_lock); 2286 2287 if (trans && delete_item) { 2288 ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); 2289 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2290 } 2291 2292 if (release_rsv) { 2293 btrfs_orphan_release_metadata(inode); 2294 atomic_dec(&root->orphan_inodes); 2295 } 2296 2297 return 0; 2298 } 2299 2300 /* 2301 * this cleans up any orphans that may be left on the list from the last use 2302 * of this root. 2303 */ 2304 int btrfs_orphan_cleanup(struct btrfs_root *root) 2305 { 2306 struct btrfs_path *path; 2307 struct extent_buffer *leaf; 2308 struct btrfs_key key, found_key; 2309 struct btrfs_trans_handle *trans; 2310 struct inode *inode; 2311 u64 last_objectid = 0; 2312 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2313 2314 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2315 return 0; 2316 2317 path = btrfs_alloc_path(); 2318 if (!path) { 2319 ret = -ENOMEM; 2320 goto out; 2321 } 2322 path->reada = -1; 2323 2324 key.objectid = BTRFS_ORPHAN_OBJECTID; 2325 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 2326 key.offset = (u64)-1; 2327 2328 while (1) { 2329 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2330 if (ret < 0) 2331 goto out; 2332 2333 /* 2334 * if ret == 0 means we found what we were searching for, which 2335 * is weird, but possible, so only screw with path if we didn't 2336 * find the key and see if we have stuff that matches 2337 */ 2338 if (ret > 0) { 2339 ret = 0; 2340 if (path->slots[0] == 0) 2341 break; 2342 path->slots[0]--; 2343 } 2344 2345 /* pull out the item */ 2346 leaf = path->nodes[0]; 2347 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2348 2349 /* make sure the item matches what we want */ 2350 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 2351 break; 2352 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) 2353 break; 2354 2355 /* release the path since we're done with it */ 2356 btrfs_release_path(path); 2357 2358 /* 2359 * this is where we are basically btrfs_lookup, without the 2360 * crossing root thing. we store the inode number in the 2361 * offset of the orphan item. 2362 */ 2363 2364 if (found_key.offset == last_objectid) { 2365 printk(KERN_ERR "btrfs: Error removing orphan entry, " 2366 "stopping orphan cleanup\n"); 2367 ret = -EINVAL; 2368 goto out; 2369 } 2370 2371 last_objectid = found_key.offset; 2372 2373 found_key.objectid = found_key.offset; 2374 found_key.type = BTRFS_INODE_ITEM_KEY; 2375 found_key.offset = 0; 2376 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2377 ret = PTR_RET(inode); 2378 if (ret && ret != -ESTALE) 2379 goto out; 2380 2381 if (ret == -ESTALE && root == root->fs_info->tree_root) { 2382 struct btrfs_root *dead_root; 2383 struct btrfs_fs_info *fs_info = root->fs_info; 2384 int is_dead_root = 0; 2385 2386 /* 2387 * this is an orphan in the tree root. Currently these 2388 * could come from 2 sources: 2389 * a) a snapshot deletion in progress 2390 * b) a free space cache inode 2391 * We need to distinguish those two, as the snapshot 2392 * orphan must not get deleted. 2393 * find_dead_roots already ran before us, so if this 2394 * is a snapshot deletion, we should find the root 2395 * in the dead_roots list 2396 */ 2397 spin_lock(&fs_info->trans_lock); 2398 list_for_each_entry(dead_root, &fs_info->dead_roots, 2399 root_list) { 2400 if (dead_root->root_key.objectid == 2401 found_key.objectid) { 2402 is_dead_root = 1; 2403 break; 2404 } 2405 } 2406 spin_unlock(&fs_info->trans_lock); 2407 if (is_dead_root) { 2408 /* prevent this orphan from being found again */ 2409 key.offset = found_key.objectid - 1; 2410 continue; 2411 } 2412 } 2413 /* 2414 * Inode is already gone but the orphan item is still there, 2415 * kill the orphan item. 2416 */ 2417 if (ret == -ESTALE) { 2418 trans = btrfs_start_transaction(root, 1); 2419 if (IS_ERR(trans)) { 2420 ret = PTR_ERR(trans); 2421 goto out; 2422 } 2423 printk(KERN_ERR "auto deleting %Lu\n", 2424 found_key.objectid); 2425 ret = btrfs_del_orphan_item(trans, root, 2426 found_key.objectid); 2427 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2428 btrfs_end_transaction(trans, root); 2429 continue; 2430 } 2431 2432 /* 2433 * add this inode to the orphan list so btrfs_orphan_del does 2434 * the proper thing when we hit it 2435 */ 2436 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2437 &BTRFS_I(inode)->runtime_flags); 2438 2439 /* if we have links, this was a truncate, lets do that */ 2440 if (inode->i_nlink) { 2441 if (!S_ISREG(inode->i_mode)) { 2442 WARN_ON(1); 2443 iput(inode); 2444 continue; 2445 } 2446 nr_truncate++; 2447 ret = btrfs_truncate(inode); 2448 } else { 2449 nr_unlink++; 2450 } 2451 2452 /* this will do delete_inode and everything for us */ 2453 iput(inode); 2454 if (ret) 2455 goto out; 2456 } 2457 /* release the path since we're done with it */ 2458 btrfs_release_path(path); 2459 2460 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2461 2462 if (root->orphan_block_rsv) 2463 btrfs_block_rsv_release(root, root->orphan_block_rsv, 2464 (u64)-1); 2465 2466 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2467 trans = btrfs_join_transaction(root); 2468 if (!IS_ERR(trans)) 2469 btrfs_end_transaction(trans, root); 2470 } 2471 2472 if (nr_unlink) 2473 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2474 if (nr_truncate) 2475 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2476 2477 out: 2478 if (ret) 2479 printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret); 2480 btrfs_free_path(path); 2481 return ret; 2482 } 2483 2484 /* 2485 * very simple check to peek ahead in the leaf looking for xattrs. If we 2486 * don't find any xattrs, we know there can't be any acls. 2487 * 2488 * slot is the slot the inode is in, objectid is the objectid of the inode 2489 */ 2490 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 2491 int slot, u64 objectid) 2492 { 2493 u32 nritems = btrfs_header_nritems(leaf); 2494 struct btrfs_key found_key; 2495 int scanned = 0; 2496 2497 slot++; 2498 while (slot < nritems) { 2499 btrfs_item_key_to_cpu(leaf, &found_key, slot); 2500 2501 /* we found a different objectid, there must not be acls */ 2502 if (found_key.objectid != objectid) 2503 return 0; 2504 2505 /* we found an xattr, assume we've got an acl */ 2506 if (found_key.type == BTRFS_XATTR_ITEM_KEY) 2507 return 1; 2508 2509 /* 2510 * we found a key greater than an xattr key, there can't 2511 * be any acls later on 2512 */ 2513 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 2514 return 0; 2515 2516 slot++; 2517 scanned++; 2518 2519 /* 2520 * it goes inode, inode backrefs, xattrs, extents, 2521 * so if there are a ton of hard links to an inode there can 2522 * be a lot of backrefs. Don't waste time searching too hard, 2523 * this is just an optimization 2524 */ 2525 if (scanned >= 8) 2526 break; 2527 } 2528 /* we hit the end of the leaf before we found an xattr or 2529 * something larger than an xattr. We have to assume the inode 2530 * has acls 2531 */ 2532 return 1; 2533 } 2534 2535 /* 2536 * read an inode from the btree into the in-memory inode 2537 */ 2538 static void btrfs_read_locked_inode(struct inode *inode) 2539 { 2540 struct btrfs_path *path; 2541 struct extent_buffer *leaf; 2542 struct btrfs_inode_item *inode_item; 2543 struct btrfs_timespec *tspec; 2544 struct btrfs_root *root = BTRFS_I(inode)->root; 2545 struct btrfs_key location; 2546 int maybe_acls; 2547 u32 rdev; 2548 int ret; 2549 bool filled = false; 2550 2551 ret = btrfs_fill_inode(inode, &rdev); 2552 if (!ret) 2553 filled = true; 2554 2555 path = btrfs_alloc_path(); 2556 if (!path) 2557 goto make_bad; 2558 2559 path->leave_spinning = 1; 2560 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2561 2562 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 2563 if (ret) 2564 goto make_bad; 2565 2566 leaf = path->nodes[0]; 2567 2568 if (filled) 2569 goto cache_acl; 2570 2571 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2572 struct btrfs_inode_item); 2573 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2574 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 2575 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2576 inode->i_gid = btrfs_inode_gid(leaf, inode_item); 2577 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 2578 2579 tspec = btrfs_inode_atime(inode_item); 2580 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2581 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2582 2583 tspec = btrfs_inode_mtime(inode_item); 2584 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2585 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2586 2587 tspec = btrfs_inode_ctime(inode_item); 2588 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2589 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2590 2591 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2592 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2593 inode->i_version = btrfs_inode_sequence(leaf, inode_item); 2594 inode->i_generation = BTRFS_I(inode)->generation; 2595 inode->i_rdev = 0; 2596 rdev = btrfs_inode_rdev(leaf, inode_item); 2597 2598 BTRFS_I(inode)->index_cnt = (u64)-1; 2599 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2600 cache_acl: 2601 /* 2602 * try to precache a NULL acl entry for files that don't have 2603 * any xattrs or acls 2604 */ 2605 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 2606 btrfs_ino(inode)); 2607 if (!maybe_acls) 2608 cache_no_acl(inode); 2609 2610 btrfs_free_path(path); 2611 2612 switch (inode->i_mode & S_IFMT) { 2613 case S_IFREG: 2614 inode->i_mapping->a_ops = &btrfs_aops; 2615 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2616 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 2617 inode->i_fop = &btrfs_file_operations; 2618 inode->i_op = &btrfs_file_inode_operations; 2619 break; 2620 case S_IFDIR: 2621 inode->i_fop = &btrfs_dir_file_operations; 2622 if (root == root->fs_info->tree_root) 2623 inode->i_op = &btrfs_dir_ro_inode_operations; 2624 else 2625 inode->i_op = &btrfs_dir_inode_operations; 2626 break; 2627 case S_IFLNK: 2628 inode->i_op = &btrfs_symlink_inode_operations; 2629 inode->i_mapping->a_ops = &btrfs_symlink_aops; 2630 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2631 break; 2632 default: 2633 inode->i_op = &btrfs_special_inode_operations; 2634 init_special_inode(inode, inode->i_mode, rdev); 2635 break; 2636 } 2637 2638 btrfs_update_iflags(inode); 2639 return; 2640 2641 make_bad: 2642 btrfs_free_path(path); 2643 make_bad_inode(inode); 2644 } 2645 2646 /* 2647 * given a leaf and an inode, copy the inode fields into the leaf 2648 */ 2649 static void fill_inode_item(struct btrfs_trans_handle *trans, 2650 struct extent_buffer *leaf, 2651 struct btrfs_inode_item *item, 2652 struct inode *inode) 2653 { 2654 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2655 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2656 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2657 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2658 btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 2659 2660 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), 2661 inode->i_atime.tv_sec); 2662 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), 2663 inode->i_atime.tv_nsec); 2664 2665 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), 2666 inode->i_mtime.tv_sec); 2667 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), 2668 inode->i_mtime.tv_nsec); 2669 2670 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), 2671 inode->i_ctime.tv_sec); 2672 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), 2673 inode->i_ctime.tv_nsec); 2674 2675 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2676 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 2677 btrfs_set_inode_sequence(leaf, item, inode->i_version); 2678 btrfs_set_inode_transid(leaf, item, trans->transid); 2679 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2680 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2681 btrfs_set_inode_block_group(leaf, item, 0); 2682 } 2683 2684 /* 2685 * copy everything in the in-memory inode into the btree. 2686 */ 2687 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, 2688 struct btrfs_root *root, struct inode *inode) 2689 { 2690 struct btrfs_inode_item *inode_item; 2691 struct btrfs_path *path; 2692 struct extent_buffer *leaf; 2693 int ret; 2694 2695 path = btrfs_alloc_path(); 2696 if (!path) 2697 return -ENOMEM; 2698 2699 path->leave_spinning = 1; 2700 ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 2701 1); 2702 if (ret) { 2703 if (ret > 0) 2704 ret = -ENOENT; 2705 goto failed; 2706 } 2707 2708 btrfs_unlock_up_safe(path, 1); 2709 leaf = path->nodes[0]; 2710 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2711 struct btrfs_inode_item); 2712 2713 fill_inode_item(trans, leaf, inode_item, inode); 2714 btrfs_mark_buffer_dirty(leaf); 2715 btrfs_set_inode_last_trans(trans, inode); 2716 ret = 0; 2717 failed: 2718 btrfs_free_path(path); 2719 return ret; 2720 } 2721 2722 /* 2723 * copy everything in the in-memory inode into the btree. 2724 */ 2725 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2726 struct btrfs_root *root, struct inode *inode) 2727 { 2728 int ret; 2729 2730 /* 2731 * If the inode is a free space inode, we can deadlock during commit 2732 * if we put it into the delayed code. 2733 * 2734 * The data relocation inode should also be directly updated 2735 * without delay 2736 */ 2737 if (!btrfs_is_free_space_inode(inode) 2738 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 2739 btrfs_update_root_times(trans, root); 2740 2741 ret = btrfs_delayed_update_inode(trans, root, inode); 2742 if (!ret) 2743 btrfs_set_inode_last_trans(trans, inode); 2744 return ret; 2745 } 2746 2747 return btrfs_update_inode_item(trans, root, inode); 2748 } 2749 2750 static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 2751 struct btrfs_root *root, struct inode *inode) 2752 { 2753 int ret; 2754 2755 ret = btrfs_update_inode(trans, root, inode); 2756 if (ret == -ENOSPC) 2757 return btrfs_update_inode_item(trans, root, inode); 2758 return ret; 2759 } 2760 2761 /* 2762 * unlink helper that gets used here in inode.c and in the tree logging 2763 * recovery code. It remove a link in a directory with a given name, and 2764 * also drops the back refs in the inode to the directory 2765 */ 2766 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2767 struct btrfs_root *root, 2768 struct inode *dir, struct inode *inode, 2769 const char *name, int name_len) 2770 { 2771 struct btrfs_path *path; 2772 int ret = 0; 2773 struct extent_buffer *leaf; 2774 struct btrfs_dir_item *di; 2775 struct btrfs_key key; 2776 u64 index; 2777 u64 ino = btrfs_ino(inode); 2778 u64 dir_ino = btrfs_ino(dir); 2779 2780 path = btrfs_alloc_path(); 2781 if (!path) { 2782 ret = -ENOMEM; 2783 goto out; 2784 } 2785 2786 path->leave_spinning = 1; 2787 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 2788 name, name_len, -1); 2789 if (IS_ERR(di)) { 2790 ret = PTR_ERR(di); 2791 goto err; 2792 } 2793 if (!di) { 2794 ret = -ENOENT; 2795 goto err; 2796 } 2797 leaf = path->nodes[0]; 2798 btrfs_dir_item_key_to_cpu(leaf, di, &key); 2799 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2800 if (ret) 2801 goto err; 2802 btrfs_release_path(path); 2803 2804 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, 2805 dir_ino, &index); 2806 if (ret) { 2807 printk(KERN_INFO "btrfs failed to delete reference to %.*s, " 2808 "inode %llu parent %llu\n", name_len, name, 2809 (unsigned long long)ino, (unsigned long long)dir_ino); 2810 btrfs_abort_transaction(trans, root, ret); 2811 goto err; 2812 } 2813 2814 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 2815 if (ret) { 2816 btrfs_abort_transaction(trans, root, ret); 2817 goto err; 2818 } 2819 2820 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2821 inode, dir_ino); 2822 if (ret != 0 && ret != -ENOENT) { 2823 btrfs_abort_transaction(trans, root, ret); 2824 goto err; 2825 } 2826 2827 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2828 dir, index); 2829 if (ret == -ENOENT) 2830 ret = 0; 2831 err: 2832 btrfs_free_path(path); 2833 if (ret) 2834 goto out; 2835 2836 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2837 inode_inc_iversion(inode); 2838 inode_inc_iversion(dir); 2839 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2840 ret = btrfs_update_inode(trans, root, dir); 2841 out: 2842 return ret; 2843 } 2844 2845 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2846 struct btrfs_root *root, 2847 struct inode *dir, struct inode *inode, 2848 const char *name, int name_len) 2849 { 2850 int ret; 2851 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 2852 if (!ret) { 2853 btrfs_drop_nlink(inode); 2854 ret = btrfs_update_inode(trans, root, inode); 2855 } 2856 return ret; 2857 } 2858 2859 2860 /* helper to check if there is any shared block in the path */ 2861 static int check_path_shared(struct btrfs_root *root, 2862 struct btrfs_path *path) 2863 { 2864 struct extent_buffer *eb; 2865 int level; 2866 u64 refs = 1; 2867 2868 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2869 int ret; 2870 2871 if (!path->nodes[level]) 2872 break; 2873 eb = path->nodes[level]; 2874 if (!btrfs_block_can_be_shared(root, eb)) 2875 continue; 2876 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, 2877 &refs, NULL); 2878 if (refs > 1) 2879 return 1; 2880 } 2881 return 0; 2882 } 2883 2884 /* 2885 * helper to start transaction for unlink and rmdir. 2886 * 2887 * unlink and rmdir are special in btrfs, they do not always free space. 2888 * so in enospc case, we should make sure they will free space before 2889 * allowing them to use the global metadata reservation. 2890 */ 2891 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, 2892 struct dentry *dentry) 2893 { 2894 struct btrfs_trans_handle *trans; 2895 struct btrfs_root *root = BTRFS_I(dir)->root; 2896 struct btrfs_path *path; 2897 struct btrfs_inode_ref *ref; 2898 struct btrfs_dir_item *di; 2899 struct inode *inode = dentry->d_inode; 2900 u64 index; 2901 int check_link = 1; 2902 int err = -ENOSPC; 2903 int ret; 2904 u64 ino = btrfs_ino(inode); 2905 u64 dir_ino = btrfs_ino(dir); 2906 2907 /* 2908 * 1 for the possible orphan item 2909 * 1 for the dir item 2910 * 1 for the dir index 2911 * 1 for the inode ref 2912 * 1 for the inode ref in the tree log 2913 * 2 for the dir entries in the log 2914 * 1 for the inode 2915 */ 2916 trans = btrfs_start_transaction(root, 8); 2917 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2918 return trans; 2919 2920 if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 2921 return ERR_PTR(-ENOSPC); 2922 2923 /* check if there is someone else holds reference */ 2924 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) 2925 return ERR_PTR(-ENOSPC); 2926 2927 if (atomic_read(&inode->i_count) > 2) 2928 return ERR_PTR(-ENOSPC); 2929 2930 if (xchg(&root->fs_info->enospc_unlink, 1)) 2931 return ERR_PTR(-ENOSPC); 2932 2933 path = btrfs_alloc_path(); 2934 if (!path) { 2935 root->fs_info->enospc_unlink = 0; 2936 return ERR_PTR(-ENOMEM); 2937 } 2938 2939 /* 1 for the orphan item */ 2940 trans = btrfs_start_transaction(root, 1); 2941 if (IS_ERR(trans)) { 2942 btrfs_free_path(path); 2943 root->fs_info->enospc_unlink = 0; 2944 return trans; 2945 } 2946 2947 path->skip_locking = 1; 2948 path->search_commit_root = 1; 2949 2950 ret = btrfs_lookup_inode(trans, root, path, 2951 &BTRFS_I(dir)->location, 0); 2952 if (ret < 0) { 2953 err = ret; 2954 goto out; 2955 } 2956 if (ret == 0) { 2957 if (check_path_shared(root, path)) 2958 goto out; 2959 } else { 2960 check_link = 0; 2961 } 2962 btrfs_release_path(path); 2963 2964 ret = btrfs_lookup_inode(trans, root, path, 2965 &BTRFS_I(inode)->location, 0); 2966 if (ret < 0) { 2967 err = ret; 2968 goto out; 2969 } 2970 if (ret == 0) { 2971 if (check_path_shared(root, path)) 2972 goto out; 2973 } else { 2974 check_link = 0; 2975 } 2976 btrfs_release_path(path); 2977 2978 if (ret == 0 && S_ISREG(inode->i_mode)) { 2979 ret = btrfs_lookup_file_extent(trans, root, path, 2980 ino, (u64)-1, 0); 2981 if (ret < 0) { 2982 err = ret; 2983 goto out; 2984 } 2985 BUG_ON(ret == 0); /* Corruption */ 2986 if (check_path_shared(root, path)) 2987 goto out; 2988 btrfs_release_path(path); 2989 } 2990 2991 if (!check_link) { 2992 err = 0; 2993 goto out; 2994 } 2995 2996 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 2997 dentry->d_name.name, dentry->d_name.len, 0); 2998 if (IS_ERR(di)) { 2999 err = PTR_ERR(di); 3000 goto out; 3001 } 3002 if (di) { 3003 if (check_path_shared(root, path)) 3004 goto out; 3005 } else { 3006 err = 0; 3007 goto out; 3008 } 3009 btrfs_release_path(path); 3010 3011 ref = btrfs_lookup_inode_ref(trans, root, path, 3012 dentry->d_name.name, dentry->d_name.len, 3013 ino, dir_ino, 0); 3014 if (IS_ERR(ref)) { 3015 err = PTR_ERR(ref); 3016 goto out; 3017 } 3018 BUG_ON(!ref); /* Logic error */ 3019 if (check_path_shared(root, path)) 3020 goto out; 3021 index = btrfs_inode_ref_index(path->nodes[0], ref); 3022 btrfs_release_path(path); 3023 3024 /* 3025 * This is a commit root search, if we can lookup inode item and other 3026 * relative items in the commit root, it means the transaction of 3027 * dir/file creation has been committed, and the dir index item that we 3028 * delay to insert has also been inserted into the commit root. So 3029 * we needn't worry about the delayed insertion of the dir index item 3030 * here. 3031 */ 3032 di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index, 3033 dentry->d_name.name, dentry->d_name.len, 0); 3034 if (IS_ERR(di)) { 3035 err = PTR_ERR(di); 3036 goto out; 3037 } 3038 BUG_ON(ret == -ENOENT); 3039 if (check_path_shared(root, path)) 3040 goto out; 3041 3042 err = 0; 3043 out: 3044 btrfs_free_path(path); 3045 /* Migrate the orphan reservation over */ 3046 if (!err) 3047 err = btrfs_block_rsv_migrate(trans->block_rsv, 3048 &root->fs_info->global_block_rsv, 3049 trans->bytes_reserved); 3050 3051 if (err) { 3052 btrfs_end_transaction(trans, root); 3053 root->fs_info->enospc_unlink = 0; 3054 return ERR_PTR(err); 3055 } 3056 3057 trans->block_rsv = &root->fs_info->global_block_rsv; 3058 return trans; 3059 } 3060 3061 static void __unlink_end_trans(struct btrfs_trans_handle *trans, 3062 struct btrfs_root *root) 3063 { 3064 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 3065 btrfs_block_rsv_release(root, trans->block_rsv, 3066 trans->bytes_reserved); 3067 trans->block_rsv = &root->fs_info->trans_block_rsv; 3068 BUG_ON(!root->fs_info->enospc_unlink); 3069 root->fs_info->enospc_unlink = 0; 3070 } 3071 btrfs_end_transaction(trans, root); 3072 } 3073 3074 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 3075 { 3076 struct btrfs_root *root = BTRFS_I(dir)->root; 3077 struct btrfs_trans_handle *trans; 3078 struct inode *inode = dentry->d_inode; 3079 int ret; 3080 unsigned long nr = 0; 3081 3082 trans = __unlink_start_trans(dir, dentry); 3083 if (IS_ERR(trans)) 3084 return PTR_ERR(trans); 3085 3086 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 3087 3088 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3089 dentry->d_name.name, dentry->d_name.len); 3090 if (ret) 3091 goto out; 3092 3093 if (inode->i_nlink == 0) { 3094 ret = btrfs_orphan_add(trans, inode); 3095 if (ret) 3096 goto out; 3097 } 3098 3099 out: 3100 nr = trans->blocks_used; 3101 __unlink_end_trans(trans, root); 3102 btrfs_btree_balance_dirty(root, nr); 3103 return ret; 3104 } 3105 3106 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 3107 struct btrfs_root *root, 3108 struct inode *dir, u64 objectid, 3109 const char *name, int name_len) 3110 { 3111 struct btrfs_path *path; 3112 struct extent_buffer *leaf; 3113 struct btrfs_dir_item *di; 3114 struct btrfs_key key; 3115 u64 index; 3116 int ret; 3117 u64 dir_ino = btrfs_ino(dir); 3118 3119 path = btrfs_alloc_path(); 3120 if (!path) 3121 return -ENOMEM; 3122 3123 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 3124 name, name_len, -1); 3125 if (IS_ERR_OR_NULL(di)) { 3126 if (!di) 3127 ret = -ENOENT; 3128 else 3129 ret = PTR_ERR(di); 3130 goto out; 3131 } 3132 3133 leaf = path->nodes[0]; 3134 btrfs_dir_item_key_to_cpu(leaf, di, &key); 3135 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 3136 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3137 if (ret) { 3138 btrfs_abort_transaction(trans, root, ret); 3139 goto out; 3140 } 3141 btrfs_release_path(path); 3142 3143 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, 3144 objectid, root->root_key.objectid, 3145 dir_ino, &index, name, name_len); 3146 if (ret < 0) { 3147 if (ret != -ENOENT) { 3148 btrfs_abort_transaction(trans, root, ret); 3149 goto out; 3150 } 3151 di = btrfs_search_dir_index_item(root, path, dir_ino, 3152 name, name_len); 3153 if (IS_ERR_OR_NULL(di)) { 3154 if (!di) 3155 ret = -ENOENT; 3156 else 3157 ret = PTR_ERR(di); 3158 btrfs_abort_transaction(trans, root, ret); 3159 goto out; 3160 } 3161 3162 leaf = path->nodes[0]; 3163 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3164 btrfs_release_path(path); 3165 index = key.offset; 3166 } 3167 btrfs_release_path(path); 3168 3169 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 3170 if (ret) { 3171 btrfs_abort_transaction(trans, root, ret); 3172 goto out; 3173 } 3174 3175 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3176 inode_inc_iversion(dir); 3177 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3178 ret = btrfs_update_inode_fallback(trans, root, dir); 3179 if (ret) 3180 btrfs_abort_transaction(trans, root, ret); 3181 out: 3182 btrfs_free_path(path); 3183 return ret; 3184 } 3185 3186 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 3187 { 3188 struct inode *inode = dentry->d_inode; 3189 int err = 0; 3190 struct btrfs_root *root = BTRFS_I(dir)->root; 3191 struct btrfs_trans_handle *trans; 3192 unsigned long nr = 0; 3193 3194 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 3195 btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) 3196 return -ENOTEMPTY; 3197 3198 trans = __unlink_start_trans(dir, dentry); 3199 if (IS_ERR(trans)) 3200 return PTR_ERR(trans); 3201 3202 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 3203 err = btrfs_unlink_subvol(trans, root, dir, 3204 BTRFS_I(inode)->location.objectid, 3205 dentry->d_name.name, 3206 dentry->d_name.len); 3207 goto out; 3208 } 3209 3210 err = btrfs_orphan_add(trans, inode); 3211 if (err) 3212 goto out; 3213 3214 /* now the directory is empty */ 3215 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3216 dentry->d_name.name, dentry->d_name.len); 3217 if (!err) 3218 btrfs_i_size_write(inode, 0); 3219 out: 3220 nr = trans->blocks_used; 3221 __unlink_end_trans(trans, root); 3222 btrfs_btree_balance_dirty(root, nr); 3223 3224 return err; 3225 } 3226 3227 /* 3228 * this can truncate away extent items, csum items and directory items. 3229 * It starts at a high offset and removes keys until it can't find 3230 * any higher than new_size 3231 * 3232 * csum items that cross the new i_size are truncated to the new size 3233 * as well. 3234 * 3235 * min_type is the minimum key type to truncate down to. If set to 0, this 3236 * will kill all the items on this inode, including the INODE_ITEM_KEY. 3237 */ 3238 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 3239 struct btrfs_root *root, 3240 struct inode *inode, 3241 u64 new_size, u32 min_type) 3242 { 3243 struct btrfs_path *path; 3244 struct extent_buffer *leaf; 3245 struct btrfs_file_extent_item *fi; 3246 struct btrfs_key key; 3247 struct btrfs_key found_key; 3248 u64 extent_start = 0; 3249 u64 extent_num_bytes = 0; 3250 u64 extent_offset = 0; 3251 u64 item_end = 0; 3252 u64 mask = root->sectorsize - 1; 3253 u32 found_type = (u8)-1; 3254 int found_extent; 3255 int del_item; 3256 int pending_del_nr = 0; 3257 int pending_del_slot = 0; 3258 int extent_type = -1; 3259 int ret; 3260 int err = 0; 3261 u64 ino = btrfs_ino(inode); 3262 3263 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3264 3265 path = btrfs_alloc_path(); 3266 if (!path) 3267 return -ENOMEM; 3268 path->reada = -1; 3269 3270 if (root->ref_cows || root == root->fs_info->tree_root) 3271 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3272 3273 /* 3274 * This function is also used to drop the items in the log tree before 3275 * we relog the inode, so if root != BTRFS_I(inode)->root, it means 3276 * it is used to drop the loged items. So we shouldn't kill the delayed 3277 * items. 3278 */ 3279 if (min_type == 0 && root == BTRFS_I(inode)->root) 3280 btrfs_kill_delayed_inode_items(inode); 3281 3282 key.objectid = ino; 3283 key.offset = (u64)-1; 3284 key.type = (u8)-1; 3285 3286 search_again: 3287 path->leave_spinning = 1; 3288 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3289 if (ret < 0) { 3290 err = ret; 3291 goto out; 3292 } 3293 3294 if (ret > 0) { 3295 /* there are no items in the tree for us to truncate, we're 3296 * done 3297 */ 3298 if (path->slots[0] == 0) 3299 goto out; 3300 path->slots[0]--; 3301 } 3302 3303 while (1) { 3304 fi = NULL; 3305 leaf = path->nodes[0]; 3306 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3307 found_type = btrfs_key_type(&found_key); 3308 3309 if (found_key.objectid != ino) 3310 break; 3311 3312 if (found_type < min_type) 3313 break; 3314 3315 item_end = found_key.offset; 3316 if (found_type == BTRFS_EXTENT_DATA_KEY) { 3317 fi = btrfs_item_ptr(leaf, path->slots[0], 3318 struct btrfs_file_extent_item); 3319 extent_type = btrfs_file_extent_type(leaf, fi); 3320 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3321 item_end += 3322 btrfs_file_extent_num_bytes(leaf, fi); 3323 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3324 item_end += btrfs_file_extent_inline_len(leaf, 3325 fi); 3326 } 3327 item_end--; 3328 } 3329 if (found_type > min_type) { 3330 del_item = 1; 3331 } else { 3332 if (item_end < new_size) 3333 break; 3334 if (found_key.offset >= new_size) 3335 del_item = 1; 3336 else 3337 del_item = 0; 3338 } 3339 found_extent = 0; 3340 /* FIXME, shrink the extent if the ref count is only 1 */ 3341 if (found_type != BTRFS_EXTENT_DATA_KEY) 3342 goto delete; 3343 3344 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3345 u64 num_dec; 3346 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 3347 if (!del_item) { 3348 u64 orig_num_bytes = 3349 btrfs_file_extent_num_bytes(leaf, fi); 3350 extent_num_bytes = new_size - 3351 found_key.offset + root->sectorsize - 1; 3352 extent_num_bytes = extent_num_bytes & 3353 ~((u64)root->sectorsize - 1); 3354 btrfs_set_file_extent_num_bytes(leaf, fi, 3355 extent_num_bytes); 3356 num_dec = (orig_num_bytes - 3357 extent_num_bytes); 3358 if (root->ref_cows && extent_start != 0) 3359 inode_sub_bytes(inode, num_dec); 3360 btrfs_mark_buffer_dirty(leaf); 3361 } else { 3362 extent_num_bytes = 3363 btrfs_file_extent_disk_num_bytes(leaf, 3364 fi); 3365 extent_offset = found_key.offset - 3366 btrfs_file_extent_offset(leaf, fi); 3367 3368 /* FIXME blocksize != 4096 */ 3369 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 3370 if (extent_start != 0) { 3371 found_extent = 1; 3372 if (root->ref_cows) 3373 inode_sub_bytes(inode, num_dec); 3374 } 3375 } 3376 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3377 /* 3378 * we can't truncate inline items that have had 3379 * special encodings 3380 */ 3381 if (!del_item && 3382 btrfs_file_extent_compression(leaf, fi) == 0 && 3383 btrfs_file_extent_encryption(leaf, fi) == 0 && 3384 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 3385 u32 size = new_size - found_key.offset; 3386 3387 if (root->ref_cows) { 3388 inode_sub_bytes(inode, item_end + 1 - 3389 new_size); 3390 } 3391 size = 3392 btrfs_file_extent_calc_inline_size(size); 3393 btrfs_truncate_item(trans, root, path, 3394 size, 1); 3395 } else if (root->ref_cows) { 3396 inode_sub_bytes(inode, item_end + 1 - 3397 found_key.offset); 3398 } 3399 } 3400 delete: 3401 if (del_item) { 3402 if (!pending_del_nr) { 3403 /* no pending yet, add ourselves */ 3404 pending_del_slot = path->slots[0]; 3405 pending_del_nr = 1; 3406 } else if (pending_del_nr && 3407 path->slots[0] + 1 == pending_del_slot) { 3408 /* hop on the pending chunk */ 3409 pending_del_nr++; 3410 pending_del_slot = path->slots[0]; 3411 } else { 3412 BUG(); 3413 } 3414 } else { 3415 break; 3416 } 3417 if (found_extent && (root->ref_cows || 3418 root == root->fs_info->tree_root)) { 3419 btrfs_set_path_blocking(path); 3420 ret = btrfs_free_extent(trans, root, extent_start, 3421 extent_num_bytes, 0, 3422 btrfs_header_owner(leaf), 3423 ino, extent_offset, 0); 3424 BUG_ON(ret); 3425 } 3426 3427 if (found_type == BTRFS_INODE_ITEM_KEY) 3428 break; 3429 3430 if (path->slots[0] == 0 || 3431 path->slots[0] != pending_del_slot) { 3432 if (root->ref_cows && 3433 BTRFS_I(inode)->location.objectid != 3434 BTRFS_FREE_INO_OBJECTID) { 3435 err = -EAGAIN; 3436 goto out; 3437 } 3438 if (pending_del_nr) { 3439 ret = btrfs_del_items(trans, root, path, 3440 pending_del_slot, 3441 pending_del_nr); 3442 if (ret) { 3443 btrfs_abort_transaction(trans, 3444 root, ret); 3445 goto error; 3446 } 3447 pending_del_nr = 0; 3448 } 3449 btrfs_release_path(path); 3450 goto search_again; 3451 } else { 3452 path->slots[0]--; 3453 } 3454 } 3455 out: 3456 if (pending_del_nr) { 3457 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3458 pending_del_nr); 3459 if (ret) 3460 btrfs_abort_transaction(trans, root, ret); 3461 } 3462 error: 3463 btrfs_free_path(path); 3464 return err; 3465 } 3466 3467 /* 3468 * taken from block_truncate_page, but does cow as it zeros out 3469 * any bytes left in the last page in the file. 3470 */ 3471 static int btrfs_truncate_page(struct address_space *mapping, loff_t from) 3472 { 3473 struct inode *inode = mapping->host; 3474 struct btrfs_root *root = BTRFS_I(inode)->root; 3475 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3476 struct btrfs_ordered_extent *ordered; 3477 struct extent_state *cached_state = NULL; 3478 char *kaddr; 3479 u32 blocksize = root->sectorsize; 3480 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3481 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3482 struct page *page; 3483 gfp_t mask = btrfs_alloc_write_mask(mapping); 3484 int ret = 0; 3485 u64 page_start; 3486 u64 page_end; 3487 3488 if ((offset & (blocksize - 1)) == 0) 3489 goto out; 3490 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 3491 if (ret) 3492 goto out; 3493 3494 ret = -ENOMEM; 3495 again: 3496 page = find_or_create_page(mapping, index, mask); 3497 if (!page) { 3498 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3499 goto out; 3500 } 3501 3502 page_start = page_offset(page); 3503 page_end = page_start + PAGE_CACHE_SIZE - 1; 3504 3505 if (!PageUptodate(page)) { 3506 ret = btrfs_readpage(NULL, page); 3507 lock_page(page); 3508 if (page->mapping != mapping) { 3509 unlock_page(page); 3510 page_cache_release(page); 3511 goto again; 3512 } 3513 if (!PageUptodate(page)) { 3514 ret = -EIO; 3515 goto out_unlock; 3516 } 3517 } 3518 wait_on_page_writeback(page); 3519 3520 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); 3521 set_page_extent_mapped(page); 3522 3523 ordered = btrfs_lookup_ordered_extent(inode, page_start); 3524 if (ordered) { 3525 unlock_extent_cached(io_tree, page_start, page_end, 3526 &cached_state, GFP_NOFS); 3527 unlock_page(page); 3528 page_cache_release(page); 3529 btrfs_start_ordered_extent(inode, ordered, 1); 3530 btrfs_put_ordered_extent(ordered); 3531 goto again; 3532 } 3533 3534 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 3535 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3536 0, 0, &cached_state, GFP_NOFS); 3537 3538 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 3539 &cached_state); 3540 if (ret) { 3541 unlock_extent_cached(io_tree, page_start, page_end, 3542 &cached_state, GFP_NOFS); 3543 goto out_unlock; 3544 } 3545 3546 ret = 0; 3547 if (offset != PAGE_CACHE_SIZE) { 3548 kaddr = kmap(page); 3549 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 3550 flush_dcache_page(page); 3551 kunmap(page); 3552 } 3553 ClearPageChecked(page); 3554 set_page_dirty(page); 3555 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, 3556 GFP_NOFS); 3557 3558 out_unlock: 3559 if (ret) 3560 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3561 unlock_page(page); 3562 page_cache_release(page); 3563 out: 3564 return ret; 3565 } 3566 3567 /* 3568 * This function puts in dummy file extents for the area we're creating a hole 3569 * for. So if we are truncating this file to a larger size we need to insert 3570 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for 3571 * the range between oldsize and size 3572 */ 3573 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 3574 { 3575 struct btrfs_trans_handle *trans; 3576 struct btrfs_root *root = BTRFS_I(inode)->root; 3577 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3578 struct extent_map *em = NULL; 3579 struct extent_state *cached_state = NULL; 3580 u64 mask = root->sectorsize - 1; 3581 u64 hole_start = (oldsize + mask) & ~mask; 3582 u64 block_end = (size + mask) & ~mask; 3583 u64 last_byte; 3584 u64 cur_offset; 3585 u64 hole_size; 3586 int err = 0; 3587 3588 if (size <= hole_start) 3589 return 0; 3590 3591 while (1) { 3592 struct btrfs_ordered_extent *ordered; 3593 btrfs_wait_ordered_range(inode, hole_start, 3594 block_end - hole_start); 3595 lock_extent_bits(io_tree, hole_start, block_end - 1, 0, 3596 &cached_state); 3597 ordered = btrfs_lookup_ordered_extent(inode, hole_start); 3598 if (!ordered) 3599 break; 3600 unlock_extent_cached(io_tree, hole_start, block_end - 1, 3601 &cached_state, GFP_NOFS); 3602 btrfs_put_ordered_extent(ordered); 3603 } 3604 3605 cur_offset = hole_start; 3606 while (1) { 3607 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 3608 block_end - cur_offset, 0); 3609 if (IS_ERR(em)) { 3610 err = PTR_ERR(em); 3611 break; 3612 } 3613 last_byte = min(extent_map_end(em), block_end); 3614 last_byte = (last_byte + mask) & ~mask; 3615 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3616 u64 hint_byte = 0; 3617 hole_size = last_byte - cur_offset; 3618 3619 trans = btrfs_start_transaction(root, 3); 3620 if (IS_ERR(trans)) { 3621 err = PTR_ERR(trans); 3622 break; 3623 } 3624 3625 err = btrfs_drop_extents(trans, inode, cur_offset, 3626 cur_offset + hole_size, 3627 &hint_byte, 1); 3628 if (err) { 3629 btrfs_abort_transaction(trans, root, err); 3630 btrfs_end_transaction(trans, root); 3631 break; 3632 } 3633 3634 err = btrfs_insert_file_extent(trans, root, 3635 btrfs_ino(inode), cur_offset, 0, 3636 0, hole_size, 0, hole_size, 3637 0, 0, 0); 3638 if (err) { 3639 btrfs_abort_transaction(trans, root, err); 3640 btrfs_end_transaction(trans, root); 3641 break; 3642 } 3643 3644 btrfs_drop_extent_cache(inode, hole_start, 3645 last_byte - 1, 0); 3646 3647 btrfs_update_inode(trans, root, inode); 3648 btrfs_end_transaction(trans, root); 3649 } 3650 free_extent_map(em); 3651 em = NULL; 3652 cur_offset = last_byte; 3653 if (cur_offset >= block_end) 3654 break; 3655 } 3656 3657 free_extent_map(em); 3658 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3659 GFP_NOFS); 3660 return err; 3661 } 3662 3663 static int btrfs_setsize(struct inode *inode, loff_t newsize) 3664 { 3665 struct btrfs_root *root = BTRFS_I(inode)->root; 3666 struct btrfs_trans_handle *trans; 3667 loff_t oldsize = i_size_read(inode); 3668 int ret; 3669 3670 if (newsize == oldsize) 3671 return 0; 3672 3673 if (newsize > oldsize) { 3674 truncate_pagecache(inode, oldsize, newsize); 3675 ret = btrfs_cont_expand(inode, oldsize, newsize); 3676 if (ret) 3677 return ret; 3678 3679 trans = btrfs_start_transaction(root, 1); 3680 if (IS_ERR(trans)) 3681 return PTR_ERR(trans); 3682 3683 i_size_write(inode, newsize); 3684 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 3685 ret = btrfs_update_inode(trans, root, inode); 3686 btrfs_end_transaction(trans, root); 3687 } else { 3688 3689 /* 3690 * We're truncating a file that used to have good data down to 3691 * zero. Make sure it gets into the ordered flush list so that 3692 * any new writes get down to disk quickly. 3693 */ 3694 if (newsize == 0) 3695 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 3696 &BTRFS_I(inode)->runtime_flags); 3697 3698 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 3699 truncate_setsize(inode, newsize); 3700 ret = btrfs_truncate(inode); 3701 } 3702 3703 return ret; 3704 } 3705 3706 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3707 { 3708 struct inode *inode = dentry->d_inode; 3709 struct btrfs_root *root = BTRFS_I(inode)->root; 3710 int err; 3711 3712 if (btrfs_root_readonly(root)) 3713 return -EROFS; 3714 3715 err = inode_change_ok(inode, attr); 3716 if (err) 3717 return err; 3718 3719 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3720 err = btrfs_setsize(inode, attr->ia_size); 3721 if (err) 3722 return err; 3723 } 3724 3725 if (attr->ia_valid) { 3726 setattr_copy(inode, attr); 3727 inode_inc_iversion(inode); 3728 err = btrfs_dirty_inode(inode); 3729 3730 if (!err && attr->ia_valid & ATTR_MODE) 3731 err = btrfs_acl_chmod(inode); 3732 } 3733 3734 return err; 3735 } 3736 3737 void btrfs_evict_inode(struct inode *inode) 3738 { 3739 struct btrfs_trans_handle *trans; 3740 struct btrfs_root *root = BTRFS_I(inode)->root; 3741 struct btrfs_block_rsv *rsv, *global_rsv; 3742 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 3743 unsigned long nr; 3744 int ret; 3745 3746 trace_btrfs_inode_evict(inode); 3747 3748 truncate_inode_pages(&inode->i_data, 0); 3749 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3750 btrfs_is_free_space_inode(inode))) 3751 goto no_delete; 3752 3753 if (is_bad_inode(inode)) { 3754 btrfs_orphan_del(NULL, inode); 3755 goto no_delete; 3756 } 3757 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ 3758 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3759 3760 if (root->fs_info->log_root_recovering) { 3761 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3762 &BTRFS_I(inode)->runtime_flags)); 3763 goto no_delete; 3764 } 3765 3766 if (inode->i_nlink > 0) { 3767 BUG_ON(btrfs_root_refs(&root->root_item) != 0); 3768 goto no_delete; 3769 } 3770 3771 rsv = btrfs_alloc_block_rsv(root); 3772 if (!rsv) { 3773 btrfs_orphan_del(NULL, inode); 3774 goto no_delete; 3775 } 3776 rsv->size = min_size; 3777 global_rsv = &root->fs_info->global_block_rsv; 3778 3779 btrfs_i_size_write(inode, 0); 3780 3781 /* 3782 * This is a bit simpler than btrfs_truncate since 3783 * 3784 * 1) We've already reserved our space for our orphan item in the 3785 * unlink. 3786 * 2) We're going to delete the inode item, so we don't need to update 3787 * it at all. 3788 * 3789 * So we just need to reserve some slack space in case we add bytes when 3790 * doing the truncate. 3791 */ 3792 while (1) { 3793 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3794 3795 /* 3796 * Try and steal from the global reserve since we will 3797 * likely not use this space anyway, we want to try as 3798 * hard as possible to get this to work. 3799 */ 3800 if (ret) 3801 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); 3802 3803 if (ret) { 3804 printk(KERN_WARNING "Could not get space for a " 3805 "delete, will truncate on mount %d\n", ret); 3806 btrfs_orphan_del(NULL, inode); 3807 btrfs_free_block_rsv(root, rsv); 3808 goto no_delete; 3809 } 3810 3811 trans = btrfs_start_transaction(root, 0); 3812 if (IS_ERR(trans)) { 3813 btrfs_orphan_del(NULL, inode); 3814 btrfs_free_block_rsv(root, rsv); 3815 goto no_delete; 3816 } 3817 3818 trans->block_rsv = rsv; 3819 3820 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3821 if (ret != -EAGAIN) 3822 break; 3823 3824 nr = trans->blocks_used; 3825 btrfs_end_transaction(trans, root); 3826 trans = NULL; 3827 btrfs_btree_balance_dirty(root, nr); 3828 } 3829 3830 btrfs_free_block_rsv(root, rsv); 3831 3832 if (ret == 0) { 3833 trans->block_rsv = root->orphan_block_rsv; 3834 ret = btrfs_orphan_del(trans, inode); 3835 BUG_ON(ret); 3836 } 3837 3838 trans->block_rsv = &root->fs_info->trans_block_rsv; 3839 if (!(root == root->fs_info->tree_root || 3840 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3841 btrfs_return_ino(root, btrfs_ino(inode)); 3842 3843 nr = trans->blocks_used; 3844 btrfs_end_transaction(trans, root); 3845 btrfs_btree_balance_dirty(root, nr); 3846 no_delete: 3847 clear_inode(inode); 3848 return; 3849 } 3850 3851 /* 3852 * this returns the key found in the dir entry in the location pointer. 3853 * If no dir entries were found, location->objectid is 0. 3854 */ 3855 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 3856 struct btrfs_key *location) 3857 { 3858 const char *name = dentry->d_name.name; 3859 int namelen = dentry->d_name.len; 3860 struct btrfs_dir_item *di; 3861 struct btrfs_path *path; 3862 struct btrfs_root *root = BTRFS_I(dir)->root; 3863 int ret = 0; 3864 3865 path = btrfs_alloc_path(); 3866 if (!path) 3867 return -ENOMEM; 3868 3869 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, 3870 namelen, 0); 3871 if (IS_ERR(di)) 3872 ret = PTR_ERR(di); 3873 3874 if (IS_ERR_OR_NULL(di)) 3875 goto out_err; 3876 3877 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 3878 out: 3879 btrfs_free_path(path); 3880 return ret; 3881 out_err: 3882 location->objectid = 0; 3883 goto out; 3884 } 3885 3886 /* 3887 * when we hit a tree root in a directory, the btrfs part of the inode 3888 * needs to be changed to reflect the root directory of the tree root. This 3889 * is kind of like crossing a mount point. 3890 */ 3891 static int fixup_tree_root_location(struct btrfs_root *root, 3892 struct inode *dir, 3893 struct dentry *dentry, 3894 struct btrfs_key *location, 3895 struct btrfs_root **sub_root) 3896 { 3897 struct btrfs_path *path; 3898 struct btrfs_root *new_root; 3899 struct btrfs_root_ref *ref; 3900 struct extent_buffer *leaf; 3901 int ret; 3902 int err = 0; 3903 3904 path = btrfs_alloc_path(); 3905 if (!path) { 3906 err = -ENOMEM; 3907 goto out; 3908 } 3909 3910 err = -ENOENT; 3911 ret = btrfs_find_root_ref(root->fs_info->tree_root, path, 3912 BTRFS_I(dir)->root->root_key.objectid, 3913 location->objectid); 3914 if (ret) { 3915 if (ret < 0) 3916 err = ret; 3917 goto out; 3918 } 3919 3920 leaf = path->nodes[0]; 3921 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 3922 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || 3923 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 3924 goto out; 3925 3926 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 3927 (unsigned long)(ref + 1), 3928 dentry->d_name.len); 3929 if (ret) 3930 goto out; 3931 3932 btrfs_release_path(path); 3933 3934 new_root = btrfs_read_fs_root_no_name(root->fs_info, location); 3935 if (IS_ERR(new_root)) { 3936 err = PTR_ERR(new_root); 3937 goto out; 3938 } 3939 3940 if (btrfs_root_refs(&new_root->root_item) == 0) { 3941 err = -ENOENT; 3942 goto out; 3943 } 3944 3945 *sub_root = new_root; 3946 location->objectid = btrfs_root_dirid(&new_root->root_item); 3947 location->type = BTRFS_INODE_ITEM_KEY; 3948 location->offset = 0; 3949 err = 0; 3950 out: 3951 btrfs_free_path(path); 3952 return err; 3953 } 3954 3955 static void inode_tree_add(struct inode *inode) 3956 { 3957 struct btrfs_root *root = BTRFS_I(inode)->root; 3958 struct btrfs_inode *entry; 3959 struct rb_node **p; 3960 struct rb_node *parent; 3961 u64 ino = btrfs_ino(inode); 3962 again: 3963 p = &root->inode_tree.rb_node; 3964 parent = NULL; 3965 3966 if (inode_unhashed(inode)) 3967 return; 3968 3969 spin_lock(&root->inode_lock); 3970 while (*p) { 3971 parent = *p; 3972 entry = rb_entry(parent, struct btrfs_inode, rb_node); 3973 3974 if (ino < btrfs_ino(&entry->vfs_inode)) 3975 p = &parent->rb_left; 3976 else if (ino > btrfs_ino(&entry->vfs_inode)) 3977 p = &parent->rb_right; 3978 else { 3979 WARN_ON(!(entry->vfs_inode.i_state & 3980 (I_WILL_FREE | I_FREEING))); 3981 rb_erase(parent, &root->inode_tree); 3982 RB_CLEAR_NODE(parent); 3983 spin_unlock(&root->inode_lock); 3984 goto again; 3985 } 3986 } 3987 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); 3988 rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3989 spin_unlock(&root->inode_lock); 3990 } 3991 3992 static void inode_tree_del(struct inode *inode) 3993 { 3994 struct btrfs_root *root = BTRFS_I(inode)->root; 3995 int empty = 0; 3996 3997 spin_lock(&root->inode_lock); 3998 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 3999 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 4000 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 4001 empty = RB_EMPTY_ROOT(&root->inode_tree); 4002 } 4003 spin_unlock(&root->inode_lock); 4004 4005 /* 4006 * Free space cache has inodes in the tree root, but the tree root has a 4007 * root_refs of 0, so this could end up dropping the tree root as a 4008 * snapshot, so we need the extra !root->fs_info->tree_root check to 4009 * make sure we don't drop it. 4010 */ 4011 if (empty && btrfs_root_refs(&root->root_item) == 0 && 4012 root != root->fs_info->tree_root) { 4013 synchronize_srcu(&root->fs_info->subvol_srcu); 4014 spin_lock(&root->inode_lock); 4015 empty = RB_EMPTY_ROOT(&root->inode_tree); 4016 spin_unlock(&root->inode_lock); 4017 if (empty) 4018 btrfs_add_dead_root(root); 4019 } 4020 } 4021 4022 void btrfs_invalidate_inodes(struct btrfs_root *root) 4023 { 4024 struct rb_node *node; 4025 struct rb_node *prev; 4026 struct btrfs_inode *entry; 4027 struct inode *inode; 4028 u64 objectid = 0; 4029 4030 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4031 4032 spin_lock(&root->inode_lock); 4033 again: 4034 node = root->inode_tree.rb_node; 4035 prev = NULL; 4036 while (node) { 4037 prev = node; 4038 entry = rb_entry(node, struct btrfs_inode, rb_node); 4039 4040 if (objectid < btrfs_ino(&entry->vfs_inode)) 4041 node = node->rb_left; 4042 else if (objectid > btrfs_ino(&entry->vfs_inode)) 4043 node = node->rb_right; 4044 else 4045 break; 4046 } 4047 if (!node) { 4048 while (prev) { 4049 entry = rb_entry(prev, struct btrfs_inode, rb_node); 4050 if (objectid <= btrfs_ino(&entry->vfs_inode)) { 4051 node = prev; 4052 break; 4053 } 4054 prev = rb_next(prev); 4055 } 4056 } 4057 while (node) { 4058 entry = rb_entry(node, struct btrfs_inode, rb_node); 4059 objectid = btrfs_ino(&entry->vfs_inode) + 1; 4060 inode = igrab(&entry->vfs_inode); 4061 if (inode) { 4062 spin_unlock(&root->inode_lock); 4063 if (atomic_read(&inode->i_count) > 1) 4064 d_prune_aliases(inode); 4065 /* 4066 * btrfs_drop_inode will have it removed from 4067 * the inode cache when its usage count 4068 * hits zero. 4069 */ 4070 iput(inode); 4071 cond_resched(); 4072 spin_lock(&root->inode_lock); 4073 goto again; 4074 } 4075 4076 if (cond_resched_lock(&root->inode_lock)) 4077 goto again; 4078 4079 node = rb_next(node); 4080 } 4081 spin_unlock(&root->inode_lock); 4082 } 4083 4084 static int btrfs_init_locked_inode(struct inode *inode, void *p) 4085 { 4086 struct btrfs_iget_args *args = p; 4087 inode->i_ino = args->ino; 4088 BTRFS_I(inode)->root = args->root; 4089 return 0; 4090 } 4091 4092 static int btrfs_find_actor(struct inode *inode, void *opaque) 4093 { 4094 struct btrfs_iget_args *args = opaque; 4095 return args->ino == btrfs_ino(inode) && 4096 args->root == BTRFS_I(inode)->root; 4097 } 4098 4099 static struct inode *btrfs_iget_locked(struct super_block *s, 4100 u64 objectid, 4101 struct btrfs_root *root) 4102 { 4103 struct inode *inode; 4104 struct btrfs_iget_args args; 4105 args.ino = objectid; 4106 args.root = root; 4107 4108 inode = iget5_locked(s, objectid, btrfs_find_actor, 4109 btrfs_init_locked_inode, 4110 (void *)&args); 4111 return inode; 4112 } 4113 4114 /* Get an inode object given its location and corresponding root. 4115 * Returns in *is_new if the inode was read from disk 4116 */ 4117 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 4118 struct btrfs_root *root, int *new) 4119 { 4120 struct inode *inode; 4121 4122 inode = btrfs_iget_locked(s, location->objectid, root); 4123 if (!inode) 4124 return ERR_PTR(-ENOMEM); 4125 4126 if (inode->i_state & I_NEW) { 4127 BTRFS_I(inode)->root = root; 4128 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 4129 btrfs_read_locked_inode(inode); 4130 if (!is_bad_inode(inode)) { 4131 inode_tree_add(inode); 4132 unlock_new_inode(inode); 4133 if (new) 4134 *new = 1; 4135 } else { 4136 unlock_new_inode(inode); 4137 iput(inode); 4138 inode = ERR_PTR(-ESTALE); 4139 } 4140 } 4141 4142 return inode; 4143 } 4144 4145 static struct inode *new_simple_dir(struct super_block *s, 4146 struct btrfs_key *key, 4147 struct btrfs_root *root) 4148 { 4149 struct inode *inode = new_inode(s); 4150 4151 if (!inode) 4152 return ERR_PTR(-ENOMEM); 4153 4154 BTRFS_I(inode)->root = root; 4155 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4156 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); 4157 4158 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 4159 inode->i_op = &btrfs_dir_ro_inode_operations; 4160 inode->i_fop = &simple_dir_operations; 4161 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 4162 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4163 4164 return inode; 4165 } 4166 4167 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 4168 { 4169 struct inode *inode; 4170 struct btrfs_root *root = BTRFS_I(dir)->root; 4171 struct btrfs_root *sub_root = root; 4172 struct btrfs_key location; 4173 int index; 4174 int ret = 0; 4175 4176 if (dentry->d_name.len > BTRFS_NAME_LEN) 4177 return ERR_PTR(-ENAMETOOLONG); 4178 4179 if (unlikely(d_need_lookup(dentry))) { 4180 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); 4181 kfree(dentry->d_fsdata); 4182 dentry->d_fsdata = NULL; 4183 /* This thing is hashed, drop it for now */ 4184 d_drop(dentry); 4185 } else { 4186 ret = btrfs_inode_by_name(dir, dentry, &location); 4187 } 4188 4189 if (ret < 0) 4190 return ERR_PTR(ret); 4191 4192 if (location.objectid == 0) 4193 return NULL; 4194 4195 if (location.type == BTRFS_INODE_ITEM_KEY) { 4196 inode = btrfs_iget(dir->i_sb, &location, root, NULL); 4197 return inode; 4198 } 4199 4200 BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); 4201 4202 index = srcu_read_lock(&root->fs_info->subvol_srcu); 4203 ret = fixup_tree_root_location(root, dir, dentry, 4204 &location, &sub_root); 4205 if (ret < 0) { 4206 if (ret != -ENOENT) 4207 inode = ERR_PTR(ret); 4208 else 4209 inode = new_simple_dir(dir->i_sb, &location, sub_root); 4210 } else { 4211 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); 4212 } 4213 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4214 4215 if (!IS_ERR(inode) && root != sub_root) { 4216 down_read(&root->fs_info->cleanup_work_sem); 4217 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4218 ret = btrfs_orphan_cleanup(sub_root); 4219 up_read(&root->fs_info->cleanup_work_sem); 4220 if (ret) 4221 inode = ERR_PTR(ret); 4222 } 4223 4224 return inode; 4225 } 4226 4227 static int btrfs_dentry_delete(const struct dentry *dentry) 4228 { 4229 struct btrfs_root *root; 4230 struct inode *inode = dentry->d_inode; 4231 4232 if (!inode && !IS_ROOT(dentry)) 4233 inode = dentry->d_parent->d_inode; 4234 4235 if (inode) { 4236 root = BTRFS_I(inode)->root; 4237 if (btrfs_root_refs(&root->root_item) == 0) 4238 return 1; 4239 4240 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 4241 return 1; 4242 } 4243 return 0; 4244 } 4245 4246 static void btrfs_dentry_release(struct dentry *dentry) 4247 { 4248 if (dentry->d_fsdata) 4249 kfree(dentry->d_fsdata); 4250 } 4251 4252 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 4253 unsigned int flags) 4254 { 4255 struct dentry *ret; 4256 4257 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); 4258 if (unlikely(d_need_lookup(dentry))) { 4259 spin_lock(&dentry->d_lock); 4260 dentry->d_flags &= ~DCACHE_NEED_LOOKUP; 4261 spin_unlock(&dentry->d_lock); 4262 } 4263 return ret; 4264 } 4265 4266 unsigned char btrfs_filetype_table[] = { 4267 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 4268 }; 4269 4270 static int btrfs_real_readdir(struct file *filp, void *dirent, 4271 filldir_t filldir) 4272 { 4273 struct inode *inode = filp->f_dentry->d_inode; 4274 struct btrfs_root *root = BTRFS_I(inode)->root; 4275 struct btrfs_item *item; 4276 struct btrfs_dir_item *di; 4277 struct btrfs_key key; 4278 struct btrfs_key found_key; 4279 struct btrfs_path *path; 4280 struct list_head ins_list; 4281 struct list_head del_list; 4282 int ret; 4283 struct extent_buffer *leaf; 4284 int slot; 4285 unsigned char d_type; 4286 int over = 0; 4287 u32 di_cur; 4288 u32 di_total; 4289 u32 di_len; 4290 int key_type = BTRFS_DIR_INDEX_KEY; 4291 char tmp_name[32]; 4292 char *name_ptr; 4293 int name_len; 4294 int is_curr = 0; /* filp->f_pos points to the current index? */ 4295 4296 /* FIXME, use a real flag for deciding about the key type */ 4297 if (root->fs_info->tree_root == root) 4298 key_type = BTRFS_DIR_ITEM_KEY; 4299 4300 /* special case for "." */ 4301 if (filp->f_pos == 0) { 4302 over = filldir(dirent, ".", 1, 4303 filp->f_pos, btrfs_ino(inode), DT_DIR); 4304 if (over) 4305 return 0; 4306 filp->f_pos = 1; 4307 } 4308 /* special case for .., just use the back ref */ 4309 if (filp->f_pos == 1) { 4310 u64 pino = parent_ino(filp->f_path.dentry); 4311 over = filldir(dirent, "..", 2, 4312 filp->f_pos, pino, DT_DIR); 4313 if (over) 4314 return 0; 4315 filp->f_pos = 2; 4316 } 4317 path = btrfs_alloc_path(); 4318 if (!path) 4319 return -ENOMEM; 4320 4321 path->reada = 1; 4322 4323 if (key_type == BTRFS_DIR_INDEX_KEY) { 4324 INIT_LIST_HEAD(&ins_list); 4325 INIT_LIST_HEAD(&del_list); 4326 btrfs_get_delayed_items(inode, &ins_list, &del_list); 4327 } 4328 4329 btrfs_set_key_type(&key, key_type); 4330 key.offset = filp->f_pos; 4331 key.objectid = btrfs_ino(inode); 4332 4333 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4334 if (ret < 0) 4335 goto err; 4336 4337 while (1) { 4338 leaf = path->nodes[0]; 4339 slot = path->slots[0]; 4340 if (slot >= btrfs_header_nritems(leaf)) { 4341 ret = btrfs_next_leaf(root, path); 4342 if (ret < 0) 4343 goto err; 4344 else if (ret > 0) 4345 break; 4346 continue; 4347 } 4348 4349 item = btrfs_item_nr(leaf, slot); 4350 btrfs_item_key_to_cpu(leaf, &found_key, slot); 4351 4352 if (found_key.objectid != key.objectid) 4353 break; 4354 if (btrfs_key_type(&found_key) != key_type) 4355 break; 4356 if (found_key.offset < filp->f_pos) 4357 goto next; 4358 if (key_type == BTRFS_DIR_INDEX_KEY && 4359 btrfs_should_delete_dir_index(&del_list, 4360 found_key.offset)) 4361 goto next; 4362 4363 filp->f_pos = found_key.offset; 4364 is_curr = 1; 4365 4366 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 4367 di_cur = 0; 4368 di_total = btrfs_item_size(leaf, item); 4369 4370 while (di_cur < di_total) { 4371 struct btrfs_key location; 4372 4373 if (verify_dir_item(root, leaf, di)) 4374 break; 4375 4376 name_len = btrfs_dir_name_len(leaf, di); 4377 if (name_len <= sizeof(tmp_name)) { 4378 name_ptr = tmp_name; 4379 } else { 4380 name_ptr = kmalloc(name_len, GFP_NOFS); 4381 if (!name_ptr) { 4382 ret = -ENOMEM; 4383 goto err; 4384 } 4385 } 4386 read_extent_buffer(leaf, name_ptr, 4387 (unsigned long)(di + 1), name_len); 4388 4389 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; 4390 btrfs_dir_item_key_to_cpu(leaf, di, &location); 4391 4392 4393 /* is this a reference to our own snapshot? If so 4394 * skip it. 4395 * 4396 * In contrast to old kernels, we insert the snapshot's 4397 * dir item and dir index after it has been created, so 4398 * we won't find a reference to our own snapshot. We 4399 * still keep the following code for backward 4400 * compatibility. 4401 */ 4402 if (location.type == BTRFS_ROOT_ITEM_KEY && 4403 location.objectid == root->root_key.objectid) { 4404 over = 0; 4405 goto skip; 4406 } 4407 over = filldir(dirent, name_ptr, name_len, 4408 found_key.offset, location.objectid, 4409 d_type); 4410 4411 skip: 4412 if (name_ptr != tmp_name) 4413 kfree(name_ptr); 4414 4415 if (over) 4416 goto nopos; 4417 di_len = btrfs_dir_name_len(leaf, di) + 4418 btrfs_dir_data_len(leaf, di) + sizeof(*di); 4419 di_cur += di_len; 4420 di = (struct btrfs_dir_item *)((char *)di + di_len); 4421 } 4422 next: 4423 path->slots[0]++; 4424 } 4425 4426 if (key_type == BTRFS_DIR_INDEX_KEY) { 4427 if (is_curr) 4428 filp->f_pos++; 4429 ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir, 4430 &ins_list); 4431 if (ret) 4432 goto nopos; 4433 } 4434 4435 /* Reached end of directory/root. Bump pos past the last item. */ 4436 if (key_type == BTRFS_DIR_INDEX_KEY) 4437 /* 4438 * 32-bit glibc will use getdents64, but then strtol - 4439 * so the last number we can serve is this. 4440 */ 4441 filp->f_pos = 0x7fffffff; 4442 else 4443 filp->f_pos++; 4444 nopos: 4445 ret = 0; 4446 err: 4447 if (key_type == BTRFS_DIR_INDEX_KEY) 4448 btrfs_put_delayed_items(&ins_list, &del_list); 4449 btrfs_free_path(path); 4450 return ret; 4451 } 4452 4453 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) 4454 { 4455 struct btrfs_root *root = BTRFS_I(inode)->root; 4456 struct btrfs_trans_handle *trans; 4457 int ret = 0; 4458 bool nolock = false; 4459 4460 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 4461 return 0; 4462 4463 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode)) 4464 nolock = true; 4465 4466 if (wbc->sync_mode == WB_SYNC_ALL) { 4467 if (nolock) 4468 trans = btrfs_join_transaction_nolock(root); 4469 else 4470 trans = btrfs_join_transaction(root); 4471 if (IS_ERR(trans)) 4472 return PTR_ERR(trans); 4473 if (nolock) 4474 ret = btrfs_end_transaction_nolock(trans, root); 4475 else 4476 ret = btrfs_commit_transaction(trans, root); 4477 } 4478 return ret; 4479 } 4480 4481 /* 4482 * This is somewhat expensive, updating the tree every time the 4483 * inode changes. But, it is most likely to find the inode in cache. 4484 * FIXME, needs more benchmarking...there are no reasons other than performance 4485 * to keep or drop this code. 4486 */ 4487 int btrfs_dirty_inode(struct inode *inode) 4488 { 4489 struct btrfs_root *root = BTRFS_I(inode)->root; 4490 struct btrfs_trans_handle *trans; 4491 int ret; 4492 4493 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 4494 return 0; 4495 4496 trans = btrfs_join_transaction(root); 4497 if (IS_ERR(trans)) 4498 return PTR_ERR(trans); 4499 4500 ret = btrfs_update_inode(trans, root, inode); 4501 if (ret && ret == -ENOSPC) { 4502 /* whoops, lets try again with the full transaction */ 4503 btrfs_end_transaction(trans, root); 4504 trans = btrfs_start_transaction(root, 1); 4505 if (IS_ERR(trans)) 4506 return PTR_ERR(trans); 4507 4508 ret = btrfs_update_inode(trans, root, inode); 4509 } 4510 btrfs_end_transaction(trans, root); 4511 if (BTRFS_I(inode)->delayed_node) 4512 btrfs_balance_delayed_items(root); 4513 4514 return ret; 4515 } 4516 4517 /* 4518 * This is a copy of file_update_time. We need this so we can return error on 4519 * ENOSPC for updating the inode in the case of file write and mmap writes. 4520 */ 4521 static int btrfs_update_time(struct inode *inode, struct timespec *now, 4522 int flags) 4523 { 4524 struct btrfs_root *root = BTRFS_I(inode)->root; 4525 4526 if (btrfs_root_readonly(root)) 4527 return -EROFS; 4528 4529 if (flags & S_VERSION) 4530 inode_inc_iversion(inode); 4531 if (flags & S_CTIME) 4532 inode->i_ctime = *now; 4533 if (flags & S_MTIME) 4534 inode->i_mtime = *now; 4535 if (flags & S_ATIME) 4536 inode->i_atime = *now; 4537 return btrfs_dirty_inode(inode); 4538 } 4539 4540 /* 4541 * find the highest existing sequence number in a directory 4542 * and then set the in-memory index_cnt variable to reflect 4543 * free sequence numbers 4544 */ 4545 static int btrfs_set_inode_index_count(struct inode *inode) 4546 { 4547 struct btrfs_root *root = BTRFS_I(inode)->root; 4548 struct btrfs_key key, found_key; 4549 struct btrfs_path *path; 4550 struct extent_buffer *leaf; 4551 int ret; 4552 4553 key.objectid = btrfs_ino(inode); 4554 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 4555 key.offset = (u64)-1; 4556 4557 path = btrfs_alloc_path(); 4558 if (!path) 4559 return -ENOMEM; 4560 4561 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4562 if (ret < 0) 4563 goto out; 4564 /* FIXME: we should be able to handle this */ 4565 if (ret == 0) 4566 goto out; 4567 ret = 0; 4568 4569 /* 4570 * MAGIC NUMBER EXPLANATION: 4571 * since we search a directory based on f_pos we have to start at 2 4572 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody 4573 * else has to start at 2 4574 */ 4575 if (path->slots[0] == 0) { 4576 BTRFS_I(inode)->index_cnt = 2; 4577 goto out; 4578 } 4579 4580 path->slots[0]--; 4581 4582 leaf = path->nodes[0]; 4583 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4584 4585 if (found_key.objectid != btrfs_ino(inode) || 4586 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { 4587 BTRFS_I(inode)->index_cnt = 2; 4588 goto out; 4589 } 4590 4591 BTRFS_I(inode)->index_cnt = found_key.offset + 1; 4592 out: 4593 btrfs_free_path(path); 4594 return ret; 4595 } 4596 4597 /* 4598 * helper to find a free sequence number in a given directory. This current 4599 * code is very simple, later versions will do smarter things in the btree 4600 */ 4601 int btrfs_set_inode_index(struct inode *dir, u64 *index) 4602 { 4603 int ret = 0; 4604 4605 if (BTRFS_I(dir)->index_cnt == (u64)-1) { 4606 ret = btrfs_inode_delayed_dir_index_count(dir); 4607 if (ret) { 4608 ret = btrfs_set_inode_index_count(dir); 4609 if (ret) 4610 return ret; 4611 } 4612 } 4613 4614 *index = BTRFS_I(dir)->index_cnt; 4615 BTRFS_I(dir)->index_cnt++; 4616 4617 return ret; 4618 } 4619 4620 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 4621 struct btrfs_root *root, 4622 struct inode *dir, 4623 const char *name, int name_len, 4624 u64 ref_objectid, u64 objectid, 4625 umode_t mode, u64 *index) 4626 { 4627 struct inode *inode; 4628 struct btrfs_inode_item *inode_item; 4629 struct btrfs_key *location; 4630 struct btrfs_path *path; 4631 struct btrfs_inode_ref *ref; 4632 struct btrfs_key key[2]; 4633 u32 sizes[2]; 4634 unsigned long ptr; 4635 int ret; 4636 int owner; 4637 4638 path = btrfs_alloc_path(); 4639 if (!path) 4640 return ERR_PTR(-ENOMEM); 4641 4642 inode = new_inode(root->fs_info->sb); 4643 if (!inode) { 4644 btrfs_free_path(path); 4645 return ERR_PTR(-ENOMEM); 4646 } 4647 4648 /* 4649 * we have to initialize this early, so we can reclaim the inode 4650 * number if we fail afterwards in this function. 4651 */ 4652 inode->i_ino = objectid; 4653 4654 if (dir) { 4655 trace_btrfs_inode_request(dir); 4656 4657 ret = btrfs_set_inode_index(dir, index); 4658 if (ret) { 4659 btrfs_free_path(path); 4660 iput(inode); 4661 return ERR_PTR(ret); 4662 } 4663 } 4664 /* 4665 * index_cnt is ignored for everything but a dir, 4666 * btrfs_get_inode_index_count has an explanation for the magic 4667 * number 4668 */ 4669 BTRFS_I(inode)->index_cnt = 2; 4670 BTRFS_I(inode)->root = root; 4671 BTRFS_I(inode)->generation = trans->transid; 4672 inode->i_generation = BTRFS_I(inode)->generation; 4673 4674 if (S_ISDIR(mode)) 4675 owner = 0; 4676 else 4677 owner = 1; 4678 4679 key[0].objectid = objectid; 4680 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 4681 key[0].offset = 0; 4682 4683 key[1].objectid = objectid; 4684 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 4685 key[1].offset = ref_objectid; 4686 4687 sizes[0] = sizeof(struct btrfs_inode_item); 4688 sizes[1] = name_len + sizeof(*ref); 4689 4690 path->leave_spinning = 1; 4691 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 4692 if (ret != 0) 4693 goto fail; 4694 4695 inode_init_owner(inode, dir, mode); 4696 inode_set_bytes(inode, 0); 4697 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4698 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4699 struct btrfs_inode_item); 4700 memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, 4701 sizeof(*inode_item)); 4702 fill_inode_item(trans, path->nodes[0], inode_item, inode); 4703 4704 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 4705 struct btrfs_inode_ref); 4706 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 4707 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 4708 ptr = (unsigned long)(ref + 1); 4709 write_extent_buffer(path->nodes[0], name, ptr, name_len); 4710 4711 btrfs_mark_buffer_dirty(path->nodes[0]); 4712 btrfs_free_path(path); 4713 4714 location = &BTRFS_I(inode)->location; 4715 location->objectid = objectid; 4716 location->offset = 0; 4717 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 4718 4719 btrfs_inherit_iflags(inode, dir); 4720 4721 if (S_ISREG(mode)) { 4722 if (btrfs_test_opt(root, NODATASUM)) 4723 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4724 if (btrfs_test_opt(root, NODATACOW) || 4725 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) 4726 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4727 } 4728 4729 insert_inode_hash(inode); 4730 inode_tree_add(inode); 4731 4732 trace_btrfs_inode_new(inode); 4733 btrfs_set_inode_last_trans(trans, inode); 4734 4735 btrfs_update_root_times(trans, root); 4736 4737 return inode; 4738 fail: 4739 if (dir) 4740 BTRFS_I(dir)->index_cnt--; 4741 btrfs_free_path(path); 4742 iput(inode); 4743 return ERR_PTR(ret); 4744 } 4745 4746 static inline u8 btrfs_inode_type(struct inode *inode) 4747 { 4748 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; 4749 } 4750 4751 /* 4752 * utility function to add 'inode' into 'parent_inode' with 4753 * a give name and a given sequence number. 4754 * if 'add_backref' is true, also insert a backref from the 4755 * inode to the parent directory. 4756 */ 4757 int btrfs_add_link(struct btrfs_trans_handle *trans, 4758 struct inode *parent_inode, struct inode *inode, 4759 const char *name, int name_len, int add_backref, u64 index) 4760 { 4761 int ret = 0; 4762 struct btrfs_key key; 4763 struct btrfs_root *root = BTRFS_I(parent_inode)->root; 4764 u64 ino = btrfs_ino(inode); 4765 u64 parent_ino = btrfs_ino(parent_inode); 4766 4767 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 4768 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); 4769 } else { 4770 key.objectid = ino; 4771 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 4772 key.offset = 0; 4773 } 4774 4775 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 4776 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 4777 key.objectid, root->root_key.objectid, 4778 parent_ino, index, name, name_len); 4779 } else if (add_backref) { 4780 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, 4781 parent_ino, index); 4782 } 4783 4784 /* Nothing to clean up yet */ 4785 if (ret) 4786 return ret; 4787 4788 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4789 parent_inode, &key, 4790 btrfs_inode_type(inode), index); 4791 if (ret == -EEXIST) 4792 goto fail_dir_item; 4793 else if (ret) { 4794 btrfs_abort_transaction(trans, root, ret); 4795 return ret; 4796 } 4797 4798 btrfs_i_size_write(parent_inode, parent_inode->i_size + 4799 name_len * 2); 4800 inode_inc_iversion(parent_inode); 4801 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 4802 ret = btrfs_update_inode(trans, root, parent_inode); 4803 if (ret) 4804 btrfs_abort_transaction(trans, root, ret); 4805 return ret; 4806 4807 fail_dir_item: 4808 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 4809 u64 local_index; 4810 int err; 4811 err = btrfs_del_root_ref(trans, root->fs_info->tree_root, 4812 key.objectid, root->root_key.objectid, 4813 parent_ino, &local_index, name, name_len); 4814 4815 } else if (add_backref) { 4816 u64 local_index; 4817 int err; 4818 4819 err = btrfs_del_inode_ref(trans, root, name, name_len, 4820 ino, parent_ino, &local_index); 4821 } 4822 return ret; 4823 } 4824 4825 static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 4826 struct inode *dir, struct dentry *dentry, 4827 struct inode *inode, int backref, u64 index) 4828 { 4829 int err = btrfs_add_link(trans, dir, inode, 4830 dentry->d_name.name, dentry->d_name.len, 4831 backref, index); 4832 if (err > 0) 4833 err = -EEXIST; 4834 return err; 4835 } 4836 4837 static int btrfs_mknod(struct inode *dir, struct dentry *dentry, 4838 umode_t mode, dev_t rdev) 4839 { 4840 struct btrfs_trans_handle *trans; 4841 struct btrfs_root *root = BTRFS_I(dir)->root; 4842 struct inode *inode = NULL; 4843 int err; 4844 int drop_inode = 0; 4845 u64 objectid; 4846 unsigned long nr = 0; 4847 u64 index = 0; 4848 4849 if (!new_valid_dev(rdev)) 4850 return -EINVAL; 4851 4852 /* 4853 * 2 for inode item and ref 4854 * 2 for dir items 4855 * 1 for xattr if selinux is on 4856 */ 4857 trans = btrfs_start_transaction(root, 5); 4858 if (IS_ERR(trans)) 4859 return PTR_ERR(trans); 4860 4861 err = btrfs_find_free_ino(root, &objectid); 4862 if (err) 4863 goto out_unlock; 4864 4865 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4866 dentry->d_name.len, btrfs_ino(dir), objectid, 4867 mode, &index); 4868 if (IS_ERR(inode)) { 4869 err = PTR_ERR(inode); 4870 goto out_unlock; 4871 } 4872 4873 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 4874 if (err) { 4875 drop_inode = 1; 4876 goto out_unlock; 4877 } 4878 4879 /* 4880 * If the active LSM wants to access the inode during 4881 * d_instantiate it needs these. Smack checks to see 4882 * if the filesystem supports xattrs by looking at the 4883 * ops vector. 4884 */ 4885 4886 inode->i_op = &btrfs_special_inode_operations; 4887 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4888 if (err) 4889 drop_inode = 1; 4890 else { 4891 init_special_inode(inode, inode->i_mode, rdev); 4892 btrfs_update_inode(trans, root, inode); 4893 d_instantiate(dentry, inode); 4894 } 4895 out_unlock: 4896 nr = trans->blocks_used; 4897 btrfs_end_transaction(trans, root); 4898 btrfs_btree_balance_dirty(root, nr); 4899 if (drop_inode) { 4900 inode_dec_link_count(inode); 4901 iput(inode); 4902 } 4903 return err; 4904 } 4905 4906 static int btrfs_create(struct inode *dir, struct dentry *dentry, 4907 umode_t mode, bool excl) 4908 { 4909 struct btrfs_trans_handle *trans; 4910 struct btrfs_root *root = BTRFS_I(dir)->root; 4911 struct inode *inode = NULL; 4912 int drop_inode = 0; 4913 int err; 4914 unsigned long nr = 0; 4915 u64 objectid; 4916 u64 index = 0; 4917 4918 /* 4919 * 2 for inode item and ref 4920 * 2 for dir items 4921 * 1 for xattr if selinux is on 4922 */ 4923 trans = btrfs_start_transaction(root, 5); 4924 if (IS_ERR(trans)) 4925 return PTR_ERR(trans); 4926 4927 err = btrfs_find_free_ino(root, &objectid); 4928 if (err) 4929 goto out_unlock; 4930 4931 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4932 dentry->d_name.len, btrfs_ino(dir), objectid, 4933 mode, &index); 4934 if (IS_ERR(inode)) { 4935 err = PTR_ERR(inode); 4936 goto out_unlock; 4937 } 4938 4939 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 4940 if (err) { 4941 drop_inode = 1; 4942 goto out_unlock; 4943 } 4944 4945 /* 4946 * If the active LSM wants to access the inode during 4947 * d_instantiate it needs these. Smack checks to see 4948 * if the filesystem supports xattrs by looking at the 4949 * ops vector. 4950 */ 4951 inode->i_fop = &btrfs_file_operations; 4952 inode->i_op = &btrfs_file_inode_operations; 4953 4954 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4955 if (err) 4956 drop_inode = 1; 4957 else { 4958 inode->i_mapping->a_ops = &btrfs_aops; 4959 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 4960 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4961 d_instantiate(dentry, inode); 4962 } 4963 out_unlock: 4964 nr = trans->blocks_used; 4965 btrfs_end_transaction(trans, root); 4966 if (drop_inode) { 4967 inode_dec_link_count(inode); 4968 iput(inode); 4969 } 4970 btrfs_btree_balance_dirty(root, nr); 4971 return err; 4972 } 4973 4974 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 4975 struct dentry *dentry) 4976 { 4977 struct btrfs_trans_handle *trans; 4978 struct btrfs_root *root = BTRFS_I(dir)->root; 4979 struct inode *inode = old_dentry->d_inode; 4980 u64 index; 4981 unsigned long nr = 0; 4982 int err; 4983 int drop_inode = 0; 4984 4985 /* do not allow sys_link's with other subvols of the same device */ 4986 if (root->objectid != BTRFS_I(inode)->root->objectid) 4987 return -EXDEV; 4988 4989 if (inode->i_nlink == ~0U) 4990 return -EMLINK; 4991 4992 err = btrfs_set_inode_index(dir, &index); 4993 if (err) 4994 goto fail; 4995 4996 /* 4997 * 2 items for inode and inode ref 4998 * 2 items for dir items 4999 * 1 item for parent inode 5000 */ 5001 trans = btrfs_start_transaction(root, 5); 5002 if (IS_ERR(trans)) { 5003 err = PTR_ERR(trans); 5004 goto fail; 5005 } 5006 5007 btrfs_inc_nlink(inode); 5008 inode_inc_iversion(inode); 5009 inode->i_ctime = CURRENT_TIME; 5010 ihold(inode); 5011 5012 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 5013 5014 if (err) { 5015 drop_inode = 1; 5016 } else { 5017 struct dentry *parent = dentry->d_parent; 5018 err = btrfs_update_inode(trans, root, inode); 5019 if (err) 5020 goto fail; 5021 d_instantiate(dentry, inode); 5022 btrfs_log_new_name(trans, inode, NULL, parent); 5023 } 5024 5025 nr = trans->blocks_used; 5026 btrfs_end_transaction(trans, root); 5027 fail: 5028 if (drop_inode) { 5029 inode_dec_link_count(inode); 5030 iput(inode); 5031 } 5032 btrfs_btree_balance_dirty(root, nr); 5033 return err; 5034 } 5035 5036 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 5037 { 5038 struct inode *inode = NULL; 5039 struct btrfs_trans_handle *trans; 5040 struct btrfs_root *root = BTRFS_I(dir)->root; 5041 int err = 0; 5042 int drop_on_err = 0; 5043 u64 objectid = 0; 5044 u64 index = 0; 5045 unsigned long nr = 1; 5046 5047 /* 5048 * 2 items for inode and ref 5049 * 2 items for dir items 5050 * 1 for xattr if selinux is on 5051 */ 5052 trans = btrfs_start_transaction(root, 5); 5053 if (IS_ERR(trans)) 5054 return PTR_ERR(trans); 5055 5056 err = btrfs_find_free_ino(root, &objectid); 5057 if (err) 5058 goto out_fail; 5059 5060 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 5061 dentry->d_name.len, btrfs_ino(dir), objectid, 5062 S_IFDIR | mode, &index); 5063 if (IS_ERR(inode)) { 5064 err = PTR_ERR(inode); 5065 goto out_fail; 5066 } 5067 5068 drop_on_err = 1; 5069 5070 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5071 if (err) 5072 goto out_fail; 5073 5074 inode->i_op = &btrfs_dir_inode_operations; 5075 inode->i_fop = &btrfs_dir_file_operations; 5076 5077 btrfs_i_size_write(inode, 0); 5078 err = btrfs_update_inode(trans, root, inode); 5079 if (err) 5080 goto out_fail; 5081 5082 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, 5083 dentry->d_name.len, 0, index); 5084 if (err) 5085 goto out_fail; 5086 5087 d_instantiate(dentry, inode); 5088 drop_on_err = 0; 5089 5090 out_fail: 5091 nr = trans->blocks_used; 5092 btrfs_end_transaction(trans, root); 5093 if (drop_on_err) 5094 iput(inode); 5095 btrfs_btree_balance_dirty(root, nr); 5096 return err; 5097 } 5098 5099 /* helper for btfs_get_extent. Given an existing extent in the tree, 5100 * and an extent that you want to insert, deal with overlap and insert 5101 * the new extent into the tree. 5102 */ 5103 static int merge_extent_mapping(struct extent_map_tree *em_tree, 5104 struct extent_map *existing, 5105 struct extent_map *em, 5106 u64 map_start, u64 map_len) 5107 { 5108 u64 start_diff; 5109 5110 BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); 5111 start_diff = map_start - em->start; 5112 em->start = map_start; 5113 em->len = map_len; 5114 if (em->block_start < EXTENT_MAP_LAST_BYTE && 5115 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 5116 em->block_start += start_diff; 5117 em->block_len -= start_diff; 5118 } 5119 return add_extent_mapping(em_tree, em); 5120 } 5121 5122 static noinline int uncompress_inline(struct btrfs_path *path, 5123 struct inode *inode, struct page *page, 5124 size_t pg_offset, u64 extent_offset, 5125 struct btrfs_file_extent_item *item) 5126 { 5127 int ret; 5128 struct extent_buffer *leaf = path->nodes[0]; 5129 char *tmp; 5130 size_t max_size; 5131 unsigned long inline_size; 5132 unsigned long ptr; 5133 int compress_type; 5134 5135 WARN_ON(pg_offset != 0); 5136 compress_type = btrfs_file_extent_compression(leaf, item); 5137 max_size = btrfs_file_extent_ram_bytes(leaf, item); 5138 inline_size = btrfs_file_extent_inline_item_len(leaf, 5139 btrfs_item_nr(leaf, path->slots[0])); 5140 tmp = kmalloc(inline_size, GFP_NOFS); 5141 if (!tmp) 5142 return -ENOMEM; 5143 ptr = btrfs_file_extent_inline_start(item); 5144 5145 read_extent_buffer(leaf, tmp, ptr, inline_size); 5146 5147 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 5148 ret = btrfs_decompress(compress_type, tmp, page, 5149 extent_offset, inline_size, max_size); 5150 if (ret) { 5151 char *kaddr = kmap_atomic(page); 5152 unsigned long copy_size = min_t(u64, 5153 PAGE_CACHE_SIZE - pg_offset, 5154 max_size - extent_offset); 5155 memset(kaddr + pg_offset, 0, copy_size); 5156 kunmap_atomic(kaddr); 5157 } 5158 kfree(tmp); 5159 return 0; 5160 } 5161 5162 /* 5163 * a bit scary, this does extent mapping from logical file offset to the disk. 5164 * the ugly parts come from merging extents from the disk with the in-ram 5165 * representation. This gets more complex because of the data=ordered code, 5166 * where the in-ram extents might be locked pending data=ordered completion. 5167 * 5168 * This also copies inline extents directly into the page. 5169 */ 5170 5171 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 5172 size_t pg_offset, u64 start, u64 len, 5173 int create) 5174 { 5175 int ret; 5176 int err = 0; 5177 u64 bytenr; 5178 u64 extent_start = 0; 5179 u64 extent_end = 0; 5180 u64 objectid = btrfs_ino(inode); 5181 u32 found_type; 5182 struct btrfs_path *path = NULL; 5183 struct btrfs_root *root = BTRFS_I(inode)->root; 5184 struct btrfs_file_extent_item *item; 5185 struct extent_buffer *leaf; 5186 struct btrfs_key found_key; 5187 struct extent_map *em = NULL; 5188 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5189 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5190 struct btrfs_trans_handle *trans = NULL; 5191 int compress_type; 5192 5193 again: 5194 read_lock(&em_tree->lock); 5195 em = lookup_extent_mapping(em_tree, start, len); 5196 if (em) 5197 em->bdev = root->fs_info->fs_devices->latest_bdev; 5198 read_unlock(&em_tree->lock); 5199 5200 if (em) { 5201 if (em->start > start || em->start + em->len <= start) 5202 free_extent_map(em); 5203 else if (em->block_start == EXTENT_MAP_INLINE && page) 5204 free_extent_map(em); 5205 else 5206 goto out; 5207 } 5208 em = alloc_extent_map(); 5209 if (!em) { 5210 err = -ENOMEM; 5211 goto out; 5212 } 5213 em->bdev = root->fs_info->fs_devices->latest_bdev; 5214 em->start = EXTENT_MAP_HOLE; 5215 em->orig_start = EXTENT_MAP_HOLE; 5216 em->len = (u64)-1; 5217 em->block_len = (u64)-1; 5218 5219 if (!path) { 5220 path = btrfs_alloc_path(); 5221 if (!path) { 5222 err = -ENOMEM; 5223 goto out; 5224 } 5225 /* 5226 * Chances are we'll be called again, so go ahead and do 5227 * readahead 5228 */ 5229 path->reada = 1; 5230 } 5231 5232 ret = btrfs_lookup_file_extent(trans, root, path, 5233 objectid, start, trans != NULL); 5234 if (ret < 0) { 5235 err = ret; 5236 goto out; 5237 } 5238 5239 if (ret != 0) { 5240 if (path->slots[0] == 0) 5241 goto not_found; 5242 path->slots[0]--; 5243 } 5244 5245 leaf = path->nodes[0]; 5246 item = btrfs_item_ptr(leaf, path->slots[0], 5247 struct btrfs_file_extent_item); 5248 /* are we inside the extent that was found? */ 5249 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5250 found_type = btrfs_key_type(&found_key); 5251 if (found_key.objectid != objectid || 5252 found_type != BTRFS_EXTENT_DATA_KEY) { 5253 goto not_found; 5254 } 5255 5256 found_type = btrfs_file_extent_type(leaf, item); 5257 extent_start = found_key.offset; 5258 compress_type = btrfs_file_extent_compression(leaf, item); 5259 if (found_type == BTRFS_FILE_EXTENT_REG || 5260 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5261 extent_end = extent_start + 5262 btrfs_file_extent_num_bytes(leaf, item); 5263 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 5264 size_t size; 5265 size = btrfs_file_extent_inline_len(leaf, item); 5266 extent_end = (extent_start + size + root->sectorsize - 1) & 5267 ~((u64)root->sectorsize - 1); 5268 } 5269 5270 if (start >= extent_end) { 5271 path->slots[0]++; 5272 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 5273 ret = btrfs_next_leaf(root, path); 5274 if (ret < 0) { 5275 err = ret; 5276 goto out; 5277 } 5278 if (ret > 0) 5279 goto not_found; 5280 leaf = path->nodes[0]; 5281 } 5282 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5283 if (found_key.objectid != objectid || 5284 found_key.type != BTRFS_EXTENT_DATA_KEY) 5285 goto not_found; 5286 if (start + len <= found_key.offset) 5287 goto not_found; 5288 em->start = start; 5289 em->len = found_key.offset - start; 5290 goto not_found_em; 5291 } 5292 5293 if (found_type == BTRFS_FILE_EXTENT_REG || 5294 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5295 em->start = extent_start; 5296 em->len = extent_end - extent_start; 5297 em->orig_start = extent_start - 5298 btrfs_file_extent_offset(leaf, item); 5299 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 5300 if (bytenr == 0) { 5301 em->block_start = EXTENT_MAP_HOLE; 5302 goto insert; 5303 } 5304 if (compress_type != BTRFS_COMPRESS_NONE) { 5305 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5306 em->compress_type = compress_type; 5307 em->block_start = bytenr; 5308 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5309 item); 5310 } else { 5311 bytenr += btrfs_file_extent_offset(leaf, item); 5312 em->block_start = bytenr; 5313 em->block_len = em->len; 5314 if (found_type == BTRFS_FILE_EXTENT_PREALLOC) 5315 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 5316 } 5317 goto insert; 5318 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 5319 unsigned long ptr; 5320 char *map; 5321 size_t size; 5322 size_t extent_offset; 5323 size_t copy_size; 5324 5325 em->block_start = EXTENT_MAP_INLINE; 5326 if (!page || create) { 5327 em->start = extent_start; 5328 em->len = extent_end - extent_start; 5329 goto out; 5330 } 5331 5332 size = btrfs_file_extent_inline_len(leaf, item); 5333 extent_offset = page_offset(page) + pg_offset - extent_start; 5334 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 5335 size - extent_offset); 5336 em->start = extent_start + extent_offset; 5337 em->len = (copy_size + root->sectorsize - 1) & 5338 ~((u64)root->sectorsize - 1); 5339 em->orig_start = EXTENT_MAP_INLINE; 5340 if (compress_type) { 5341 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5342 em->compress_type = compress_type; 5343 } 5344 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 5345 if (create == 0 && !PageUptodate(page)) { 5346 if (btrfs_file_extent_compression(leaf, item) != 5347 BTRFS_COMPRESS_NONE) { 5348 ret = uncompress_inline(path, inode, page, 5349 pg_offset, 5350 extent_offset, item); 5351 BUG_ON(ret); /* -ENOMEM */ 5352 } else { 5353 map = kmap(page); 5354 read_extent_buffer(leaf, map + pg_offset, ptr, 5355 copy_size); 5356 if (pg_offset + copy_size < PAGE_CACHE_SIZE) { 5357 memset(map + pg_offset + copy_size, 0, 5358 PAGE_CACHE_SIZE - pg_offset - 5359 copy_size); 5360 } 5361 kunmap(page); 5362 } 5363 flush_dcache_page(page); 5364 } else if (create && PageUptodate(page)) { 5365 BUG(); 5366 if (!trans) { 5367 kunmap(page); 5368 free_extent_map(em); 5369 em = NULL; 5370 5371 btrfs_release_path(path); 5372 trans = btrfs_join_transaction(root); 5373 5374 if (IS_ERR(trans)) 5375 return ERR_CAST(trans); 5376 goto again; 5377 } 5378 map = kmap(page); 5379 write_extent_buffer(leaf, map + pg_offset, ptr, 5380 copy_size); 5381 kunmap(page); 5382 btrfs_mark_buffer_dirty(leaf); 5383 } 5384 set_extent_uptodate(io_tree, em->start, 5385 extent_map_end(em) - 1, NULL, GFP_NOFS); 5386 goto insert; 5387 } else { 5388 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5389 WARN_ON(1); 5390 } 5391 not_found: 5392 em->start = start; 5393 em->len = len; 5394 not_found_em: 5395 em->block_start = EXTENT_MAP_HOLE; 5396 set_bit(EXTENT_FLAG_VACANCY, &em->flags); 5397 insert: 5398 btrfs_release_path(path); 5399 if (em->start > start || extent_map_end(em) <= start) { 5400 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " 5401 "[%llu %llu]\n", (unsigned long long)em->start, 5402 (unsigned long long)em->len, 5403 (unsigned long long)start, 5404 (unsigned long long)len); 5405 err = -EIO; 5406 goto out; 5407 } 5408 5409 err = 0; 5410 write_lock(&em_tree->lock); 5411 ret = add_extent_mapping(em_tree, em); 5412 /* it is possible that someone inserted the extent into the tree 5413 * while we had the lock dropped. It is also possible that 5414 * an overlapping map exists in the tree 5415 */ 5416 if (ret == -EEXIST) { 5417 struct extent_map *existing; 5418 5419 ret = 0; 5420 5421 existing = lookup_extent_mapping(em_tree, start, len); 5422 if (existing && (existing->start > start || 5423 existing->start + existing->len <= start)) { 5424 free_extent_map(existing); 5425 existing = NULL; 5426 } 5427 if (!existing) { 5428 existing = lookup_extent_mapping(em_tree, em->start, 5429 em->len); 5430 if (existing) { 5431 err = merge_extent_mapping(em_tree, existing, 5432 em, start, 5433 root->sectorsize); 5434 free_extent_map(existing); 5435 if (err) { 5436 free_extent_map(em); 5437 em = NULL; 5438 } 5439 } else { 5440 err = -EIO; 5441 free_extent_map(em); 5442 em = NULL; 5443 } 5444 } else { 5445 free_extent_map(em); 5446 em = existing; 5447 err = 0; 5448 } 5449 } 5450 write_unlock(&em_tree->lock); 5451 out: 5452 5453 trace_btrfs_get_extent(root, em); 5454 5455 if (path) 5456 btrfs_free_path(path); 5457 if (trans) { 5458 ret = btrfs_end_transaction(trans, root); 5459 if (!err) 5460 err = ret; 5461 } 5462 if (err) { 5463 free_extent_map(em); 5464 return ERR_PTR(err); 5465 } 5466 BUG_ON(!em); /* Error is always set */ 5467 return em; 5468 } 5469 5470 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 5471 size_t pg_offset, u64 start, u64 len, 5472 int create) 5473 { 5474 struct extent_map *em; 5475 struct extent_map *hole_em = NULL; 5476 u64 range_start = start; 5477 u64 end; 5478 u64 found; 5479 u64 found_end; 5480 int err = 0; 5481 5482 em = btrfs_get_extent(inode, page, pg_offset, start, len, create); 5483 if (IS_ERR(em)) 5484 return em; 5485 if (em) { 5486 /* 5487 * if our em maps to a hole, there might 5488 * actually be delalloc bytes behind it 5489 */ 5490 if (em->block_start != EXTENT_MAP_HOLE) 5491 return em; 5492 else 5493 hole_em = em; 5494 } 5495 5496 /* check to see if we've wrapped (len == -1 or similar) */ 5497 end = start + len; 5498 if (end < start) 5499 end = (u64)-1; 5500 else 5501 end -= 1; 5502 5503 em = NULL; 5504 5505 /* ok, we didn't find anything, lets look for delalloc */ 5506 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, 5507 end, len, EXTENT_DELALLOC, 1); 5508 found_end = range_start + found; 5509 if (found_end < range_start) 5510 found_end = (u64)-1; 5511 5512 /* 5513 * we didn't find anything useful, return 5514 * the original results from get_extent() 5515 */ 5516 if (range_start > end || found_end <= start) { 5517 em = hole_em; 5518 hole_em = NULL; 5519 goto out; 5520 } 5521 5522 /* adjust the range_start to make sure it doesn't 5523 * go backwards from the start they passed in 5524 */ 5525 range_start = max(start,range_start); 5526 found = found_end - range_start; 5527 5528 if (found > 0) { 5529 u64 hole_start = start; 5530 u64 hole_len = len; 5531 5532 em = alloc_extent_map(); 5533 if (!em) { 5534 err = -ENOMEM; 5535 goto out; 5536 } 5537 /* 5538 * when btrfs_get_extent can't find anything it 5539 * returns one huge hole 5540 * 5541 * make sure what it found really fits our range, and 5542 * adjust to make sure it is based on the start from 5543 * the caller 5544 */ 5545 if (hole_em) { 5546 u64 calc_end = extent_map_end(hole_em); 5547 5548 if (calc_end <= start || (hole_em->start > end)) { 5549 free_extent_map(hole_em); 5550 hole_em = NULL; 5551 } else { 5552 hole_start = max(hole_em->start, start); 5553 hole_len = calc_end - hole_start; 5554 } 5555 } 5556 em->bdev = NULL; 5557 if (hole_em && range_start > hole_start) { 5558 /* our hole starts before our delalloc, so we 5559 * have to return just the parts of the hole 5560 * that go until the delalloc starts 5561 */ 5562 em->len = min(hole_len, 5563 range_start - hole_start); 5564 em->start = hole_start; 5565 em->orig_start = hole_start; 5566 /* 5567 * don't adjust block start at all, 5568 * it is fixed at EXTENT_MAP_HOLE 5569 */ 5570 em->block_start = hole_em->block_start; 5571 em->block_len = hole_len; 5572 } else { 5573 em->start = range_start; 5574 em->len = found; 5575 em->orig_start = range_start; 5576 em->block_start = EXTENT_MAP_DELALLOC; 5577 em->block_len = found; 5578 } 5579 } else if (hole_em) { 5580 return hole_em; 5581 } 5582 out: 5583 5584 free_extent_map(hole_em); 5585 if (err) { 5586 free_extent_map(em); 5587 return ERR_PTR(err); 5588 } 5589 return em; 5590 } 5591 5592 static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5593 struct extent_map *em, 5594 u64 start, u64 len) 5595 { 5596 struct btrfs_root *root = BTRFS_I(inode)->root; 5597 struct btrfs_trans_handle *trans; 5598 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5599 struct btrfs_key ins; 5600 u64 alloc_hint; 5601 int ret; 5602 bool insert = false; 5603 5604 /* 5605 * Ok if the extent map we looked up is a hole and is for the exact 5606 * range we want, there is no reason to allocate a new one, however if 5607 * it is not right then we need to free this one and drop the cache for 5608 * our range. 5609 */ 5610 if (em->block_start != EXTENT_MAP_HOLE || em->start != start || 5611 em->len != len) { 5612 free_extent_map(em); 5613 em = NULL; 5614 insert = true; 5615 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5616 } 5617 5618 trans = btrfs_join_transaction(root); 5619 if (IS_ERR(trans)) 5620 return ERR_CAST(trans); 5621 5622 if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) 5623 btrfs_add_inode_defrag(trans, inode); 5624 5625 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5626 5627 alloc_hint = get_extent_allocation_hint(inode, start, len); 5628 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, 5629 alloc_hint, &ins, 1); 5630 if (ret) { 5631 em = ERR_PTR(ret); 5632 goto out; 5633 } 5634 5635 if (!em) { 5636 em = alloc_extent_map(); 5637 if (!em) { 5638 em = ERR_PTR(-ENOMEM); 5639 goto out; 5640 } 5641 } 5642 5643 em->start = start; 5644 em->orig_start = em->start; 5645 em->len = ins.offset; 5646 5647 em->block_start = ins.objectid; 5648 em->block_len = ins.offset; 5649 em->bdev = root->fs_info->fs_devices->latest_bdev; 5650 5651 /* 5652 * We need to do this because if we're using the original em we searched 5653 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. 5654 */ 5655 em->flags = 0; 5656 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5657 5658 while (insert) { 5659 write_lock(&em_tree->lock); 5660 ret = add_extent_mapping(em_tree, em); 5661 write_unlock(&em_tree->lock); 5662 if (ret != -EEXIST) 5663 break; 5664 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); 5665 } 5666 5667 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 5668 ins.offset, ins.offset, 0); 5669 if (ret) { 5670 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 5671 em = ERR_PTR(ret); 5672 } 5673 out: 5674 btrfs_end_transaction(trans, root); 5675 return em; 5676 } 5677 5678 /* 5679 * returns 1 when the nocow is safe, < 1 on error, 0 if the 5680 * block must be cow'd 5681 */ 5682 static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, 5683 struct inode *inode, u64 offset, u64 len) 5684 { 5685 struct btrfs_path *path; 5686 int ret; 5687 struct extent_buffer *leaf; 5688 struct btrfs_root *root = BTRFS_I(inode)->root; 5689 struct btrfs_file_extent_item *fi; 5690 struct btrfs_key key; 5691 u64 disk_bytenr; 5692 u64 backref_offset; 5693 u64 extent_end; 5694 u64 num_bytes; 5695 int slot; 5696 int found_type; 5697 5698 path = btrfs_alloc_path(); 5699 if (!path) 5700 return -ENOMEM; 5701 5702 ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), 5703 offset, 0); 5704 if (ret < 0) 5705 goto out; 5706 5707 slot = path->slots[0]; 5708 if (ret == 1) { 5709 if (slot == 0) { 5710 /* can't find the item, must cow */ 5711 ret = 0; 5712 goto out; 5713 } 5714 slot--; 5715 } 5716 ret = 0; 5717 leaf = path->nodes[0]; 5718 btrfs_item_key_to_cpu(leaf, &key, slot); 5719 if (key.objectid != btrfs_ino(inode) || 5720 key.type != BTRFS_EXTENT_DATA_KEY) { 5721 /* not our file or wrong item type, must cow */ 5722 goto out; 5723 } 5724 5725 if (key.offset > offset) { 5726 /* Wrong offset, must cow */ 5727 goto out; 5728 } 5729 5730 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 5731 found_type = btrfs_file_extent_type(leaf, fi); 5732 if (found_type != BTRFS_FILE_EXTENT_REG && 5733 found_type != BTRFS_FILE_EXTENT_PREALLOC) { 5734 /* not a regular extent, must cow */ 5735 goto out; 5736 } 5737 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 5738 backref_offset = btrfs_file_extent_offset(leaf, fi); 5739 5740 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 5741 if (extent_end < offset + len) { 5742 /* extent doesn't include our full range, must cow */ 5743 goto out; 5744 } 5745 5746 if (btrfs_extent_readonly(root, disk_bytenr)) 5747 goto out; 5748 5749 /* 5750 * look for other files referencing this extent, if we 5751 * find any we must cow 5752 */ 5753 if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), 5754 key.offset - backref_offset, disk_bytenr)) 5755 goto out; 5756 5757 /* 5758 * adjust disk_bytenr and num_bytes to cover just the bytes 5759 * in this extent we are about to write. If there 5760 * are any csums in that range we have to cow in order 5761 * to keep the csums correct 5762 */ 5763 disk_bytenr += backref_offset; 5764 disk_bytenr += offset - key.offset; 5765 num_bytes = min(offset + len, extent_end) - offset; 5766 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 5767 goto out; 5768 /* 5769 * all of the above have passed, it is safe to overwrite this extent 5770 * without cow 5771 */ 5772 ret = 1; 5773 out: 5774 btrfs_free_path(path); 5775 return ret; 5776 } 5777 5778 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, 5779 struct extent_state **cached_state, int writing) 5780 { 5781 struct btrfs_ordered_extent *ordered; 5782 int ret = 0; 5783 5784 while (1) { 5785 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 5786 0, cached_state); 5787 /* 5788 * We're concerned with the entire range that we're going to be 5789 * doing DIO to, so we need to make sure theres no ordered 5790 * extents in this range. 5791 */ 5792 ordered = btrfs_lookup_ordered_range(inode, lockstart, 5793 lockend - lockstart + 1); 5794 5795 /* 5796 * We need to make sure there are no buffered pages in this 5797 * range either, we could have raced between the invalidate in 5798 * generic_file_direct_write and locking the extent. The 5799 * invalidate needs to happen so that reads after a write do not 5800 * get stale data. 5801 */ 5802 if (!ordered && (!writing || 5803 !test_range_bit(&BTRFS_I(inode)->io_tree, 5804 lockstart, lockend, EXTENT_UPTODATE, 0, 5805 *cached_state))) 5806 break; 5807 5808 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 5809 cached_state, GFP_NOFS); 5810 5811 if (ordered) { 5812 btrfs_start_ordered_extent(inode, ordered, 1); 5813 btrfs_put_ordered_extent(ordered); 5814 } else { 5815 /* Screw you mmap */ 5816 ret = filemap_write_and_wait_range(inode->i_mapping, 5817 lockstart, 5818 lockend); 5819 if (ret) 5820 break; 5821 5822 /* 5823 * If we found a page that couldn't be invalidated just 5824 * fall back to buffered. 5825 */ 5826 ret = invalidate_inode_pages2_range(inode->i_mapping, 5827 lockstart >> PAGE_CACHE_SHIFT, 5828 lockend >> PAGE_CACHE_SHIFT); 5829 if (ret) 5830 break; 5831 } 5832 5833 cond_resched(); 5834 } 5835 5836 return ret; 5837 } 5838 5839 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 5840 struct buffer_head *bh_result, int create) 5841 { 5842 struct extent_map *em; 5843 struct btrfs_root *root = BTRFS_I(inode)->root; 5844 struct extent_state *cached_state = NULL; 5845 u64 start = iblock << inode->i_blkbits; 5846 u64 lockstart, lockend; 5847 u64 len = bh_result->b_size; 5848 struct btrfs_trans_handle *trans; 5849 int unlock_bits = EXTENT_LOCKED; 5850 int ret; 5851 5852 if (create) { 5853 ret = btrfs_delalloc_reserve_space(inode, len); 5854 if (ret) 5855 return ret; 5856 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; 5857 } else { 5858 len = min_t(u64, len, root->sectorsize); 5859 } 5860 5861 lockstart = start; 5862 lockend = start + len - 1; 5863 5864 /* 5865 * If this errors out it's because we couldn't invalidate pagecache for 5866 * this range and we need to fallback to buffered. 5867 */ 5868 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) 5869 return -ENOTBLK; 5870 5871 if (create) { 5872 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 5873 lockend, EXTENT_DELALLOC, NULL, 5874 &cached_state, GFP_NOFS); 5875 if (ret) 5876 goto unlock_err; 5877 } 5878 5879 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 5880 if (IS_ERR(em)) { 5881 ret = PTR_ERR(em); 5882 goto unlock_err; 5883 } 5884 5885 /* 5886 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered 5887 * io. INLINE is special, and we could probably kludge it in here, but 5888 * it's still buffered so for safety lets just fall back to the generic 5889 * buffered path. 5890 * 5891 * For COMPRESSED we _have_ to read the entire extent in so we can 5892 * decompress it, so there will be buffering required no matter what we 5893 * do, so go ahead and fallback to buffered. 5894 * 5895 * We return -ENOTBLK because thats what makes DIO go ahead and go back 5896 * to buffered IO. Don't blame me, this is the price we pay for using 5897 * the generic code. 5898 */ 5899 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 5900 em->block_start == EXTENT_MAP_INLINE) { 5901 free_extent_map(em); 5902 ret = -ENOTBLK; 5903 goto unlock_err; 5904 } 5905 5906 /* Just a good old fashioned hole, return */ 5907 if (!create && (em->block_start == EXTENT_MAP_HOLE || 5908 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 5909 free_extent_map(em); 5910 ret = 0; 5911 goto unlock_err; 5912 } 5913 5914 /* 5915 * We don't allocate a new extent in the following cases 5916 * 5917 * 1) The inode is marked as NODATACOW. In this case we'll just use the 5918 * existing extent. 5919 * 2) The extent is marked as PREALLOC. We're good to go here and can 5920 * just use the extent. 5921 * 5922 */ 5923 if (!create) { 5924 len = min(len, em->len - (start - em->start)); 5925 lockstart = start + len; 5926 goto unlock; 5927 } 5928 5929 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 5930 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 5931 em->block_start != EXTENT_MAP_HOLE)) { 5932 int type; 5933 int ret; 5934 u64 block_start; 5935 5936 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 5937 type = BTRFS_ORDERED_PREALLOC; 5938 else 5939 type = BTRFS_ORDERED_NOCOW; 5940 len = min(len, em->len - (start - em->start)); 5941 block_start = em->block_start + (start - em->start); 5942 5943 /* 5944 * we're not going to log anything, but we do need 5945 * to make sure the current transaction stays open 5946 * while we look for nocow cross refs 5947 */ 5948 trans = btrfs_join_transaction(root); 5949 if (IS_ERR(trans)) 5950 goto must_cow; 5951 5952 if (can_nocow_odirect(trans, inode, start, len) == 1) { 5953 ret = btrfs_add_ordered_extent_dio(inode, start, 5954 block_start, len, len, type); 5955 btrfs_end_transaction(trans, root); 5956 if (ret) { 5957 free_extent_map(em); 5958 goto unlock_err; 5959 } 5960 goto unlock; 5961 } 5962 btrfs_end_transaction(trans, root); 5963 } 5964 must_cow: 5965 /* 5966 * this will cow the extent, reset the len in case we changed 5967 * it above 5968 */ 5969 len = bh_result->b_size; 5970 em = btrfs_new_extent_direct(inode, em, start, len); 5971 if (IS_ERR(em)) { 5972 ret = PTR_ERR(em); 5973 goto unlock_err; 5974 } 5975 len = min(len, em->len - (start - em->start)); 5976 unlock: 5977 bh_result->b_blocknr = (em->block_start + (start - em->start)) >> 5978 inode->i_blkbits; 5979 bh_result->b_size = len; 5980 bh_result->b_bdev = em->bdev; 5981 set_buffer_mapped(bh_result); 5982 if (create) { 5983 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 5984 set_buffer_new(bh_result); 5985 5986 /* 5987 * Need to update the i_size under the extent lock so buffered 5988 * readers will get the updated i_size when we unlock. 5989 */ 5990 if (start + len > i_size_read(inode)) 5991 i_size_write(inode, start + len); 5992 } 5993 5994 /* 5995 * In the case of write we need to clear and unlock the entire range, 5996 * in the case of read we need to unlock only the end area that we 5997 * aren't using if there is any left over space. 5998 */ 5999 if (lockstart < lockend) { 6000 if (create && len < lockend - lockstart) { 6001 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6002 lockstart + len - 1, unlock_bits, 1, 0, 6003 &cached_state, GFP_NOFS); 6004 /* 6005 * Beside unlock, we also need to cleanup reserved space 6006 * for the left range by attaching EXTENT_DO_ACCOUNTING. 6007 */ 6008 clear_extent_bit(&BTRFS_I(inode)->io_tree, 6009 lockstart + len, lockend, 6010 unlock_bits | EXTENT_DO_ACCOUNTING, 6011 1, 0, NULL, GFP_NOFS); 6012 } else { 6013 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6014 lockend, unlock_bits, 1, 0, 6015 &cached_state, GFP_NOFS); 6016 } 6017 } else { 6018 free_extent_state(cached_state); 6019 } 6020 6021 free_extent_map(em); 6022 6023 return 0; 6024 6025 unlock_err: 6026 if (create) 6027 unlock_bits |= EXTENT_DO_ACCOUNTING; 6028 6029 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6030 unlock_bits, 1, 0, &cached_state, GFP_NOFS); 6031 return ret; 6032 } 6033 6034 struct btrfs_dio_private { 6035 struct inode *inode; 6036 u64 logical_offset; 6037 u64 disk_bytenr; 6038 u64 bytes; 6039 void *private; 6040 6041 /* number of bios pending for this dio */ 6042 atomic_t pending_bios; 6043 6044 /* IO errors */ 6045 int errors; 6046 6047 struct bio *orig_bio; 6048 }; 6049 6050 static void btrfs_endio_direct_read(struct bio *bio, int err) 6051 { 6052 struct btrfs_dio_private *dip = bio->bi_private; 6053 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 6054 struct bio_vec *bvec = bio->bi_io_vec; 6055 struct inode *inode = dip->inode; 6056 struct btrfs_root *root = BTRFS_I(inode)->root; 6057 u64 start; 6058 6059 start = dip->logical_offset; 6060 do { 6061 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 6062 struct page *page = bvec->bv_page; 6063 char *kaddr; 6064 u32 csum = ~(u32)0; 6065 u64 private = ~(u32)0; 6066 unsigned long flags; 6067 6068 if (get_state_private(&BTRFS_I(inode)->io_tree, 6069 start, &private)) 6070 goto failed; 6071 local_irq_save(flags); 6072 kaddr = kmap_atomic(page); 6073 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, 6074 csum, bvec->bv_len); 6075 btrfs_csum_final(csum, (char *)&csum); 6076 kunmap_atomic(kaddr); 6077 local_irq_restore(flags); 6078 6079 flush_dcache_page(bvec->bv_page); 6080 if (csum != private) { 6081 failed: 6082 printk(KERN_ERR "btrfs csum failed ino %llu off" 6083 " %llu csum %u private %u\n", 6084 (unsigned long long)btrfs_ino(inode), 6085 (unsigned long long)start, 6086 csum, (unsigned)private); 6087 err = -EIO; 6088 } 6089 } 6090 6091 start += bvec->bv_len; 6092 bvec++; 6093 } while (bvec <= bvec_end); 6094 6095 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, 6096 dip->logical_offset + dip->bytes - 1); 6097 bio->bi_private = dip->private; 6098 6099 kfree(dip); 6100 6101 /* If we had a csum failure make sure to clear the uptodate flag */ 6102 if (err) 6103 clear_bit(BIO_UPTODATE, &bio->bi_flags); 6104 dio_end_io(bio, err); 6105 } 6106 6107 static void btrfs_endio_direct_write(struct bio *bio, int err) 6108 { 6109 struct btrfs_dio_private *dip = bio->bi_private; 6110 struct inode *inode = dip->inode; 6111 struct btrfs_root *root = BTRFS_I(inode)->root; 6112 struct btrfs_ordered_extent *ordered = NULL; 6113 u64 ordered_offset = dip->logical_offset; 6114 u64 ordered_bytes = dip->bytes; 6115 int ret; 6116 6117 if (err) 6118 goto out_done; 6119 again: 6120 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, 6121 &ordered_offset, 6122 ordered_bytes, !err); 6123 if (!ret) 6124 goto out_test; 6125 6126 ordered->work.func = finish_ordered_fn; 6127 ordered->work.flags = 0; 6128 btrfs_queue_worker(&root->fs_info->endio_write_workers, 6129 &ordered->work); 6130 out_test: 6131 /* 6132 * our bio might span multiple ordered extents. If we haven't 6133 * completed the accounting for the whole dio, go back and try again 6134 */ 6135 if (ordered_offset < dip->logical_offset + dip->bytes) { 6136 ordered_bytes = dip->logical_offset + dip->bytes - 6137 ordered_offset; 6138 ordered = NULL; 6139 goto again; 6140 } 6141 out_done: 6142 bio->bi_private = dip->private; 6143 6144 kfree(dip); 6145 6146 /* If we had an error make sure to clear the uptodate flag */ 6147 if (err) 6148 clear_bit(BIO_UPTODATE, &bio->bi_flags); 6149 dio_end_io(bio, err); 6150 } 6151 6152 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, 6153 struct bio *bio, int mirror_num, 6154 unsigned long bio_flags, u64 offset) 6155 { 6156 int ret; 6157 struct btrfs_root *root = BTRFS_I(inode)->root; 6158 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); 6159 BUG_ON(ret); /* -ENOMEM */ 6160 return 0; 6161 } 6162 6163 static void btrfs_end_dio_bio(struct bio *bio, int err) 6164 { 6165 struct btrfs_dio_private *dip = bio->bi_private; 6166 6167 if (err) { 6168 printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " 6169 "sector %#Lx len %u err no %d\n", 6170 (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw, 6171 (unsigned long long)bio->bi_sector, bio->bi_size, err); 6172 dip->errors = 1; 6173 6174 /* 6175 * before atomic variable goto zero, we must make sure 6176 * dip->errors is perceived to be set. 6177 */ 6178 smp_mb__before_atomic_dec(); 6179 } 6180 6181 /* if there are more bios still pending for this dio, just exit */ 6182 if (!atomic_dec_and_test(&dip->pending_bios)) 6183 goto out; 6184 6185 if (dip->errors) 6186 bio_io_error(dip->orig_bio); 6187 else { 6188 set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); 6189 bio_endio(dip->orig_bio, 0); 6190 } 6191 out: 6192 bio_put(bio); 6193 } 6194 6195 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, 6196 u64 first_sector, gfp_t gfp_flags) 6197 { 6198 int nr_vecs = bio_get_nr_vecs(bdev); 6199 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); 6200 } 6201 6202 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 6203 int rw, u64 file_offset, int skip_sum, 6204 int async_submit) 6205 { 6206 int write = rw & REQ_WRITE; 6207 struct btrfs_root *root = BTRFS_I(inode)->root; 6208 int ret; 6209 6210 bio_get(bio); 6211 6212 if (!write) { 6213 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 6214 if (ret) 6215 goto err; 6216 } 6217 6218 if (skip_sum) 6219 goto map; 6220 6221 if (write && async_submit) { 6222 ret = btrfs_wq_submit_bio(root->fs_info, 6223 inode, rw, bio, 0, 0, 6224 file_offset, 6225 __btrfs_submit_bio_start_direct_io, 6226 __btrfs_submit_bio_done); 6227 goto err; 6228 } else if (write) { 6229 /* 6230 * If we aren't doing async submit, calculate the csum of the 6231 * bio now. 6232 */ 6233 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); 6234 if (ret) 6235 goto err; 6236 } else if (!skip_sum) { 6237 ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset); 6238 if (ret) 6239 goto err; 6240 } 6241 6242 map: 6243 ret = btrfs_map_bio(root, rw, bio, 0, async_submit); 6244 err: 6245 bio_put(bio); 6246 return ret; 6247 } 6248 6249 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, 6250 int skip_sum) 6251 { 6252 struct inode *inode = dip->inode; 6253 struct btrfs_root *root = BTRFS_I(inode)->root; 6254 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 6255 struct bio *bio; 6256 struct bio *orig_bio = dip->orig_bio; 6257 struct bio_vec *bvec = orig_bio->bi_io_vec; 6258 u64 start_sector = orig_bio->bi_sector; 6259 u64 file_offset = dip->logical_offset; 6260 u64 submit_len = 0; 6261 u64 map_length; 6262 int nr_pages = 0; 6263 int ret = 0; 6264 int async_submit = 0; 6265 6266 map_length = orig_bio->bi_size; 6267 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6268 &map_length, NULL, 0); 6269 if (ret) { 6270 bio_put(orig_bio); 6271 return -EIO; 6272 } 6273 6274 if (map_length >= orig_bio->bi_size) { 6275 bio = orig_bio; 6276 goto submit; 6277 } 6278 6279 async_submit = 1; 6280 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 6281 if (!bio) 6282 return -ENOMEM; 6283 bio->bi_private = dip; 6284 bio->bi_end_io = btrfs_end_dio_bio; 6285 atomic_inc(&dip->pending_bios); 6286 6287 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 6288 if (unlikely(map_length < submit_len + bvec->bv_len || 6289 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 6290 bvec->bv_offset) < bvec->bv_len)) { 6291 /* 6292 * inc the count before we submit the bio so 6293 * we know the end IO handler won't happen before 6294 * we inc the count. Otherwise, the dip might get freed 6295 * before we're done setting it up 6296 */ 6297 atomic_inc(&dip->pending_bios); 6298 ret = __btrfs_submit_dio_bio(bio, inode, rw, 6299 file_offset, skip_sum, 6300 async_submit); 6301 if (ret) { 6302 bio_put(bio); 6303 atomic_dec(&dip->pending_bios); 6304 goto out_err; 6305 } 6306 6307 start_sector += submit_len >> 9; 6308 file_offset += submit_len; 6309 6310 submit_len = 0; 6311 nr_pages = 0; 6312 6313 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, 6314 start_sector, GFP_NOFS); 6315 if (!bio) 6316 goto out_err; 6317 bio->bi_private = dip; 6318 bio->bi_end_io = btrfs_end_dio_bio; 6319 6320 map_length = orig_bio->bi_size; 6321 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6322 &map_length, NULL, 0); 6323 if (ret) { 6324 bio_put(bio); 6325 goto out_err; 6326 } 6327 } else { 6328 submit_len += bvec->bv_len; 6329 nr_pages ++; 6330 bvec++; 6331 } 6332 } 6333 6334 submit: 6335 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, 6336 async_submit); 6337 if (!ret) 6338 return 0; 6339 6340 bio_put(bio); 6341 out_err: 6342 dip->errors = 1; 6343 /* 6344 * before atomic variable goto zero, we must 6345 * make sure dip->errors is perceived to be set. 6346 */ 6347 smp_mb__before_atomic_dec(); 6348 if (atomic_dec_and_test(&dip->pending_bios)) 6349 bio_io_error(dip->orig_bio); 6350 6351 /* bio_end_io() will handle error, so we needn't return it */ 6352 return 0; 6353 } 6354 6355 static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, 6356 loff_t file_offset) 6357 { 6358 struct btrfs_root *root = BTRFS_I(inode)->root; 6359 struct btrfs_dio_private *dip; 6360 struct bio_vec *bvec = bio->bi_io_vec; 6361 int skip_sum; 6362 int write = rw & REQ_WRITE; 6363 int ret = 0; 6364 6365 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 6366 6367 dip = kmalloc(sizeof(*dip), GFP_NOFS); 6368 if (!dip) { 6369 ret = -ENOMEM; 6370 goto free_ordered; 6371 } 6372 6373 dip->private = bio->bi_private; 6374 dip->inode = inode; 6375 dip->logical_offset = file_offset; 6376 6377 dip->bytes = 0; 6378 do { 6379 dip->bytes += bvec->bv_len; 6380 bvec++; 6381 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); 6382 6383 dip->disk_bytenr = (u64)bio->bi_sector << 9; 6384 bio->bi_private = dip; 6385 dip->errors = 0; 6386 dip->orig_bio = bio; 6387 atomic_set(&dip->pending_bios, 0); 6388 6389 if (write) 6390 bio->bi_end_io = btrfs_endio_direct_write; 6391 else 6392 bio->bi_end_io = btrfs_endio_direct_read; 6393 6394 ret = btrfs_submit_direct_hook(rw, dip, skip_sum); 6395 if (!ret) 6396 return; 6397 free_ordered: 6398 /* 6399 * If this is a write, we need to clean up the reserved space and kill 6400 * the ordered extent. 6401 */ 6402 if (write) { 6403 struct btrfs_ordered_extent *ordered; 6404 ordered = btrfs_lookup_ordered_extent(inode, file_offset); 6405 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && 6406 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) 6407 btrfs_free_reserved_extent(root, ordered->start, 6408 ordered->disk_len); 6409 btrfs_put_ordered_extent(ordered); 6410 btrfs_put_ordered_extent(ordered); 6411 } 6412 bio_endio(bio, ret); 6413 } 6414 6415 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, 6416 const struct iovec *iov, loff_t offset, 6417 unsigned long nr_segs) 6418 { 6419 int seg; 6420 int i; 6421 size_t size; 6422 unsigned long addr; 6423 unsigned blocksize_mask = root->sectorsize - 1; 6424 ssize_t retval = -EINVAL; 6425 loff_t end = offset; 6426 6427 if (offset & blocksize_mask) 6428 goto out; 6429 6430 /* Check the memory alignment. Blocks cannot straddle pages */ 6431 for (seg = 0; seg < nr_segs; seg++) { 6432 addr = (unsigned long)iov[seg].iov_base; 6433 size = iov[seg].iov_len; 6434 end += size; 6435 if ((addr & blocksize_mask) || (size & blocksize_mask)) 6436 goto out; 6437 6438 /* If this is a write we don't need to check anymore */ 6439 if (rw & WRITE) 6440 continue; 6441 6442 /* 6443 * Check to make sure we don't have duplicate iov_base's in this 6444 * iovec, if so return EINVAL, otherwise we'll get csum errors 6445 * when reading back. 6446 */ 6447 for (i = seg + 1; i < nr_segs; i++) { 6448 if (iov[seg].iov_base == iov[i].iov_base) 6449 goto out; 6450 } 6451 } 6452 retval = 0; 6453 out: 6454 return retval; 6455 } 6456 6457 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 6458 const struct iovec *iov, loff_t offset, 6459 unsigned long nr_segs) 6460 { 6461 struct file *file = iocb->ki_filp; 6462 struct inode *inode = file->f_mapping->host; 6463 6464 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, 6465 offset, nr_segs)) 6466 return 0; 6467 6468 return __blockdev_direct_IO(rw, iocb, inode, 6469 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 6470 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, 6471 btrfs_submit_direct, 0); 6472 } 6473 6474 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6475 __u64 start, __u64 len) 6476 { 6477 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 6478 } 6479 6480 int btrfs_readpage(struct file *file, struct page *page) 6481 { 6482 struct extent_io_tree *tree; 6483 tree = &BTRFS_I(page->mapping->host)->io_tree; 6484 return extent_read_full_page(tree, page, btrfs_get_extent, 0); 6485 } 6486 6487 static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 6488 { 6489 struct extent_io_tree *tree; 6490 6491 6492 if (current->flags & PF_MEMALLOC) { 6493 redirty_page_for_writepage(wbc, page); 6494 unlock_page(page); 6495 return 0; 6496 } 6497 tree = &BTRFS_I(page->mapping->host)->io_tree; 6498 return extent_write_full_page(tree, page, btrfs_get_extent, wbc); 6499 } 6500 6501 int btrfs_writepages(struct address_space *mapping, 6502 struct writeback_control *wbc) 6503 { 6504 struct extent_io_tree *tree; 6505 6506 tree = &BTRFS_I(mapping->host)->io_tree; 6507 return extent_writepages(tree, mapping, btrfs_get_extent, wbc); 6508 } 6509 6510 static int 6511 btrfs_readpages(struct file *file, struct address_space *mapping, 6512 struct list_head *pages, unsigned nr_pages) 6513 { 6514 struct extent_io_tree *tree; 6515 tree = &BTRFS_I(mapping->host)->io_tree; 6516 return extent_readpages(tree, mapping, pages, nr_pages, 6517 btrfs_get_extent); 6518 } 6519 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) 6520 { 6521 struct extent_io_tree *tree; 6522 struct extent_map_tree *map; 6523 int ret; 6524 6525 tree = &BTRFS_I(page->mapping->host)->io_tree; 6526 map = &BTRFS_I(page->mapping->host)->extent_tree; 6527 ret = try_release_extent_mapping(map, tree, page, gfp_flags); 6528 if (ret == 1) { 6529 ClearPagePrivate(page); 6530 set_page_private(page, 0); 6531 page_cache_release(page); 6532 } 6533 return ret; 6534 } 6535 6536 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) 6537 { 6538 if (PageWriteback(page) || PageDirty(page)) 6539 return 0; 6540 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); 6541 } 6542 6543 static void btrfs_invalidatepage(struct page *page, unsigned long offset) 6544 { 6545 struct inode *inode = page->mapping->host; 6546 struct extent_io_tree *tree; 6547 struct btrfs_ordered_extent *ordered; 6548 struct extent_state *cached_state = NULL; 6549 u64 page_start = page_offset(page); 6550 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 6551 6552 /* 6553 * we have the page locked, so new writeback can't start, 6554 * and the dirty bit won't be cleared while we are here. 6555 * 6556 * Wait for IO on this page so that we can safely clear 6557 * the PagePrivate2 bit and do ordered accounting 6558 */ 6559 wait_on_page_writeback(page); 6560 6561 tree = &BTRFS_I(inode)->io_tree; 6562 if (offset) { 6563 btrfs_releasepage(page, GFP_NOFS); 6564 return; 6565 } 6566 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 6567 ordered = btrfs_lookup_ordered_extent(inode, 6568 page_offset(page)); 6569 if (ordered) { 6570 /* 6571 * IO on this page will never be started, so we need 6572 * to account for any ordered extents now 6573 */ 6574 clear_extent_bit(tree, page_start, page_end, 6575 EXTENT_DIRTY | EXTENT_DELALLOC | 6576 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 6577 &cached_state, GFP_NOFS); 6578 /* 6579 * whoever cleared the private bit is responsible 6580 * for the finish_ordered_io 6581 */ 6582 if (TestClearPagePrivate2(page) && 6583 btrfs_dec_test_ordered_pending(inode, &ordered, page_start, 6584 PAGE_CACHE_SIZE, 1)) { 6585 btrfs_finish_ordered_io(ordered); 6586 } 6587 btrfs_put_ordered_extent(ordered); 6588 cached_state = NULL; 6589 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 6590 } 6591 clear_extent_bit(tree, page_start, page_end, 6592 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 6593 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); 6594 __btrfs_releasepage(page, GFP_NOFS); 6595 6596 ClearPageChecked(page); 6597 if (PagePrivate(page)) { 6598 ClearPagePrivate(page); 6599 set_page_private(page, 0); 6600 page_cache_release(page); 6601 } 6602 } 6603 6604 /* 6605 * btrfs_page_mkwrite() is not allowed to change the file size as it gets 6606 * called from a page fault handler when a page is first dirtied. Hence we must 6607 * be careful to check for EOF conditions here. We set the page up correctly 6608 * for a written page which means we get ENOSPC checking when writing into 6609 * holes and correct delalloc and unwritten extent mapping on filesystems that 6610 * support these features. 6611 * 6612 * We are not allowed to take the i_mutex here so we have to play games to 6613 * protect against truncate races as the page could now be beyond EOF. Because 6614 * vmtruncate() writes the inode size before removing pages, once we have the 6615 * page lock we can determine safely if the page is beyond EOF. If it is not 6616 * beyond EOF, then the page is guaranteed safe against truncation until we 6617 * unlock the page. 6618 */ 6619 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 6620 { 6621 struct page *page = vmf->page; 6622 struct inode *inode = fdentry(vma->vm_file)->d_inode; 6623 struct btrfs_root *root = BTRFS_I(inode)->root; 6624 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 6625 struct btrfs_ordered_extent *ordered; 6626 struct extent_state *cached_state = NULL; 6627 char *kaddr; 6628 unsigned long zero_start; 6629 loff_t size; 6630 int ret; 6631 int reserved = 0; 6632 u64 page_start; 6633 u64 page_end; 6634 6635 sb_start_pagefault(inode->i_sb); 6636 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6637 if (!ret) { 6638 ret = file_update_time(vma->vm_file); 6639 reserved = 1; 6640 } 6641 if (ret) { 6642 if (ret == -ENOMEM) 6643 ret = VM_FAULT_OOM; 6644 else /* -ENOSPC, -EIO, etc */ 6645 ret = VM_FAULT_SIGBUS; 6646 if (reserved) 6647 goto out; 6648 goto out_noreserve; 6649 } 6650 6651 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 6652 again: 6653 lock_page(page); 6654 size = i_size_read(inode); 6655 page_start = page_offset(page); 6656 page_end = page_start + PAGE_CACHE_SIZE - 1; 6657 6658 if ((page->mapping != inode->i_mapping) || 6659 (page_start >= size)) { 6660 /* page got truncated out from underneath us */ 6661 goto out_unlock; 6662 } 6663 wait_on_page_writeback(page); 6664 6665 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); 6666 set_page_extent_mapped(page); 6667 6668 /* 6669 * we can't set the delalloc bits if there are pending ordered 6670 * extents. Drop our locks and wait for them to finish 6671 */ 6672 ordered = btrfs_lookup_ordered_extent(inode, page_start); 6673 if (ordered) { 6674 unlock_extent_cached(io_tree, page_start, page_end, 6675 &cached_state, GFP_NOFS); 6676 unlock_page(page); 6677 btrfs_start_ordered_extent(inode, ordered, 1); 6678 btrfs_put_ordered_extent(ordered); 6679 goto again; 6680 } 6681 6682 /* 6683 * XXX - page_mkwrite gets called every time the page is dirtied, even 6684 * if it was already dirty, so for space accounting reasons we need to 6685 * clear any delalloc bits for the range we are fixing to save. There 6686 * is probably a better way to do this, but for now keep consistent with 6687 * prepare_pages in the normal write path. 6688 */ 6689 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 6690 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 6691 0, 0, &cached_state, GFP_NOFS); 6692 6693 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 6694 &cached_state); 6695 if (ret) { 6696 unlock_extent_cached(io_tree, page_start, page_end, 6697 &cached_state, GFP_NOFS); 6698 ret = VM_FAULT_SIGBUS; 6699 goto out_unlock; 6700 } 6701 ret = 0; 6702 6703 /* page is wholly or partially inside EOF */ 6704 if (page_start + PAGE_CACHE_SIZE > size) 6705 zero_start = size & ~PAGE_CACHE_MASK; 6706 else 6707 zero_start = PAGE_CACHE_SIZE; 6708 6709 if (zero_start != PAGE_CACHE_SIZE) { 6710 kaddr = kmap(page); 6711 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); 6712 flush_dcache_page(page); 6713 kunmap(page); 6714 } 6715 ClearPageChecked(page); 6716 set_page_dirty(page); 6717 SetPageUptodate(page); 6718 6719 BTRFS_I(inode)->last_trans = root->fs_info->generation; 6720 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 6721 6722 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6723 6724 out_unlock: 6725 if (!ret) { 6726 sb_end_pagefault(inode->i_sb); 6727 return VM_FAULT_LOCKED; 6728 } 6729 unlock_page(page); 6730 out: 6731 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 6732 out_noreserve: 6733 sb_end_pagefault(inode->i_sb); 6734 return ret; 6735 } 6736 6737 static int btrfs_truncate(struct inode *inode) 6738 { 6739 struct btrfs_root *root = BTRFS_I(inode)->root; 6740 struct btrfs_block_rsv *rsv; 6741 int ret; 6742 int err = 0; 6743 struct btrfs_trans_handle *trans; 6744 unsigned long nr; 6745 u64 mask = root->sectorsize - 1; 6746 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6747 6748 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6749 if (ret) 6750 return ret; 6751 6752 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6753 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6754 6755 /* 6756 * Yes ladies and gentelment, this is indeed ugly. The fact is we have 6757 * 3 things going on here 6758 * 6759 * 1) We need to reserve space for our orphan item and the space to 6760 * delete our orphan item. Lord knows we don't want to have a dangling 6761 * orphan item because we didn't reserve space to remove it. 6762 * 6763 * 2) We need to reserve space to update our inode. 6764 * 6765 * 3) We need to have something to cache all the space that is going to 6766 * be free'd up by the truncate operation, but also have some slack 6767 * space reserved in case it uses space during the truncate (thank you 6768 * very much snapshotting). 6769 * 6770 * And we need these to all be seperate. The fact is we can use alot of 6771 * space doing the truncate, and we have no earthly idea how much space 6772 * we will use, so we need the truncate reservation to be seperate so it 6773 * doesn't end up using space reserved for updating the inode or 6774 * removing the orphan item. We also need to be able to stop the 6775 * transaction and start a new one, which means we need to be able to 6776 * update the inode several times, and we have no idea of knowing how 6777 * many times that will be, so we can't just reserve 1 item for the 6778 * entirety of the opration, so that has to be done seperately as well. 6779 * Then there is the orphan item, which does indeed need to be held on 6780 * to for the whole operation, and we need nobody to touch this reserved 6781 * space except the orphan code. 6782 * 6783 * So that leaves us with 6784 * 6785 * 1) root->orphan_block_rsv - for the orphan deletion. 6786 * 2) rsv - for the truncate reservation, which we will steal from the 6787 * transaction reservation. 6788 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for 6789 * updating the inode. 6790 */ 6791 rsv = btrfs_alloc_block_rsv(root); 6792 if (!rsv) 6793 return -ENOMEM; 6794 rsv->size = min_size; 6795 6796 /* 6797 * 1 for the truncate slack space 6798 * 1 for the orphan item we're going to add 6799 * 1 for the orphan item deletion 6800 * 1 for updating the inode. 6801 */ 6802 trans = btrfs_start_transaction(root, 4); 6803 if (IS_ERR(trans)) { 6804 err = PTR_ERR(trans); 6805 goto out; 6806 } 6807 6808 /* Migrate the slack space for the truncate to our reserve */ 6809 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, 6810 min_size); 6811 BUG_ON(ret); 6812 6813 ret = btrfs_orphan_add(trans, inode); 6814 if (ret) { 6815 btrfs_end_transaction(trans, root); 6816 goto out; 6817 } 6818 6819 /* 6820 * setattr is responsible for setting the ordered_data_close flag, 6821 * but that is only tested during the last file release. That 6822 * could happen well after the next commit, leaving a great big 6823 * window where new writes may get lost if someone chooses to write 6824 * to this file after truncating to zero 6825 * 6826 * The inode doesn't have any dirty data here, and so if we commit 6827 * this is a noop. If someone immediately starts writing to the inode 6828 * it is very likely we'll catch some of their writes in this 6829 * transaction, and the commit will find this file on the ordered 6830 * data list with good things to send down. 6831 * 6832 * This is a best effort solution, there is still a window where 6833 * using truncate to replace the contents of the file will 6834 * end up with a zero length file after a crash. 6835 */ 6836 if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 6837 &BTRFS_I(inode)->runtime_flags)) 6838 btrfs_add_ordered_operation(trans, root, inode); 6839 6840 while (1) { 6841 ret = btrfs_block_rsv_refill(root, rsv, min_size); 6842 if (ret) { 6843 /* 6844 * This can only happen with the original transaction we 6845 * started above, every other time we shouldn't have a 6846 * transaction started yet. 6847 */ 6848 if (ret == -EAGAIN) 6849 goto end_trans; 6850 err = ret; 6851 break; 6852 } 6853 6854 if (!trans) { 6855 /* Just need the 1 for updating the inode */ 6856 trans = btrfs_start_transaction(root, 1); 6857 if (IS_ERR(trans)) { 6858 ret = err = PTR_ERR(trans); 6859 trans = NULL; 6860 break; 6861 } 6862 } 6863 6864 trans->block_rsv = rsv; 6865 6866 ret = btrfs_truncate_inode_items(trans, root, inode, 6867 inode->i_size, 6868 BTRFS_EXTENT_DATA_KEY); 6869 if (ret != -EAGAIN) { 6870 err = ret; 6871 break; 6872 } 6873 6874 trans->block_rsv = &root->fs_info->trans_block_rsv; 6875 ret = btrfs_update_inode(trans, root, inode); 6876 if (ret) { 6877 err = ret; 6878 break; 6879 } 6880 end_trans: 6881 nr = trans->blocks_used; 6882 btrfs_end_transaction(trans, root); 6883 trans = NULL; 6884 btrfs_btree_balance_dirty(root, nr); 6885 } 6886 6887 if (ret == 0 && inode->i_nlink > 0) { 6888 trans->block_rsv = root->orphan_block_rsv; 6889 ret = btrfs_orphan_del(trans, inode); 6890 if (ret) 6891 err = ret; 6892 } else if (ret && inode->i_nlink > 0) { 6893 /* 6894 * Failed to do the truncate, remove us from the in memory 6895 * orphan list. 6896 */ 6897 ret = btrfs_orphan_del(NULL, inode); 6898 } 6899 6900 if (trans) { 6901 trans->block_rsv = &root->fs_info->trans_block_rsv; 6902 ret = btrfs_update_inode(trans, root, inode); 6903 if (ret && !err) 6904 err = ret; 6905 6906 nr = trans->blocks_used; 6907 ret = btrfs_end_transaction(trans, root); 6908 btrfs_btree_balance_dirty(root, nr); 6909 } 6910 6911 out: 6912 btrfs_free_block_rsv(root, rsv); 6913 6914 if (ret && !err) 6915 err = ret; 6916 6917 return err; 6918 } 6919 6920 /* 6921 * create a new subvolume directory/inode (helper for the ioctl). 6922 */ 6923 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 6924 struct btrfs_root *new_root, u64 new_dirid) 6925 { 6926 struct inode *inode; 6927 int err; 6928 u64 index = 0; 6929 6930 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, 6931 new_dirid, new_dirid, 6932 S_IFDIR | (~current_umask() & S_IRWXUGO), 6933 &index); 6934 if (IS_ERR(inode)) 6935 return PTR_ERR(inode); 6936 inode->i_op = &btrfs_dir_inode_operations; 6937 inode->i_fop = &btrfs_dir_file_operations; 6938 6939 set_nlink(inode, 1); 6940 btrfs_i_size_write(inode, 0); 6941 6942 err = btrfs_update_inode(trans, new_root, inode); 6943 6944 iput(inode); 6945 return err; 6946 } 6947 6948 struct inode *btrfs_alloc_inode(struct super_block *sb) 6949 { 6950 struct btrfs_inode *ei; 6951 struct inode *inode; 6952 6953 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 6954 if (!ei) 6955 return NULL; 6956 6957 ei->root = NULL; 6958 ei->generation = 0; 6959 ei->last_trans = 0; 6960 ei->last_sub_trans = 0; 6961 ei->logged_trans = 0; 6962 ei->delalloc_bytes = 0; 6963 ei->disk_i_size = 0; 6964 ei->flags = 0; 6965 ei->csum_bytes = 0; 6966 ei->index_cnt = (u64)-1; 6967 ei->last_unlink_trans = 0; 6968 6969 spin_lock_init(&ei->lock); 6970 ei->outstanding_extents = 0; 6971 ei->reserved_extents = 0; 6972 6973 ei->runtime_flags = 0; 6974 ei->force_compress = BTRFS_COMPRESS_NONE; 6975 6976 ei->delayed_node = NULL; 6977 6978 inode = &ei->vfs_inode; 6979 extent_map_tree_init(&ei->extent_tree); 6980 extent_io_tree_init(&ei->io_tree, &inode->i_data); 6981 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 6982 ei->io_tree.track_uptodate = 1; 6983 ei->io_failure_tree.track_uptodate = 1; 6984 mutex_init(&ei->log_mutex); 6985 mutex_init(&ei->delalloc_mutex); 6986 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6987 INIT_LIST_HEAD(&ei->delalloc_inodes); 6988 INIT_LIST_HEAD(&ei->ordered_operations); 6989 RB_CLEAR_NODE(&ei->rb_node); 6990 6991 return inode; 6992 } 6993 6994 static void btrfs_i_callback(struct rcu_head *head) 6995 { 6996 struct inode *inode = container_of(head, struct inode, i_rcu); 6997 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 6998 } 6999 7000 void btrfs_destroy_inode(struct inode *inode) 7001 { 7002 struct btrfs_ordered_extent *ordered; 7003 struct btrfs_root *root = BTRFS_I(inode)->root; 7004 7005 WARN_ON(!hlist_empty(&inode->i_dentry)); 7006 WARN_ON(inode->i_data.nrpages); 7007 WARN_ON(BTRFS_I(inode)->outstanding_extents); 7008 WARN_ON(BTRFS_I(inode)->reserved_extents); 7009 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 7010 WARN_ON(BTRFS_I(inode)->csum_bytes); 7011 7012 /* 7013 * This can happen where we create an inode, but somebody else also 7014 * created the same inode and we need to destroy the one we already 7015 * created. 7016 */ 7017 if (!root) 7018 goto free; 7019 7020 /* 7021 * Make sure we're properly removed from the ordered operation 7022 * lists. 7023 */ 7024 smp_mb(); 7025 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { 7026 spin_lock(&root->fs_info->ordered_extent_lock); 7027 list_del_init(&BTRFS_I(inode)->ordered_operations); 7028 spin_unlock(&root->fs_info->ordered_extent_lock); 7029 } 7030 7031 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 7032 &BTRFS_I(inode)->runtime_flags)) { 7033 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", 7034 (unsigned long long)btrfs_ino(inode)); 7035 atomic_dec(&root->orphan_inodes); 7036 } 7037 7038 while (1) { 7039 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 7040 if (!ordered) 7041 break; 7042 else { 7043 printk(KERN_ERR "btrfs found ordered " 7044 "extent %llu %llu on inode cleanup\n", 7045 (unsigned long long)ordered->file_offset, 7046 (unsigned long long)ordered->len); 7047 btrfs_remove_ordered_extent(inode, ordered); 7048 btrfs_put_ordered_extent(ordered); 7049 btrfs_put_ordered_extent(ordered); 7050 } 7051 } 7052 inode_tree_del(inode); 7053 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 7054 free: 7055 btrfs_remove_delayed_node(inode); 7056 call_rcu(&inode->i_rcu, btrfs_i_callback); 7057 } 7058 7059 int btrfs_drop_inode(struct inode *inode) 7060 { 7061 struct btrfs_root *root = BTRFS_I(inode)->root; 7062 7063 if (btrfs_root_refs(&root->root_item) == 0 && 7064 !btrfs_is_free_space_inode(inode)) 7065 return 1; 7066 else 7067 return generic_drop_inode(inode); 7068 } 7069 7070 static void init_once(void *foo) 7071 { 7072 struct btrfs_inode *ei = (struct btrfs_inode *) foo; 7073 7074 inode_init_once(&ei->vfs_inode); 7075 } 7076 7077 void btrfs_destroy_cachep(void) 7078 { 7079 if (btrfs_inode_cachep) 7080 kmem_cache_destroy(btrfs_inode_cachep); 7081 if (btrfs_trans_handle_cachep) 7082 kmem_cache_destroy(btrfs_trans_handle_cachep); 7083 if (btrfs_transaction_cachep) 7084 kmem_cache_destroy(btrfs_transaction_cachep); 7085 if (btrfs_path_cachep) 7086 kmem_cache_destroy(btrfs_path_cachep); 7087 if (btrfs_free_space_cachep) 7088 kmem_cache_destroy(btrfs_free_space_cachep); 7089 } 7090 7091 int btrfs_init_cachep(void) 7092 { 7093 btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", 7094 sizeof(struct btrfs_inode), 0, 7095 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); 7096 if (!btrfs_inode_cachep) 7097 goto fail; 7098 7099 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", 7100 sizeof(struct btrfs_trans_handle), 0, 7101 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7102 if (!btrfs_trans_handle_cachep) 7103 goto fail; 7104 7105 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", 7106 sizeof(struct btrfs_transaction), 0, 7107 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7108 if (!btrfs_transaction_cachep) 7109 goto fail; 7110 7111 btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", 7112 sizeof(struct btrfs_path), 0, 7113 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7114 if (!btrfs_path_cachep) 7115 goto fail; 7116 7117 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", 7118 sizeof(struct btrfs_free_space), 0, 7119 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7120 if (!btrfs_free_space_cachep) 7121 goto fail; 7122 7123 return 0; 7124 fail: 7125 btrfs_destroy_cachep(); 7126 return -ENOMEM; 7127 } 7128 7129 static int btrfs_getattr(struct vfsmount *mnt, 7130 struct dentry *dentry, struct kstat *stat) 7131 { 7132 struct inode *inode = dentry->d_inode; 7133 u32 blocksize = inode->i_sb->s_blocksize; 7134 7135 generic_fillattr(inode, stat); 7136 stat->dev = BTRFS_I(inode)->root->anon_dev; 7137 stat->blksize = PAGE_CACHE_SIZE; 7138 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + 7139 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; 7140 return 0; 7141 } 7142 7143 /* 7144 * If a file is moved, it will inherit the cow and compression flags of the new 7145 * directory. 7146 */ 7147 static void fixup_inode_flags(struct inode *dir, struct inode *inode) 7148 { 7149 struct btrfs_inode *b_dir = BTRFS_I(dir); 7150 struct btrfs_inode *b_inode = BTRFS_I(inode); 7151 7152 if (b_dir->flags & BTRFS_INODE_NODATACOW) 7153 b_inode->flags |= BTRFS_INODE_NODATACOW; 7154 else 7155 b_inode->flags &= ~BTRFS_INODE_NODATACOW; 7156 7157 if (b_dir->flags & BTRFS_INODE_COMPRESS) { 7158 b_inode->flags |= BTRFS_INODE_COMPRESS; 7159 b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; 7160 } else { 7161 b_inode->flags &= ~(BTRFS_INODE_COMPRESS | 7162 BTRFS_INODE_NOCOMPRESS); 7163 } 7164 } 7165 7166 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 7167 struct inode *new_dir, struct dentry *new_dentry) 7168 { 7169 struct btrfs_trans_handle *trans; 7170 struct btrfs_root *root = BTRFS_I(old_dir)->root; 7171 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 7172 struct inode *new_inode = new_dentry->d_inode; 7173 struct inode *old_inode = old_dentry->d_inode; 7174 struct timespec ctime = CURRENT_TIME; 7175 u64 index = 0; 7176 u64 root_objectid; 7177 int ret; 7178 u64 old_ino = btrfs_ino(old_inode); 7179 7180 if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 7181 return -EPERM; 7182 7183 /* we only allow rename subvolume link between subvolumes */ 7184 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 7185 return -EXDEV; 7186 7187 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || 7188 (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID)) 7189 return -ENOTEMPTY; 7190 7191 if (S_ISDIR(old_inode->i_mode) && new_inode && 7192 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 7193 return -ENOTEMPTY; 7194 /* 7195 * we're using rename to replace one file with another. 7196 * and the replacement file is large. Start IO on it now so 7197 * we don't add too much work to the end of the transaction 7198 */ 7199 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && 7200 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 7201 filemap_flush(old_inode->i_mapping); 7202 7203 /* close the racy window with snapshot create/destroy ioctl */ 7204 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 7205 down_read(&root->fs_info->subvol_sem); 7206 /* 7207 * We want to reserve the absolute worst case amount of items. So if 7208 * both inodes are subvols and we need to unlink them then that would 7209 * require 4 item modifications, but if they are both normal inodes it 7210 * would require 5 item modifications, so we'll assume their normal 7211 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items 7212 * should cover the worst case number of items we'll modify. 7213 */ 7214 trans = btrfs_start_transaction(root, 20); 7215 if (IS_ERR(trans)) { 7216 ret = PTR_ERR(trans); 7217 goto out_notrans; 7218 } 7219 7220 if (dest != root) 7221 btrfs_record_root_in_trans(trans, dest); 7222 7223 ret = btrfs_set_inode_index(new_dir, &index); 7224 if (ret) 7225 goto out_fail; 7226 7227 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 7228 /* force full log commit if subvolume involved. */ 7229 root->fs_info->last_trans_log_full_commit = trans->transid; 7230 } else { 7231 ret = btrfs_insert_inode_ref(trans, dest, 7232 new_dentry->d_name.name, 7233 new_dentry->d_name.len, 7234 old_ino, 7235 btrfs_ino(new_dir), index); 7236 if (ret) 7237 goto out_fail; 7238 /* 7239 * this is an ugly little race, but the rename is required 7240 * to make sure that if we crash, the inode is either at the 7241 * old name or the new one. pinning the log transaction lets 7242 * us make sure we don't allow a log commit to come in after 7243 * we unlink the name but before we add the new name back in. 7244 */ 7245 btrfs_pin_log_trans(root); 7246 } 7247 /* 7248 * make sure the inode gets flushed if it is replacing 7249 * something. 7250 */ 7251 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) 7252 btrfs_add_ordered_operation(trans, root, old_inode); 7253 7254 inode_inc_iversion(old_dir); 7255 inode_inc_iversion(new_dir); 7256 inode_inc_iversion(old_inode); 7257 old_dir->i_ctime = old_dir->i_mtime = ctime; 7258 new_dir->i_ctime = new_dir->i_mtime = ctime; 7259 old_inode->i_ctime = ctime; 7260 7261 if (old_dentry->d_parent != new_dentry->d_parent) 7262 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 7263 7264 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 7265 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; 7266 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, 7267 old_dentry->d_name.name, 7268 old_dentry->d_name.len); 7269 } else { 7270 ret = __btrfs_unlink_inode(trans, root, old_dir, 7271 old_dentry->d_inode, 7272 old_dentry->d_name.name, 7273 old_dentry->d_name.len); 7274 if (!ret) 7275 ret = btrfs_update_inode(trans, root, old_inode); 7276 } 7277 if (ret) { 7278 btrfs_abort_transaction(trans, root, ret); 7279 goto out_fail; 7280 } 7281 7282 if (new_inode) { 7283 inode_inc_iversion(new_inode); 7284 new_inode->i_ctime = CURRENT_TIME; 7285 if (unlikely(btrfs_ino(new_inode) == 7286 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 7287 root_objectid = BTRFS_I(new_inode)->location.objectid; 7288 ret = btrfs_unlink_subvol(trans, dest, new_dir, 7289 root_objectid, 7290 new_dentry->d_name.name, 7291 new_dentry->d_name.len); 7292 BUG_ON(new_inode->i_nlink == 0); 7293 } else { 7294 ret = btrfs_unlink_inode(trans, dest, new_dir, 7295 new_dentry->d_inode, 7296 new_dentry->d_name.name, 7297 new_dentry->d_name.len); 7298 } 7299 if (!ret && new_inode->i_nlink == 0) { 7300 ret = btrfs_orphan_add(trans, new_dentry->d_inode); 7301 BUG_ON(ret); 7302 } 7303 if (ret) { 7304 btrfs_abort_transaction(trans, root, ret); 7305 goto out_fail; 7306 } 7307 } 7308 7309 fixup_inode_flags(new_dir, old_inode); 7310 7311 ret = btrfs_add_link(trans, new_dir, old_inode, 7312 new_dentry->d_name.name, 7313 new_dentry->d_name.len, 0, index); 7314 if (ret) { 7315 btrfs_abort_transaction(trans, root, ret); 7316 goto out_fail; 7317 } 7318 7319 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { 7320 struct dentry *parent = new_dentry->d_parent; 7321 btrfs_log_new_name(trans, old_inode, old_dir, parent); 7322 btrfs_end_log_trans(root); 7323 } 7324 out_fail: 7325 btrfs_end_transaction(trans, root); 7326 out_notrans: 7327 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 7328 up_read(&root->fs_info->subvol_sem); 7329 7330 return ret; 7331 } 7332 7333 /* 7334 * some fairly slow code that needs optimization. This walks the list 7335 * of all the inodes with pending delalloc and forces them to disk. 7336 */ 7337 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 7338 { 7339 struct list_head *head = &root->fs_info->delalloc_inodes; 7340 struct btrfs_inode *binode; 7341 struct inode *inode; 7342 7343 if (root->fs_info->sb->s_flags & MS_RDONLY) 7344 return -EROFS; 7345 7346 spin_lock(&root->fs_info->delalloc_lock); 7347 while (!list_empty(head)) { 7348 binode = list_entry(head->next, struct btrfs_inode, 7349 delalloc_inodes); 7350 inode = igrab(&binode->vfs_inode); 7351 if (!inode) 7352 list_del_init(&binode->delalloc_inodes); 7353 spin_unlock(&root->fs_info->delalloc_lock); 7354 if (inode) { 7355 filemap_flush(inode->i_mapping); 7356 if (delay_iput) 7357 btrfs_add_delayed_iput(inode); 7358 else 7359 iput(inode); 7360 } 7361 cond_resched(); 7362 spin_lock(&root->fs_info->delalloc_lock); 7363 } 7364 spin_unlock(&root->fs_info->delalloc_lock); 7365 7366 /* the filemap_flush will queue IO into the worker threads, but 7367 * we have to make sure the IO is actually started and that 7368 * ordered extents get created before we return 7369 */ 7370 atomic_inc(&root->fs_info->async_submit_draining); 7371 while (atomic_read(&root->fs_info->nr_async_submits) || 7372 atomic_read(&root->fs_info->async_delalloc_pages)) { 7373 wait_event(root->fs_info->async_submit_wait, 7374 (atomic_read(&root->fs_info->nr_async_submits) == 0 && 7375 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 7376 } 7377 atomic_dec(&root->fs_info->async_submit_draining); 7378 return 0; 7379 } 7380 7381 static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 7382 const char *symname) 7383 { 7384 struct btrfs_trans_handle *trans; 7385 struct btrfs_root *root = BTRFS_I(dir)->root; 7386 struct btrfs_path *path; 7387 struct btrfs_key key; 7388 struct inode *inode = NULL; 7389 int err; 7390 int drop_inode = 0; 7391 u64 objectid; 7392 u64 index = 0 ; 7393 int name_len; 7394 int datasize; 7395 unsigned long ptr; 7396 struct btrfs_file_extent_item *ei; 7397 struct extent_buffer *leaf; 7398 unsigned long nr = 0; 7399 7400 name_len = strlen(symname) + 1; 7401 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 7402 return -ENAMETOOLONG; 7403 7404 /* 7405 * 2 items for inode item and ref 7406 * 2 items for dir items 7407 * 1 item for xattr if selinux is on 7408 */ 7409 trans = btrfs_start_transaction(root, 5); 7410 if (IS_ERR(trans)) 7411 return PTR_ERR(trans); 7412 7413 err = btrfs_find_free_ino(root, &objectid); 7414 if (err) 7415 goto out_unlock; 7416 7417 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 7418 dentry->d_name.len, btrfs_ino(dir), objectid, 7419 S_IFLNK|S_IRWXUGO, &index); 7420 if (IS_ERR(inode)) { 7421 err = PTR_ERR(inode); 7422 goto out_unlock; 7423 } 7424 7425 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 7426 if (err) { 7427 drop_inode = 1; 7428 goto out_unlock; 7429 } 7430 7431 /* 7432 * If the active LSM wants to access the inode during 7433 * d_instantiate it needs these. Smack checks to see 7434 * if the filesystem supports xattrs by looking at the 7435 * ops vector. 7436 */ 7437 inode->i_fop = &btrfs_file_operations; 7438 inode->i_op = &btrfs_file_inode_operations; 7439 7440 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 7441 if (err) 7442 drop_inode = 1; 7443 else { 7444 inode->i_mapping->a_ops = &btrfs_aops; 7445 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 7446 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 7447 } 7448 if (drop_inode) 7449 goto out_unlock; 7450 7451 path = btrfs_alloc_path(); 7452 if (!path) { 7453 err = -ENOMEM; 7454 drop_inode = 1; 7455 goto out_unlock; 7456 } 7457 key.objectid = btrfs_ino(inode); 7458 key.offset = 0; 7459 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 7460 datasize = btrfs_file_extent_calc_inline_size(name_len); 7461 err = btrfs_insert_empty_item(trans, root, path, &key, 7462 datasize); 7463 if (err) { 7464 drop_inode = 1; 7465 btrfs_free_path(path); 7466 goto out_unlock; 7467 } 7468 leaf = path->nodes[0]; 7469 ei = btrfs_item_ptr(leaf, path->slots[0], 7470 struct btrfs_file_extent_item); 7471 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 7472 btrfs_set_file_extent_type(leaf, ei, 7473 BTRFS_FILE_EXTENT_INLINE); 7474 btrfs_set_file_extent_encryption(leaf, ei, 0); 7475 btrfs_set_file_extent_compression(leaf, ei, 0); 7476 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 7477 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); 7478 7479 ptr = btrfs_file_extent_inline_start(ei); 7480 write_extent_buffer(leaf, symname, ptr, name_len); 7481 btrfs_mark_buffer_dirty(leaf); 7482 btrfs_free_path(path); 7483 7484 inode->i_op = &btrfs_symlink_inode_operations; 7485 inode->i_mapping->a_ops = &btrfs_symlink_aops; 7486 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 7487 inode_set_bytes(inode, name_len); 7488 btrfs_i_size_write(inode, name_len - 1); 7489 err = btrfs_update_inode(trans, root, inode); 7490 if (err) 7491 drop_inode = 1; 7492 7493 out_unlock: 7494 if (!err) 7495 d_instantiate(dentry, inode); 7496 nr = trans->blocks_used; 7497 btrfs_end_transaction(trans, root); 7498 if (drop_inode) { 7499 inode_dec_link_count(inode); 7500 iput(inode); 7501 } 7502 btrfs_btree_balance_dirty(root, nr); 7503 return err; 7504 } 7505 7506 static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 7507 u64 start, u64 num_bytes, u64 min_size, 7508 loff_t actual_len, u64 *alloc_hint, 7509 struct btrfs_trans_handle *trans) 7510 { 7511 struct btrfs_root *root = BTRFS_I(inode)->root; 7512 struct btrfs_key ins; 7513 u64 cur_offset = start; 7514 u64 i_size; 7515 int ret = 0; 7516 bool own_trans = true; 7517 7518 if (trans) 7519 own_trans = false; 7520 while (num_bytes > 0) { 7521 if (own_trans) { 7522 trans = btrfs_start_transaction(root, 3); 7523 if (IS_ERR(trans)) { 7524 ret = PTR_ERR(trans); 7525 break; 7526 } 7527 } 7528 7529 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, 7530 0, *alloc_hint, &ins, 1); 7531 if (ret) { 7532 if (own_trans) 7533 btrfs_end_transaction(trans, root); 7534 break; 7535 } 7536 7537 ret = insert_reserved_file_extent(trans, inode, 7538 cur_offset, ins.objectid, 7539 ins.offset, ins.offset, 7540 ins.offset, 0, 0, 0, 7541 BTRFS_FILE_EXTENT_PREALLOC); 7542 if (ret) { 7543 btrfs_abort_transaction(trans, root, ret); 7544 if (own_trans) 7545 btrfs_end_transaction(trans, root); 7546 break; 7547 } 7548 btrfs_drop_extent_cache(inode, cur_offset, 7549 cur_offset + ins.offset -1, 0); 7550 7551 num_bytes -= ins.offset; 7552 cur_offset += ins.offset; 7553 *alloc_hint = ins.objectid + ins.offset; 7554 7555 inode_inc_iversion(inode); 7556 inode->i_ctime = CURRENT_TIME; 7557 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 7558 if (!(mode & FALLOC_FL_KEEP_SIZE) && 7559 (actual_len > inode->i_size) && 7560 (cur_offset > inode->i_size)) { 7561 if (cur_offset > actual_len) 7562 i_size = actual_len; 7563 else 7564 i_size = cur_offset; 7565 i_size_write(inode, i_size); 7566 btrfs_ordered_update_i_size(inode, i_size, NULL); 7567 } 7568 7569 ret = btrfs_update_inode(trans, root, inode); 7570 7571 if (ret) { 7572 btrfs_abort_transaction(trans, root, ret); 7573 if (own_trans) 7574 btrfs_end_transaction(trans, root); 7575 break; 7576 } 7577 7578 if (own_trans) 7579 btrfs_end_transaction(trans, root); 7580 } 7581 return ret; 7582 } 7583 7584 int btrfs_prealloc_file_range(struct inode *inode, int mode, 7585 u64 start, u64 num_bytes, u64 min_size, 7586 loff_t actual_len, u64 *alloc_hint) 7587 { 7588 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 7589 min_size, actual_len, alloc_hint, 7590 NULL); 7591 } 7592 7593 int btrfs_prealloc_file_range_trans(struct inode *inode, 7594 struct btrfs_trans_handle *trans, int mode, 7595 u64 start, u64 num_bytes, u64 min_size, 7596 loff_t actual_len, u64 *alloc_hint) 7597 { 7598 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 7599 min_size, actual_len, alloc_hint, trans); 7600 } 7601 7602 static int btrfs_set_page_dirty(struct page *page) 7603 { 7604 return __set_page_dirty_nobuffers(page); 7605 } 7606 7607 static int btrfs_permission(struct inode *inode, int mask) 7608 { 7609 struct btrfs_root *root = BTRFS_I(inode)->root; 7610 umode_t mode = inode->i_mode; 7611 7612 if (mask & MAY_WRITE && 7613 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { 7614 if (btrfs_root_readonly(root)) 7615 return -EROFS; 7616 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) 7617 return -EACCES; 7618 } 7619 return generic_permission(inode, mask); 7620 } 7621 7622 static const struct inode_operations btrfs_dir_inode_operations = { 7623 .getattr = btrfs_getattr, 7624 .lookup = btrfs_lookup, 7625 .create = btrfs_create, 7626 .unlink = btrfs_unlink, 7627 .link = btrfs_link, 7628 .mkdir = btrfs_mkdir, 7629 .rmdir = btrfs_rmdir, 7630 .rename = btrfs_rename, 7631 .symlink = btrfs_symlink, 7632 .setattr = btrfs_setattr, 7633 .mknod = btrfs_mknod, 7634 .setxattr = btrfs_setxattr, 7635 .getxattr = btrfs_getxattr, 7636 .listxattr = btrfs_listxattr, 7637 .removexattr = btrfs_removexattr, 7638 .permission = btrfs_permission, 7639 .get_acl = btrfs_get_acl, 7640 }; 7641 static const struct inode_operations btrfs_dir_ro_inode_operations = { 7642 .lookup = btrfs_lookup, 7643 .permission = btrfs_permission, 7644 .get_acl = btrfs_get_acl, 7645 }; 7646 7647 static const struct file_operations btrfs_dir_file_operations = { 7648 .llseek = generic_file_llseek, 7649 .read = generic_read_dir, 7650 .readdir = btrfs_real_readdir, 7651 .unlocked_ioctl = btrfs_ioctl, 7652 #ifdef CONFIG_COMPAT 7653 .compat_ioctl = btrfs_ioctl, 7654 #endif 7655 .release = btrfs_release_file, 7656 .fsync = btrfs_sync_file, 7657 }; 7658 7659 static struct extent_io_ops btrfs_extent_io_ops = { 7660 .fill_delalloc = run_delalloc_range, 7661 .submit_bio_hook = btrfs_submit_bio_hook, 7662 .merge_bio_hook = btrfs_merge_bio_hook, 7663 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 7664 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 7665 .writepage_start_hook = btrfs_writepage_start_hook, 7666 .set_bit_hook = btrfs_set_bit_hook, 7667 .clear_bit_hook = btrfs_clear_bit_hook, 7668 .merge_extent_hook = btrfs_merge_extent_hook, 7669 .split_extent_hook = btrfs_split_extent_hook, 7670 }; 7671 7672 /* 7673 * btrfs doesn't support the bmap operation because swapfiles 7674 * use bmap to make a mapping of extents in the file. They assume 7675 * these extents won't change over the life of the file and they 7676 * use the bmap result to do IO directly to the drive. 7677 * 7678 * the btrfs bmap call would return logical addresses that aren't 7679 * suitable for IO and they also will change frequently as COW 7680 * operations happen. So, swapfile + btrfs == corruption. 7681 * 7682 * For now we're avoiding this by dropping bmap. 7683 */ 7684 static const struct address_space_operations btrfs_aops = { 7685 .readpage = btrfs_readpage, 7686 .writepage = btrfs_writepage, 7687 .writepages = btrfs_writepages, 7688 .readpages = btrfs_readpages, 7689 .direct_IO = btrfs_direct_IO, 7690 .invalidatepage = btrfs_invalidatepage, 7691 .releasepage = btrfs_releasepage, 7692 .set_page_dirty = btrfs_set_page_dirty, 7693 .error_remove_page = generic_error_remove_page, 7694 }; 7695 7696 static const struct address_space_operations btrfs_symlink_aops = { 7697 .readpage = btrfs_readpage, 7698 .writepage = btrfs_writepage, 7699 .invalidatepage = btrfs_invalidatepage, 7700 .releasepage = btrfs_releasepage, 7701 }; 7702 7703 static const struct inode_operations btrfs_file_inode_operations = { 7704 .getattr = btrfs_getattr, 7705 .setattr = btrfs_setattr, 7706 .setxattr = btrfs_setxattr, 7707 .getxattr = btrfs_getxattr, 7708 .listxattr = btrfs_listxattr, 7709 .removexattr = btrfs_removexattr, 7710 .permission = btrfs_permission, 7711 .fiemap = btrfs_fiemap, 7712 .get_acl = btrfs_get_acl, 7713 .update_time = btrfs_update_time, 7714 }; 7715 static const struct inode_operations btrfs_special_inode_operations = { 7716 .getattr = btrfs_getattr, 7717 .setattr = btrfs_setattr, 7718 .permission = btrfs_permission, 7719 .setxattr = btrfs_setxattr, 7720 .getxattr = btrfs_getxattr, 7721 .listxattr = btrfs_listxattr, 7722 .removexattr = btrfs_removexattr, 7723 .get_acl = btrfs_get_acl, 7724 .update_time = btrfs_update_time, 7725 }; 7726 static const struct inode_operations btrfs_symlink_inode_operations = { 7727 .readlink = generic_readlink, 7728 .follow_link = page_follow_link_light, 7729 .put_link = page_put_link, 7730 .getattr = btrfs_getattr, 7731 .setattr = btrfs_setattr, 7732 .permission = btrfs_permission, 7733 .setxattr = btrfs_setxattr, 7734 .getxattr = btrfs_getxattr, 7735 .listxattr = btrfs_listxattr, 7736 .removexattr = btrfs_removexattr, 7737 .get_acl = btrfs_get_acl, 7738 .update_time = btrfs_update_time, 7739 }; 7740 7741 const struct dentry_operations btrfs_dentry_operations = { 7742 .d_delete = btrfs_dentry_delete, 7743 .d_release = btrfs_dentry_release, 7744 }; 7745