/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"

struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};
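/*
 * Note (added for clarity, not in the original source): the table above
 * indexes the S_IFMT type nibble shifted down by S_SHIFT (12) bits.
 * For example S_IFDIR is 0040000, so S_IFDIR >> 12 == 4 and
 * btrfs_type_by_mode[4] == BTRFS_FT_DIR.
 */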
static int btrfs_setsize(struct inode *inode, loff_t newsize);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);
static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode);

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int err = 0;
	int ret;
	size_t cur_size = size;
	size_t datasize;
	unsigned long offset;

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;

	key.objectid = btrfs_ino(inode);
	key.offset = start;
	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
	datasize = btrfs_file_extent_calc_inline_size(cur_size);

	inode_add_bytes(inode, size);
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (ret) {
		err = ret;
		goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
					 PAGE_CACHE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_CACHE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = start & (PAGE_CACHE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		page_cache_release(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);

	return ret;
fail:
	btrfs_free_path(path);
	return err;
}

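/*
 * Note (added for clarity, not in the original source):
 * cow_file_range_inline() below rounds the end of the range up to the
 * sector size with the usual power-of-two trick:
 *
 *	aligned = (x + align - 1) & ~(align - 1)
 *
 * e.g. with a 4096-byte sectorsize, end = 6000 gives aligned_end = 8192.
 */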
/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct inode *inode, u64 start, u64 end,
				 size_t compressed_size, int compress_type,
				 struct page **compressed_pages)
{
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = (end + root->sectorsize - 1) &
			~((u64)root->sectorsize - 1);
	u64 hint_byte;
	u64 data_len = inline_len;
	int ret;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end >= PAGE_CACHE_SIZE ||
	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
	    (!compressed_size &&
	     (actual_end & (root->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > root->fs_info->max_inline) {
		return 1;
	}

	ret = btrfs_drop_extents(trans, inode, start, aligned_end,
				 &hint_byte, 1);
	if (ret)
		return ret;

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	} else if (ret == -ENOSPC) {
		return 1;
	}

	btrfs_delalloc_release_metadata(inode, end + 1 - start);
	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
	return 0;
}

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that pdflush sent them down.
 */
static noinline int compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 num_bytes;
	u64 blocksize = root->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long nr_pages_ret = 0;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned long max_compressed = 128 * 1024;
	unsigned long max_uncompressed = 128 * 1024;
	int i;
	int will_compress;
	int compress_type = root->fs_info->compress_type;

	/* if this is a small write inside eof, kick off a defrag */
	if ((end - start + 1) < 16 * 1024 &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
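	/*
	 * Note (added for clarity, not in the original source): with
	 * 4096-byte pages the min() above caps each compression pass at
	 * 128K / 4K = 32 pages, matching the 128K limits on compressed
	 * extents set just above.
	 */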
	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/* we want to make sure that amount of ram required to uncompress
	 * an extent is reasonable, so we limit the total size in ram
	 * of a compressed extent to 128k.  This is a crucial number
	 * because it also controls how easily we can spread reads across
	 * cpus for decompression.
	 *
	 * We also want to make sure the amount of IO required to do
	 * a random read is reasonably small, so we limit the size of
	 * a compressed extent to 128k.
	 */
	total_compressed = min(total_compressed, max_uncompressed);
	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
	num_bytes = max(blocksize, num_bytes);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
	    (btrfs_test_opt(root, COMPRESS) ||
	     (BTRFS_I(inode)->force_compress) ||
	     (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
		WARN_ON(pages);
		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			goto cont;
		}

		if (BTRFS_I(inode)->force_compress)
			compress_type = BTRFS_I(inode)->force_compress;

		ret = btrfs_compress_pages(compress_type,
					   inode->i_mapping, start,
					   total_compressed, pages,
					   nr_pages, &nr_pages_ret,
					   &total_in,
					   &total_compressed,
					   max_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_CACHE_SIZE - 1);
			struct page *page = pages[nr_pages_ret - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_CACHE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto cleanup_and_out;
		}
		trans->block_rsv = &root->fs_info->delalloc_block_rsv;

		/* let's try to make an inline extent */
		if (ret || total_in < (actual_end - start)) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(trans, root, inode,
						    start, end, 0, 0, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(trans, root, inode,
						    start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 */
			extent_clear_unlock_delalloc(inode,
			     &BTRFS_I(inode)->io_tree,
			     start, end, NULL,
			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
			     EXTENT_CLEAR_DELALLOC |
			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);

			btrfs_end_transaction(trans, root);
			goto free_pages_out;
		}
		btrfs_end_transaction(trans, root);
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, round the compressed
		 * size up to a block size boundary so the allocator does
		 * sane things
		 */
		total_compressed = (total_compressed + blocksize - 1) &
			~(blocksize - 1);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk
		 */
		total_in = (total_in + PAGE_CACHE_SIZE - 1) &
			~(PAGE_CACHE_SIZE - 1);
		if (total_compressed >= total_in) {
			will_compress = 0;
		} else {
			num_bytes = total_in;
		}
	}
	if (!will_compress && pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages_ret; i++) {
			WARN_ON(pages[i]->mapping);
			page_cache_release(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages_ret = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->force_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
	if (will_compress) {
		*num_added += 1;

		/* the async work queues will take care of doing actual
		 * allocation on disk for these compressed pages,
		 * and will submit them to the elevator.
		 */
		add_async_extent(async_cow, start, num_bytes,
				 total_compressed, pages, nr_pages_ret,
				 compress_type);

		if (start + num_bytes < end) {
			start += num_bytes;
			pages = NULL;
			cond_resched();
			goto again;
		}
	} else {
cleanup_and_bail_uncompressed:
		/*
		 * No compression, but we still need to write the pages in
		 * the file we've been given so far.  redirty the locked
		 * page if it corresponds to our extent and set things up
		 * for the async work queue to run cow_file_range to do
		 * the normal delalloc dance
		 */
		if (page_offset(locked_page) >= start &&
		    page_offset(locked_page) <= end) {
			__set_page_dirty_nobuffers(locked_page);
			/* unlocked later on in the async handlers */
		}
		add_async_extent(async_cow, start, end - start + 1,
				 0, NULL, 0, BTRFS_COMPRESS_NONE);
		*num_added += 1;
	}

out:
	return ret;

free_pages_out:
	for (i = 0; i < nr_pages_ret; i++) {
		WARN_ON(pages[i]->mapping);
		page_cache_release(pages[i]);
	}
	kfree(pages);

	goto out;

cleanup_and_out:
	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
				     start, end, NULL,
				     EXTENT_CLEAR_UNLOCK_PAGE |
				     EXTENT_CLEAR_DIRTY |
				     EXTENT_CLEAR_DELALLOC |
				     EXTENT_SET_WRITEBACK |
				     EXTENT_END_WRITEBACK);
	if (!trans || IS_ERR(trans))
		btrfs_error(root->fs_info, ret, "Failed to join transaction");
	else
		btrfs_abort_transaction(trans, root, ret);
	goto free_pages_out;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline int submit_compressed_extents(struct inode *inode,
					      struct async_cow *async_cow)
{
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_trans_handle *trans;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree;
	int ret = 0;

	if (list_empty(&async_cow->extents))
		return 0;


	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
				    async_extent->start +
				    async_extent->ram_size - 1);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0);

			/* JDM XXX */

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(io_tree,
						  inode, async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  btrfs_get_extent,
						  WB_SYNC_ALL);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
		} else {
			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
			ret = btrfs_reserve_extent(trans, root,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1);
			if (ret)
				btrfs_abort_transaction(trans, root, ret);
			btrfs_end_transaction(trans, root);
		}

		if (ret) {
			int i;
			for (i = 0; i < async_extent->nr_pages; i++) {
				WARN_ON(async_extent->pages[i]->mapping);
				page_cache_release(async_extent->pages[i]);
			}
			kfree(async_extent->pages);
			async_extent->nr_pages = 0;
			async_extent->pages = NULL;
			unlock_extent(io_tree, async_extent->start,
				      async_extent->start +
				      async_extent->ram_size - 1);
			if (ret == -ENOSPC)
				goto retry;
			goto out_free; /* JDM: Requeue? */
		}
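		/*
		 * Note (added for clarity, not in the original source):
		 * the add_extent_mapping() loop below returns -EEXIST when
		 * a stale cached mapping overlaps the new one; dropping the
		 * cached range and retrying guarantees the new mapping
		 * eventually gets inserted.
		 */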
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		btrfs_drop_extent_cache(inode, async_extent->start,
					async_extent->start +
					async_extent->ram_size - 1, 0);

		em = alloc_extent_map();
		BUG_ON(!em); /* -ENOMEM */
		em->start = async_extent->start;
		em->len = async_extent->ram_size;
		em->orig_start = em->start;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		em->compress_type = async_extent->compress_type;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
		}

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		BUG_ON(ret); /* -ENOMEM */

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode,
				&BTRFS_I(inode)->io_tree,
				async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_CLEAR_UNLOCK_PAGE |
				EXTENT_CLEAR_UNLOCK |
				EXTENT_CLEAR_DELALLOC |
				EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);

		ret = btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages);

		BUG_ON(ret); /* -ENOMEM */
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}
	ret = 0;
out:
	return ret;
out_free:
	kfree(async_extent);
	goto out;
}

static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}
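/*
 * Note (added for clarity, not in the original source): the hint
 * returned above is only advisory; it steers btrfs_reserve_extent()
 * toward the disk area already used by this file so that sequential
 * writes tend to stay physically contiguous.
 */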
/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written,
				   int unlock)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 disk_num_bytes;
	u64 cur_alloc_size;
	u64 blocksize = root->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	int ret = 0;

	BUG_ON(btrfs_is_free_space_inode(inode));
	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		extent_clear_unlock_delalloc(inode,
			     &BTRFS_I(inode)->io_tree,
			     start, end, locked_page,
			     EXTENT_CLEAR_UNLOCK_PAGE |
			     EXTENT_CLEAR_UNLOCK |
			     EXTENT_CLEAR_DELALLOC |
			     EXTENT_CLEAR_DIRTY |
			     EXTENT_SET_WRITEBACK |
			     EXTENT_END_WRITEBACK);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
	num_bytes = max(blocksize, num_bytes);
	disk_num_bytes = num_bytes;
	ret = 0;

	/* if this is a small write inside eof, kick off defrag */
	if (num_bytes < 64 * 1024 &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(trans, inode);

	if (start == 0) {
		/* let's try to make an inline extent */
		ret = cow_file_range_inline(trans, root, inode,
					    start, end, 0, 0, NULL);
		if (ret == 0) {
			extent_clear_unlock_delalloc(inode,
				     &BTRFS_I(inode)->io_tree,
				     start, end, NULL,
				     EXTENT_CLEAR_UNLOCK_PAGE |
				     EXTENT_CLEAR_UNLOCK |
				     EXTENT_CLEAR_DELALLOC |
				     EXTENT_CLEAR_DIRTY |
				     EXTENT_SET_WRITEBACK |
				     EXTENT_END_WRITEBACK);

			*nr_written = *nr_written +
			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
			*page_started = 1;
			goto out;
		} else if (ret < 0) {
			btrfs_abort_transaction(trans, root, ret);
			goto out_unlock;
		}
	}

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(root->fs_info->super_copy));

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

	while (disk_num_bytes > 0) {
		unsigned long op;

		cur_alloc_size = disk_num_bytes;
		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
					   root->sectorsize, 0, alloc_hint,
					   &ins, 1);
		if (ret < 0) {
			btrfs_abort_transaction(trans, root, ret);
			goto out_unlock;
		}

		em = alloc_extent_map();
		BUG_ON(!em); /* -ENOMEM */
		em->start = start;
		em->orig_start = em->start;
		ram_size = ins.offset;
		em->len = ins.offset;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}

		cur_alloc_size = ins.offset;
		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		BUG_ON(ret); /* -ENOMEM */
		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				goto out_unlock;
			}
		}

		if (disk_num_bytes < cur_alloc_size)
			break;

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
		 */
		op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
		op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
			EXTENT_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
					     start, start + ram_size - 1,
					     locked_page, op);
		disk_num_bytes -= cur_alloc_size;
		num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
	}
	ret = 0;
out:
	btrfs_end_transaction(trans, root);

	return ret;
out_unlock:
	extent_clear_unlock_delalloc(inode,
		     &BTRFS_I(inode)->io_tree,
		     start, end, locked_page,
		     EXTENT_CLEAR_UNLOCK_PAGE |
		     EXTENT_CLEAR_UNLOCK |
		     EXTENT_CLEAR_DELALLOC |
		     EXTENT_CLEAR_DIRTY |
		     EXTENT_SET_WRITEBACK |
		     EXTENT_END_WRITEBACK);

	goto out;
}

/*
 * work queue callback to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	int num_added = 0;
	async_cow = container_of(work, struct async_cow, work);

	compress_file_range(async_cow->inode, async_cow->locked_page,
			    async_cow->start, async_cow->end, async_cow,
			    &num_added);
	if (num_added == 0) {
		btrfs_add_delayed_iput(async_cow->inode);
		async_cow->inode = NULL;
	}
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
		PAGE_CACHE_SHIFT;

	atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);

	if (atomic_read(&root->fs_info->async_delalloc_pages) <
	    5 * 1024 * 1024 &&
	    waitqueue_active(&root->fs_info->async_submit_wait))
		wake_up(&root->fs_info->async_submit_wait);

	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	async_cow = container_of(work, struct async_cow, work);
	if (async_cow->inode)
		btrfs_add_delayed_iput(async_cow->inode);
	kfree(async_cow);
}
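/*
 * Note (added for clarity, not in the original source):
 * cow_file_range_async() below carves the delalloc range into chunks
 * of at most 512K, queues one async_cow work item per chunk, and then
 * throttles the caller whenever fs_info->async_delalloc_pages grows
 * past the local limit.
 */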
static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;
	int limit = 10 * 1024 * 1024;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL, GFP_NOFS);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		BUG_ON(!async_cow); /* -ENOMEM */
		async_cow->inode = igrab(inode);
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;

		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
			cur_end = end;
		else
			cur_end = min(end, start + 512 * 1024 - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		async_cow->work.func = async_cow_start;
		async_cow->work.ordered_func = async_cow_submit;
		async_cow->work.ordered_free = async_cow_free;
		async_cow->work.flags = 0;

		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
			PAGE_CACHE_SHIFT;
		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

		btrfs_queue_worker(&root->fs_info->delalloc_workers,
				   &async_cow->work);

		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
			wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->async_delalloc_pages) <
			    limit));
		}

		while (atomic_read(&root->fs_info->async_submit_draining) &&
		       atomic_read(&root->fs_info->async_delalloc_pages)) {
			wait_event(root->fs_info->async_submit_wait,
			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
			   0));
		}

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}

static noinline int csum_exist_in_range(struct btrfs_root *root,
					u64 bytenr, u64 num_bytes)
{
	int ret;
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list, 0);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	return 1;
}
1113 * 1114 * If no cow copies or snapshots exist, we write directly to the existing 1115 * blocks on disk 1116 */ 1117 static noinline int run_delalloc_nocow(struct inode *inode, 1118 struct page *locked_page, 1119 u64 start, u64 end, int *page_started, int force, 1120 unsigned long *nr_written) 1121 { 1122 struct btrfs_root *root = BTRFS_I(inode)->root; 1123 struct btrfs_trans_handle *trans; 1124 struct extent_buffer *leaf; 1125 struct btrfs_path *path; 1126 struct btrfs_file_extent_item *fi; 1127 struct btrfs_key found_key; 1128 u64 cow_start; 1129 u64 cur_offset; 1130 u64 extent_end; 1131 u64 extent_offset; 1132 u64 disk_bytenr; 1133 u64 num_bytes; 1134 int extent_type; 1135 int ret, err; 1136 int type; 1137 int nocow; 1138 int check_prev = 1; 1139 bool nolock; 1140 u64 ino = btrfs_ino(inode); 1141 1142 path = btrfs_alloc_path(); 1143 if (!path) { 1144 extent_clear_unlock_delalloc(inode, 1145 &BTRFS_I(inode)->io_tree, 1146 start, end, locked_page, 1147 EXTENT_CLEAR_UNLOCK_PAGE | 1148 EXTENT_CLEAR_UNLOCK | 1149 EXTENT_CLEAR_DELALLOC | 1150 EXTENT_CLEAR_DIRTY | 1151 EXTENT_SET_WRITEBACK | 1152 EXTENT_END_WRITEBACK); 1153 return -ENOMEM; 1154 } 1155 1156 nolock = btrfs_is_free_space_inode(inode); 1157 1158 if (nolock) 1159 trans = btrfs_join_transaction_nolock(root); 1160 else 1161 trans = btrfs_join_transaction(root); 1162 1163 if (IS_ERR(trans)) { 1164 extent_clear_unlock_delalloc(inode, 1165 &BTRFS_I(inode)->io_tree, 1166 start, end, locked_page, 1167 EXTENT_CLEAR_UNLOCK_PAGE | 1168 EXTENT_CLEAR_UNLOCK | 1169 EXTENT_CLEAR_DELALLOC | 1170 EXTENT_CLEAR_DIRTY | 1171 EXTENT_SET_WRITEBACK | 1172 EXTENT_END_WRITEBACK); 1173 btrfs_free_path(path); 1174 return PTR_ERR(trans); 1175 } 1176 1177 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1178 1179 cow_start = (u64)-1; 1180 cur_offset = start; 1181 while (1) { 1182 ret = btrfs_lookup_file_extent(trans, root, path, ino, 1183 cur_offset, 0); 1184 if (ret < 0) { 1185 btrfs_abort_transaction(trans, root, ret); 1186 goto error; 1187 } 1188 if (ret > 0 && path->slots[0] > 0 && check_prev) { 1189 leaf = path->nodes[0]; 1190 btrfs_item_key_to_cpu(leaf, &found_key, 1191 path->slots[0] - 1); 1192 if (found_key.objectid == ino && 1193 found_key.type == BTRFS_EXTENT_DATA_KEY) 1194 path->slots[0]--; 1195 } 1196 check_prev = 0; 1197 next_slot: 1198 leaf = path->nodes[0]; 1199 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1200 ret = btrfs_next_leaf(root, path); 1201 if (ret < 0) { 1202 btrfs_abort_transaction(trans, root, ret); 1203 goto error; 1204 } 1205 if (ret > 0) 1206 break; 1207 leaf = path->nodes[0]; 1208 } 1209 1210 nocow = 0; 1211 disk_bytenr = 0; 1212 num_bytes = 0; 1213 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1214 1215 if (found_key.objectid > ino || 1216 found_key.type > BTRFS_EXTENT_DATA_KEY || 1217 found_key.offset > end) 1218 break; 1219 1220 if (found_key.offset > cur_offset) { 1221 extent_end = found_key.offset; 1222 extent_type = 0; 1223 goto out_check; 1224 } 1225 1226 fi = btrfs_item_ptr(leaf, path->slots[0], 1227 struct btrfs_file_extent_item); 1228 extent_type = btrfs_file_extent_type(leaf, fi); 1229 1230 if (extent_type == BTRFS_FILE_EXTENT_REG || 1231 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1232 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1233 extent_offset = btrfs_file_extent_offset(leaf, fi); 1234 extent_end = found_key.offset + 1235 btrfs_file_extent_num_bytes(leaf, fi); 1236 if (extent_end <= start) { 1237 path->slots[0]++; 1238 goto next_slot; 1239 } 1240 if (disk_bytenr == 0) 1241 
			if (disk_bytenr == 0)
				goto out_check;
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;
			if (btrfs_extent_readonly(root, disk_bytenr))
				goto out_check;
			if (btrfs_cross_ref_exist(trans, root, ino,
						  found_key.offset -
						  extent_offset, disk_bytenr))
				goto out_check;
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * force cow if csum exists in the range.
			 * this ensures that csums for a given extent are
			 * either valid or do not exist.
			 */
			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
				goto out_check;
			nocow = 1;
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = found_key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
			extent_end = ALIGN(extent_end, root->sectorsize);
		} else {
			BUG_ON(1);
		}
out_check:
		if (extent_end <= start) {
			path->slots[0]++;
			goto next_slot;
		}
		if (!nocow) {
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			path->slots[0]++;
			goto next_slot;
		}

		btrfs_release_path(path);
		if (cow_start != (u64)-1) {
			ret = cow_file_range(inode, locked_page, cow_start,
					     found_key.offset - 1, page_started,
					     nr_written, 1);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				goto error;
			}
			cow_start = (u64)-1;
		}

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct extent_map *em;
			struct extent_map_tree *em_tree;
			em_tree = &BTRFS_I(inode)->extent_tree;
			em = alloc_extent_map();
			BUG_ON(!em); /* -ENOMEM */
			em->start = cur_offset;
			em->orig_start = em->start;
			em->len = num_bytes;
			em->block_len = num_bytes;
			em->block_start = disk_bytenr;
			em->bdev = root->fs_info->fs_devices->latest_bdev;
			set_bit(EXTENT_FLAG_PINNED, &em->flags);
			while (1) {
				write_lock(&em_tree->lock);
				ret = add_extent_mapping(em_tree, em);
				write_unlock(&em_tree->lock);
				if (ret != -EEXIST) {
					free_extent_map(em);
					break;
				}
				btrfs_drop_extent_cache(inode, em->start,
						em->start + em->len - 1, 0);
			}
			type = BTRFS_ORDERED_PREALLOC;
		} else {
			type = BTRFS_ORDERED_NOCOW;
		}

		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
					       num_bytes, num_bytes, type);
		BUG_ON(ret); /* -ENOMEM */

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, cur_offset,
						      num_bytes);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				goto error;
			}
		}

		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
				cur_offset, cur_offset + num_bytes - 1,
				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
				EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
				EXTENT_SET_PRIVATE2);
		cur_offset = extent_end;
		if (cur_offset > end)
			break;
	}
	btrfs_release_path(path);

	if (cur_offset <= end && cow_start == (u64)-1) {
		cow_start = cur_offset;
		cur_offset = end;
	}

	if (cow_start != (u64)-1) {
		ret = cow_file_range(inode, locked_page, cow_start, end,
				     page_started, nr_written, 1);
		if (ret) {
			btrfs_abort_transaction(trans, root, ret);
			goto error;
		}
	}

error:
	if (nolock) {
		err = btrfs_end_transaction_nolock(trans, root);
	} else {
		err = btrfs_end_transaction(trans, root);
	}
	if (!ret)
		ret = err;

	if (ret && cur_offset < end)
		extent_clear_unlock_delalloc(inode,
			     &BTRFS_I(inode)->io_tree,
			     cur_offset, end, locked_page,
			     EXTENT_CLEAR_UNLOCK_PAGE |
			     EXTENT_CLEAR_UNLOCK |
			     EXTENT_CLEAR_DELALLOC |
			     EXTENT_CLEAR_DIRTY |
			     EXTENT_SET_WRITEBACK |
			     EXTENT_END_WRITEBACK);

	btrfs_free_path(path);
	return ret;
}

/*
 * extent_io.c callback to do delayed allocation processing
 */
static int run_delalloc_range(struct inode *inode, struct page *locked_page,
			      u64 start, u64 end, int *page_started,
			      unsigned long *nr_written)
{
	int ret;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 1, nr_written);
	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
	} else if (!btrfs_test_opt(root, COMPRESS) &&
		   !(BTRFS_I(inode)->force_compress) &&
		   !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
		ret = cow_file_range(inode, locked_page, start, end,
				     page_started, nr_written, 1);
	} else {
		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			&BTRFS_I(inode)->runtime_flags);
		ret = cow_file_range_async(inode, locked_page, start, end,
					   page_started, nr_written);
	}
	return ret;
}

static void btrfs_split_extent_hook(struct inode *inode,
				    struct extent_state *orig, u64 split)
{
	/* not delalloc, ignore it */
	if (!(orig->state & EXTENT_DELALLOC))
		return;

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents++;
	spin_unlock(&BTRFS_I(inode)->lock);
}

/*
 * extent_io.c merge_extent_hook, used to track merged delayed allocation
 * extents so we can keep track of new extents that are just merged onto old
 * extents, such as when we are doing sequential writes, so we can properly
 * account for the metadata space we'll need.
 */
static void btrfs_merge_extent_hook(struct inode *inode,
				    struct extent_state *new,
				    struct extent_state *other)
{
	/* not delalloc, ignore it */
	if (!(other->state & EXTENT_DELALLOC))
		return;

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents--;
	spin_unlock(&BTRFS_I(inode)->lock);
}
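/*
 * Note (added for clarity, not in the original source): the split and
 * merge hooks above keep outstanding_extents in step with the extent
 * state tree, e.g. splitting one delalloc extent in two leaves two
 * separate reservations outstanding, so the counter is bumped; a merge
 * undoes that.
 */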
/*
 * extent_io.c set_bit_hook, used to track delayed allocation
 * bytes in this file, and to maintain the list of inodes that
 * have pending delalloc work to be done.
 */
static void btrfs_set_bit_hook(struct inode *inode,
			       struct extent_state *state, int *bits)
{

	/*
	 * set_bit and clear_bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		bool do_list = !btrfs_is_free_space_inode(inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents++;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		spin_lock(&root->fs_info->delalloc_lock);
		BTRFS_I(inode)->delalloc_bytes += len;
		root->fs_info->delalloc_bytes += len;
		if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
				      &root->fs_info->delalloc_inodes);
		}
		spin_unlock(&root->fs_info->delalloc_lock);
	}
}

/*
 * extent_io.c clear_bit_hook, see set_bit_hook for why
 */
static void btrfs_clear_bit_hook(struct inode *inode,
				 struct extent_state *state, int *bits)
{
	/*
	 * set_bit and clear_bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		bool do_list = !btrfs_is_free_space_inode(inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents--;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		if (*bits & EXTENT_DO_ACCOUNTING)
			btrfs_delalloc_release_metadata(inode, len);

		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
		    && do_list)
			btrfs_free_reserved_data_space(inode, len);

		spin_lock(&root->fs_info->delalloc_lock);
		root->fs_info->delalloc_bytes -= len;
		BTRFS_I(inode)->delalloc_bytes -= len;

		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
		}
		spin_unlock(&root->fs_info->delalloc_lock);
	}
}
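/*
 * Note (added for clarity, not in the original source): in
 * btrfs_merge_bio_hook() below, bio->bi_sector counts 512-byte
 * sectors, so "<< 9" converts it to the logical byte offset that
 * btrfs_map_block() expects.
 */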
/*
 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
 * we don't create bios that span stripes or chunks
 */
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
			 size_t size, struct bio *bio,
			 unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	struct btrfs_mapping_tree *map_tree;
	u64 logical = (u64)bio->bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;

	if (bio_flags & EXTENT_BIO_COMPRESSED)
		return 0;

	length = bio->bi_size;
	map_tree = &root->fs_info->mapping_tree;
	map_length = length;
	ret = btrfs_map_block(map_tree, READ, logical,
			      &map_length, NULL, 0);
	/* Will always return 0 or 1 with map_multi == NULL */
	BUG_ON(ret < 0);
	if (map_length < length + size)
		return 1;
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.   All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags,
				    u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;

	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
	BUG_ON(ret); /* -ENOMEM */
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.   All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
				   int mirror_num, unsigned long bio_flags,
				   u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
}

/*
 * extent_io.c submission hook. This does the right thing for csum calculation
 * on write, or reading the csums from the tree before a read
 */
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
				 int mirror_num, unsigned long bio_flags,
				 u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	int skip_sum;
	int metadata = 0;

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	if (btrfs_is_free_space_inode(inode))
		metadata = 2;

	if (!(rw & REQ_WRITE)) {
		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
		if (ret)
			return ret;

		if (bio_flags & EXTENT_BIO_COMPRESSED) {
			return btrfs_submit_compressed_read(inode, bio,
						    mirror_num, bio_flags);
		} else if (!skip_sum) {
			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
			if (ret)
				return ret;
		}
		goto mapit;
	} else if (!skip_sum) {
		/* csum items have already been cloned */
		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
			goto mapit;
		/* we're doing a write, do the async checksumming */
		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
				   inode, rw, bio, mirror_num,
				   bio_flags, bio_offset,
				   __btrfs_submit_bio_start,
				   __btrfs_submit_bio_done);
	}

mapit:
	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
}
/*
 * given a list of ordered sums record them in the inode.  This happens
 * at IO completion time based on sums calculated at bio submission time.
 */
static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
			     struct inode *inode, u64 file_offset,
			     struct list_head *list)
{
	struct btrfs_ordered_sum *sum;

	list_for_each_entry(sum, list, list) {
		btrfs_csum_file_blocks(trans,
		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
	}
	return 0;
}

int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
			      struct extent_state **cached_state)
{
	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
		WARN_ON(1);
	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
				   cached_state, GFP_NOFS);
}

/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
	struct page *page;
	struct btrfs_work work;
};

static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct page *page;
	struct inode *inode;
	u64 page_start;
	u64 page_end;
	int ret;

	fixup = container_of(work, struct btrfs_writepage_fixup, work);
	page = fixup->page;
again:
	lock_page(page);
	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
		ClearPageChecked(page);
		goto out_page;
	}

	inode = page->mapping->host;
	page_start = page_offset(page);
	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
			 &cached_state);

	/* already ordered? We're done */
	if (PagePrivate2(page))
		goto out;

	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered) {
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
				     page_end, &cached_state, GFP_NOFS);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
	if (ret) {
		mapping_set_error(page->mapping, ret);
		end_extent_writepage(page, ret, page_start, page_end);
		ClearPageChecked(page);
		goto out;
	}

	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
	ClearPageChecked(page);
	set_page_dirty(page);
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
			     &cached_state, GFP_NOFS);
out_page:
	unlock_page(page);
	page_cache_release(page);
	kfree(fixup);
}

/*
 * There are a few paths in the higher layers of the kernel that directly
 * set the page dirty bit without asking the filesystem if it is a
 * good idea.  This causes problems because we want to make sure COW
 * properly happens and the data=ordered rules are followed.
 *
 * In our case any range that doesn't have the ORDERED bit set
 * hasn't been properly setup for IO.  We kick off an async process
 * to fix it up.  The async helper will wait for ordered extents, set
 * the delalloc bit and make it safe to write the page.
 */
static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* this page is properly in the ordered list */
	if (TestClearPagePrivate2(page))
		return 0;

	if (PageChecked(page))
		return -EAGAIN;

	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
	if (!fixup)
		return -EAGAIN;

	SetPageChecked(page);
	page_cache_get(page);
	fixup->work.func = btrfs_writepage_fixup_worker;
	fixup->page = page;
	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
	return -EBUSY;
}

static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
				       struct inode *inode, u64 file_pos,
				       u64 disk_bytenr, u64 disk_num_bytes,
				       u64 num_bytes, u64 ram_bytes,
				       u8 compression, u8 encryption,
				       u16 other_encoding, int extent_type)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key ins;
	u64 hint;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;

	/*
	 * we may be replacing one extent in the tree with another.
	 * The new extent is pinned in the extent map, and we don't want
	 * to drop it from the cache until it is completely in the btree.
	 *
	 * So, tell btrfs_drop_extents to leave this extent in the cache.
	 * the caller is expected to unpin it and allow it to be merged
	 * with the others.
	 */
	ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
				 &hint, 0);
	if (ret)
		goto out;

	ins.objectid = btrfs_ino(inode);
	ins.offset = file_pos;
	ins.type = BTRFS_EXTENT_DATA_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
	if (ret)
		goto out;
	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_set_file_extent_type(leaf, fi, extent_type);
	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
	btrfs_set_file_extent_offset(leaf, fi, 0);
	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
	btrfs_set_file_extent_compression(leaf, fi, compression);
	btrfs_set_file_extent_encryption(leaf, fi, encryption);
	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);

	btrfs_unlock_up_safe(path, 1);
	btrfs_set_lock_blocking(leaf);

	btrfs_mark_buffer_dirty(leaf);

	inode_add_bytes(inode, num_bytes);

	ins.objectid = disk_bytenr;
	ins.offset = disk_num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_alloc_reserved_file_extent(trans, root,
					root->root_key.objectid,
					btrfs_ino(inode), file_pos, &ins);
out:
	btrfs_free_path(path);

	return ret;
}

/*
 * helper function for btrfs_finish_ordered_io, this
 * just reads in some of the csum leaves to prime them into ram
 * before we start the transaction.  It limits the amount of btree
 * reads required while inside the transaction.
 */
1857 /* as ordered data IO finishes, this gets called so we can finish 1858 * an ordered extent if the range of bytes in the file it covers is 1859 * fully written. 1860 */ 1861 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 1862 { 1863 struct inode *inode = ordered_extent->inode; 1864 struct btrfs_root *root = BTRFS_I(inode)->root; 1865 struct btrfs_trans_handle *trans = NULL; 1866 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1867 struct extent_state *cached_state = NULL; 1868 int compress_type = 0; 1869 int ret; 1870 bool nolock; 1871 1872 nolock = btrfs_is_free_space_inode(inode); 1873 1874 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 1875 ret = -EIO; 1876 goto out; 1877 } 1878 1879 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1880 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 1881 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1882 if (!ret) { 1883 if (nolock) 1884 trans = btrfs_join_transaction_nolock(root); 1885 else 1886 trans = btrfs_join_transaction(root); 1887 if (IS_ERR(trans)) 1888 return PTR_ERR(trans); 1889 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1890 ret = btrfs_update_inode_fallback(trans, root, inode); 1891 if (ret) /* -ENOMEM or corruption */ 1892 btrfs_abort_transaction(trans, root, ret); 1893 } 1894 goto out; 1895 } 1896 1897 lock_extent_bits(io_tree, ordered_extent->file_offset, 1898 ordered_extent->file_offset + ordered_extent->len - 1, 1899 0, &cached_state); 1900 1901 if (nolock) 1902 trans = btrfs_join_transaction_nolock(root); 1903 else 1904 trans = btrfs_join_transaction(root); 1905 if (IS_ERR(trans)) { 1906 ret = PTR_ERR(trans); 1907 trans = NULL; 1908 goto out_unlock; 1909 } 1910 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1911 1912 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1913 compress_type = ordered_extent->compress_type; 1914 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1915 BUG_ON(compress_type); 1916 ret = btrfs_mark_extent_written(trans, inode, 1917 ordered_extent->file_offset, 1918 ordered_extent->file_offset + 1919 ordered_extent->len); 1920 } else { 1921 BUG_ON(root == root->fs_info->tree_root); 1922 ret = insert_reserved_file_extent(trans, inode, 1923 ordered_extent->file_offset, 1924 ordered_extent->start, 1925 ordered_extent->disk_len, 1926 ordered_extent->len, 1927 ordered_extent->len, 1928 compress_type, 0, 0, 1929 BTRFS_FILE_EXTENT_REG); 1930 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 1931 ordered_extent->file_offset, 1932 ordered_extent->len); 1933 } 1934 1935 if (ret < 0) { 1936 btrfs_abort_transaction(trans, root, ret); 1937 goto out_unlock; 1938 } 1939 1940 add_pending_csums(trans, inode, ordered_extent->file_offset, 1941 &ordered_extent->list); 1942 1943 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1944 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1945 ret = btrfs_update_inode_fallback(trans, root, inode); 1946 if (ret) { /* -ENOMEM or corruption */ 1947 btrfs_abort_transaction(trans, root, ret); 1948 goto out_unlock; 1949 } 1950 } 1951 ret = 0; 1952 out_unlock: 1953 unlock_extent_cached(io_tree, ordered_extent->file_offset, 1954 ordered_extent->file_offset + 1955 ordered_extent->len - 1, &cached_state, GFP_NOFS); 1956 out: 1957 if (root != root->fs_info->tree_root) 1958 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1959 if (trans) { 1960 if (nolock) 1961 btrfs_end_transaction_nolock(trans,
root); 1962 else 1963 btrfs_end_transaction(trans, root); 1964 } 1965 1966 if (ret) 1967 clear_extent_uptodate(io_tree, ordered_extent->file_offset, 1968 ordered_extent->file_offset + 1969 ordered_extent->len - 1, NULL, GFP_NOFS); 1970 1971 /* 1972 * This needs to be done to make sure anybody waiting knows we are done 1973 * updating everything for this ordered extent. 1974 */ 1975 btrfs_remove_ordered_extent(inode, ordered_extent); 1976 1977 /* once for us */ 1978 btrfs_put_ordered_extent(ordered_extent); 1979 /* once for the tree */ 1980 btrfs_put_ordered_extent(ordered_extent); 1981 1982 return ret; 1983 } 1984 1985 static void finish_ordered_fn(struct btrfs_work *work) 1986 { 1987 struct btrfs_ordered_extent *ordered_extent; 1988 ordered_extent = container_of(work, struct btrfs_ordered_extent, work); 1989 btrfs_finish_ordered_io(ordered_extent); 1990 } 1991 1992 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1993 struct extent_state *state, int uptodate) 1994 { 1995 struct inode *inode = page->mapping->host; 1996 struct btrfs_root *root = BTRFS_I(inode)->root; 1997 struct btrfs_ordered_extent *ordered_extent = NULL; 1998 struct btrfs_workers *workers; 1999 2000 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2001 2002 ClearPagePrivate2(page); 2003 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 2004 end - start + 1, uptodate)) 2005 return 0; 2006 2007 ordered_extent->work.func = finish_ordered_fn; 2008 ordered_extent->work.flags = 0; 2009 2010 if (btrfs_is_free_space_inode(inode)) 2011 workers = &root->fs_info->endio_freespace_worker; 2012 else 2013 workers = &root->fs_info->endio_write_workers; 2014 btrfs_queue_worker(workers, &ordered_extent->work); 2015 2016 return 0; 2017 } 2018 2019 /* 2020 * when reads are done, we need to check csums to verify the data is correct. 2021 * If there's a match, we allow the bio to finish. If not, the code in 2022 * extent_io.c will try to find good copies for us.
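 *
 * In outline: the expected csum was stashed in the io_tree's private
 * field at submit time; we recompute it over the page bytes with
 * btrfs_csum_data()/btrfs_csum_final() and, on a mismatch, fill the
 * range with 0x1 bytes and return -EIO so another mirror can be tried.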
2023 */ 2024 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 2025 struct extent_state *state, int mirror) 2026 { 2027 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); 2028 struct inode *inode = page->mapping->host; 2029 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2030 char *kaddr; 2031 u64 private = ~(u32)0; 2032 int ret; 2033 struct btrfs_root *root = BTRFS_I(inode)->root; 2034 u32 csum = ~(u32)0; 2035 2036 if (PageChecked(page)) { 2037 ClearPageChecked(page); 2038 goto good; 2039 } 2040 2041 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 2042 goto good; 2043 2044 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 2045 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 2046 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, 2047 GFP_NOFS); 2048 return 0; 2049 } 2050 2051 if (state && state->start == start) { 2052 private = state->private; 2053 ret = 0; 2054 } else { 2055 ret = get_state_private(io_tree, start, &private); 2056 } 2057 kaddr = kmap_atomic(page); 2058 if (ret) 2059 goto zeroit; 2060 2061 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); 2062 btrfs_csum_final(csum, (char *)&csum); 2063 if (csum != private) 2064 goto zeroit; 2065 2066 kunmap_atomic(kaddr); 2067 good: 2068 return 0; 2069 2070 zeroit: 2071 printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u " 2072 "private %llu\n", 2073 (unsigned long long)btrfs_ino(page->mapping->host), 2074 (unsigned long long)start, csum, 2075 (unsigned long long)private); 2076 memset(kaddr + offset, 1, end - start + 1); 2077 flush_dcache_page(page); 2078 kunmap_atomic(kaddr); 2079 if (private == 0) 2080 return 0; 2081 return -EIO; 2082 } 2083 2084 struct delayed_iput { 2085 struct list_head list; 2086 struct inode *inode; 2087 }; 2088 2089 /* JDM: If this is fs-wide, why can't we add a pointer to 2090 * btrfs_inode instead and avoid the allocation? */ 2091 void btrfs_add_delayed_iput(struct inode *inode) 2092 { 2093 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2094 struct delayed_iput *delayed; 2095 2096 if (atomic_add_unless(&inode->i_count, -1, 1)) 2097 return; 2098 2099 delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); 2100 delayed->inode = inode; 2101 2102 spin_lock(&fs_info->delayed_iput_lock); 2103 list_add_tail(&delayed->list, &fs_info->delayed_iputs); 2104 spin_unlock(&fs_info->delayed_iput_lock); 2105 } 2106 2107 void btrfs_run_delayed_iputs(struct btrfs_root *root) 2108 { 2109 LIST_HEAD(list); 2110 struct btrfs_fs_info *fs_info = root->fs_info; 2111 struct delayed_iput *delayed; 2112 int empty; 2113 2114 spin_lock(&fs_info->delayed_iput_lock); 2115 empty = list_empty(&fs_info->delayed_iputs); 2116 spin_unlock(&fs_info->delayed_iput_lock); 2117 if (empty) 2118 return; 2119 2120 down_read(&root->fs_info->cleanup_work_sem); 2121 spin_lock(&fs_info->delayed_iput_lock); 2122 list_splice_init(&fs_info->delayed_iputs, &list); 2123 spin_unlock(&fs_info->delayed_iput_lock); 2124 2125 while (!list_empty(&list)) { 2126 delayed = list_entry(list.next, struct delayed_iput, list); 2127 list_del(&delayed->list); 2128 iput(delayed->inode); 2129 kfree(delayed); 2130 } 2131 up_read(&root->fs_info->cleanup_work_sem); 2132 } 2133 2134 enum btrfs_orphan_cleanup_state { 2135 ORPHAN_CLEANUP_STARTED = 1, 2136 ORPHAN_CLEANUP_DONE = 2, 2137 }; 2138 2139 /* 2140 * This is called in transaction commit time. 
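 * (It can race with btrfs_orphan_add, so both conditions are re-checked
 * under orphan_lock below before the block_rsv is detached.)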
If there are no orphan 2141 * files in the subvolume, it removes the orphan item and frees the 2142 * block_rsv structure. 2143 */ 2144 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2145 struct btrfs_root *root) 2146 { 2147 struct btrfs_block_rsv *block_rsv; 2148 int ret; 2149 2150 if (atomic_read(&root->orphan_inodes) || 2151 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 2152 return; 2153 2154 spin_lock(&root->orphan_lock); 2155 if (atomic_read(&root->orphan_inodes)) { 2156 spin_unlock(&root->orphan_lock); 2157 return; 2158 } 2159 2160 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { 2161 spin_unlock(&root->orphan_lock); 2162 return; 2163 } 2164 2165 block_rsv = root->orphan_block_rsv; 2166 root->orphan_block_rsv = NULL; 2167 spin_unlock(&root->orphan_lock); 2168 2169 if (root->orphan_item_inserted && 2170 btrfs_root_refs(&root->root_item) > 0) { 2171 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 2172 root->root_key.objectid); 2173 BUG_ON(ret); 2174 root->orphan_item_inserted = 0; 2175 } 2176 2177 if (block_rsv) { 2178 WARN_ON(block_rsv->size > 0); 2179 btrfs_free_block_rsv(root, block_rsv); 2180 } 2181 } 2182 2183 /* 2184 * This creates an orphan entry for the given inode in case something goes 2185 * wrong in the middle of an unlink/truncate. 2186 * 2187 * NOTE: caller of this function should reserve 5 units of metadata for 2188 * this function. 2189 */ 2190 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2191 { 2192 struct btrfs_root *root = BTRFS_I(inode)->root; 2193 struct btrfs_block_rsv *block_rsv = NULL; 2194 int reserve = 0; 2195 int insert = 0; 2196 int ret; 2197 2198 if (!root->orphan_block_rsv) { 2199 block_rsv = btrfs_alloc_block_rsv(root); 2200 if (!block_rsv) 2201 return -ENOMEM; 2202 } 2203 2204 spin_lock(&root->orphan_lock); 2205 if (!root->orphan_block_rsv) { 2206 root->orphan_block_rsv = block_rsv; 2207 } else if (block_rsv) { 2208 btrfs_free_block_rsv(root, block_rsv); 2209 block_rsv = NULL; 2210 } 2211 2212 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2213 &BTRFS_I(inode)->runtime_flags)) { 2214 #if 0 2215 /* 2216 * For proper ENOSPC handling, we should do orphan 2217 * cleanup when mounting. But this introduces backward 2218 * compatibility issue. 2219 */ 2220 if (!xchg(&root->orphan_item_inserted, 1)) 2221 insert = 2; 2222 else 2223 insert = 1; 2224 #endif 2225 insert = 1; 2226 atomic_inc(&root->orphan_inodes); 2227 } 2228 2229 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2230 &BTRFS_I(inode)->runtime_flags)) 2231 reserve = 1; 2232 spin_unlock(&root->orphan_lock); 2233 2234 /* grab metadata reservation from transaction handle */ 2235 if (reserve) { 2236 ret = btrfs_orphan_reserve_metadata(trans, inode); 2237 BUG_ON(ret); /* -ENOSPC in reservation; Logic error?
JDM */ 2238 } 2239 2240 /* insert an orphan item to track this unlinked/truncated file */ 2241 if (insert >= 1) { 2242 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2243 if (ret && ret != -EEXIST) { 2244 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2245 &BTRFS_I(inode)->runtime_flags); 2246 btrfs_abort_transaction(trans, root, ret); 2247 return ret; 2248 } 2249 ret = 0; 2250 } 2251 2252 /* insert an orphan item to record that the subvolume contains orphan files */ 2253 if (insert >= 2) { 2254 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, 2255 root->root_key.objectid); 2256 if (ret && ret != -EEXIST) { 2257 btrfs_abort_transaction(trans, root, ret); 2258 return ret; 2259 } 2260 } 2261 return 0; 2262 } 2263 2264 /* 2265 * We have done the truncate/delete so we can go ahead and remove the orphan 2266 * item for this particular inode. 2267 */ 2268 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2269 { 2270 struct btrfs_root *root = BTRFS_I(inode)->root; 2271 int delete_item = 0; 2272 int release_rsv = 0; 2273 int ret = 0; 2274 2275 spin_lock(&root->orphan_lock); 2276 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2277 &BTRFS_I(inode)->runtime_flags)) 2278 delete_item = 1; 2279 2280 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2281 &BTRFS_I(inode)->runtime_flags)) 2282 release_rsv = 1; 2283 spin_unlock(&root->orphan_lock); 2284 2285 if (trans && delete_item) { 2286 ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); 2287 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2288 } 2289 2290 if (release_rsv) { 2291 btrfs_orphan_release_metadata(inode); 2292 atomic_dec(&root->orphan_inodes); 2293 } 2294 2295 return 0; 2296 } 2297 2298 /* 2299 * this cleans up any orphans that may be left on the list from the last use 2300 * of this root. 2301 */ 2302 int btrfs_orphan_cleanup(struct btrfs_root *root) 2303 { 2304 struct btrfs_path *path; 2305 struct extent_buffer *leaf; 2306 struct btrfs_key key, found_key; 2307 struct btrfs_trans_handle *trans; 2308 struct inode *inode; 2309 u64 last_objectid = 0; 2310 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2311 2312 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2313 return 0; 2314 2315 path = btrfs_alloc_path(); 2316 if (!path) { 2317 ret = -ENOMEM; 2318 goto out; 2319 } 2320 path->reada = -1; 2321 2322 key.objectid = BTRFS_ORPHAN_OBJECTID; 2323 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 2324 key.offset = (u64)-1; 2325 2326 while (1) { 2327 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2328 if (ret < 0) 2329 goto out; 2330 2331 /* 2332 * ret == 0 means we found what we were searching for, which 2333 * is weird, but possible; so only screw with the path if we 2334 * didn't find the key, and see if we have stuff that matches 2335 */ 2336 if (ret > 0) { 2337 ret = 0; 2338 if (path->slots[0] == 0) 2339 break; 2340 path->slots[0]--; 2341 } 2342 2343 /* pull out the item */ 2344 leaf = path->nodes[0]; 2345 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2346 2347 /* make sure the item matches what we want */ 2348 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 2349 break; 2350 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) 2351 break; 2352 2353 /* release the path since we're done with it */ 2354 btrfs_release_path(path); 2355 2356 /* 2357 * this is where we are basically btrfs_lookup, without the 2358 * crossing root thing. we store the inode number in the 2359 * offset of the orphan item.
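 * For example (hypothetical inode number): the orphan item for inode 257
 * would be keyed (BTRFS_ORPHAN_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, 257),
 * which is why found_key.offset becomes the objectid we look up below.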
2360 */ 2361 2362 if (found_key.offset == last_objectid) { 2363 printk(KERN_ERR "btrfs: Error removing orphan entry, " 2364 "stopping orphan cleanup\n"); 2365 ret = -EINVAL; 2366 goto out; 2367 } 2368 2369 last_objectid = found_key.offset; 2370 2371 found_key.objectid = found_key.offset; 2372 found_key.type = BTRFS_INODE_ITEM_KEY; 2373 found_key.offset = 0; 2374 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2375 ret = PTR_RET(inode); 2376 if (ret && ret != -ESTALE) 2377 goto out; 2378 2379 if (ret == -ESTALE && root == root->fs_info->tree_root) { 2380 struct btrfs_root *dead_root; 2381 struct btrfs_fs_info *fs_info = root->fs_info; 2382 int is_dead_root = 0; 2383 2384 /* 2385 * this is an orphan in the tree root. Currently these 2386 * could come from 2 sources: 2387 * a) a snapshot deletion in progress 2388 * b) a free space cache inode 2389 * We need to distinguish those two, as the snapshot 2390 * orphan must not get deleted. 2391 * find_dead_roots already ran before us, so if this 2392 * is a snapshot deletion, we should find the root 2393 * in the dead_roots list 2394 */ 2395 spin_lock(&fs_info->trans_lock); 2396 list_for_each_entry(dead_root, &fs_info->dead_roots, 2397 root_list) { 2398 if (dead_root->root_key.objectid == 2399 found_key.objectid) { 2400 is_dead_root = 1; 2401 break; 2402 } 2403 } 2404 spin_unlock(&fs_info->trans_lock); 2405 if (is_dead_root) { 2406 /* prevent this orphan from being found again */ 2407 key.offset = found_key.objectid - 1; 2408 continue; 2409 } 2410 } 2411 /* 2412 * Inode is already gone but the orphan item is still there, 2413 * kill the orphan item. 2414 */ 2415 if (ret == -ESTALE) { 2416 trans = btrfs_start_transaction(root, 1); 2417 if (IS_ERR(trans)) { 2418 ret = PTR_ERR(trans); 2419 goto out; 2420 } 2421 printk(KERN_ERR "auto deleting %Lu\n", 2422 found_key.objectid); 2423 ret = btrfs_del_orphan_item(trans, root, 2424 found_key.objectid); 2425 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2426 btrfs_end_transaction(trans, root); 2427 continue; 2428 } 2429 2430 /* 2431 * add this inode to the orphan list so btrfs_orphan_del does 2432 * the proper thing when we hit it 2433 */ 2434 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2435 &BTRFS_I(inode)->runtime_flags); 2436 2437 /* if we have links, this was a truncate, lets do that */ 2438 if (inode->i_nlink) { 2439 if (!S_ISREG(inode->i_mode)) { 2440 WARN_ON(1); 2441 iput(inode); 2442 continue; 2443 } 2444 nr_truncate++; 2445 ret = btrfs_truncate(inode); 2446 } else { 2447 nr_unlink++; 2448 } 2449 2450 /* this will do delete_inode and everything for us */ 2451 iput(inode); 2452 if (ret) 2453 goto out; 2454 } 2455 /* release the path since we're done with it */ 2456 btrfs_release_path(path); 2457 2458 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2459 2460 if (root->orphan_block_rsv) 2461 btrfs_block_rsv_release(root, root->orphan_block_rsv, 2462 (u64)-1); 2463 2464 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2465 trans = btrfs_join_transaction(root); 2466 if (!IS_ERR(trans)) 2467 btrfs_end_transaction(trans, root); 2468 } 2469 2470 if (nr_unlink) 2471 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2472 if (nr_truncate) 2473 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2474 2475 out: 2476 if (ret) 2477 printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret); 2478 btrfs_free_path(path); 2479 return ret; 2480 } 2481 2482 /* 2483 * very simple check to peek ahead in the leaf looking for xattrs. 
If we 2484 * don't find any xattrs, we know there can't be any acls. 2485 * 2486 * slot is the slot the inode is in, objectid is the objectid of the inode 2487 */ 2488 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 2489 int slot, u64 objectid) 2490 { 2491 u32 nritems = btrfs_header_nritems(leaf); 2492 struct btrfs_key found_key; 2493 int scanned = 0; 2494 2495 slot++; 2496 while (slot < nritems) { 2497 btrfs_item_key_to_cpu(leaf, &found_key, slot); 2498 2499 /* we found a different objectid, there must not be acls */ 2500 if (found_key.objectid != objectid) 2501 return 0; 2502 2503 /* we found an xattr, assume we've got an acl */ 2504 if (found_key.type == BTRFS_XATTR_ITEM_KEY) 2505 return 1; 2506 2507 /* 2508 * we found a key greater than an xattr key, there can't 2509 * be any acls later on 2510 */ 2511 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 2512 return 0; 2513 2514 slot++; 2515 scanned++; 2516 2517 /* 2518 * it goes inode, inode backrefs, xattrs, extents, 2519 * so if there are a ton of hard links to an inode there can 2520 * be a lot of backrefs. Don't waste time searching too hard, 2521 * this is just an optimization 2522 */ 2523 if (scanned >= 8) 2524 break; 2525 } 2526 /* we hit the end of the leaf before we found an xattr or 2527 * something larger than an xattr. We have to assume the inode 2528 * has acls 2529 */ 2530 return 1; 2531 } 2532 2533 /* 2534 * read an inode from the btree into the in-memory inode 2535 */ 2536 static void btrfs_read_locked_inode(struct inode *inode) 2537 { 2538 struct btrfs_path *path; 2539 struct extent_buffer *leaf; 2540 struct btrfs_inode_item *inode_item; 2541 struct btrfs_timespec *tspec; 2542 struct btrfs_root *root = BTRFS_I(inode)->root; 2543 struct btrfs_key location; 2544 int maybe_acls; 2545 u32 rdev; 2546 int ret; 2547 bool filled = false; 2548 2549 ret = btrfs_fill_inode(inode, &rdev); 2550 if (!ret) 2551 filled = true; 2552 2553 path = btrfs_alloc_path(); 2554 if (!path) 2555 goto make_bad; 2556 2557 path->leave_spinning = 1; 2558 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2559 2560 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 2561 if (ret) 2562 goto make_bad; 2563 2564 leaf = path->nodes[0]; 2565 2566 if (filled) 2567 goto cache_acl; 2568 2569 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2570 struct btrfs_inode_item); 2571 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2572 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 2573 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2574 inode->i_gid = btrfs_inode_gid(leaf, inode_item); 2575 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 2576 2577 tspec = btrfs_inode_atime(inode_item); 2578 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2579 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2580 2581 tspec = btrfs_inode_mtime(inode_item); 2582 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2583 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2584 2585 tspec = btrfs_inode_ctime(inode_item); 2586 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2587 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2588 2589 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2590 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2591 inode->i_version = btrfs_inode_sequence(leaf, inode_item); 2592 inode->i_generation = BTRFS_I(inode)->generation; 2593 inode->i_rdev = 0; 2594 rdev = btrfs_inode_rdev(leaf, inode_item); 2595 2596 
BTRFS_I(inode)->index_cnt = (u64)-1; 2597 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2598 cache_acl: 2599 /* 2600 * try to precache a NULL acl entry for files that don't have 2601 * any xattrs or acls 2602 */ 2603 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 2604 btrfs_ino(inode)); 2605 if (!maybe_acls) 2606 cache_no_acl(inode); 2607 2608 btrfs_free_path(path); 2609 2610 switch (inode->i_mode & S_IFMT) { 2611 case S_IFREG: 2612 inode->i_mapping->a_ops = &btrfs_aops; 2613 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2614 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 2615 inode->i_fop = &btrfs_file_operations; 2616 inode->i_op = &btrfs_file_inode_operations; 2617 break; 2618 case S_IFDIR: 2619 inode->i_fop = &btrfs_dir_file_operations; 2620 if (root == root->fs_info->tree_root) 2621 inode->i_op = &btrfs_dir_ro_inode_operations; 2622 else 2623 inode->i_op = &btrfs_dir_inode_operations; 2624 break; 2625 case S_IFLNK: 2626 inode->i_op = &btrfs_symlink_inode_operations; 2627 inode->i_mapping->a_ops = &btrfs_symlink_aops; 2628 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2629 break; 2630 default: 2631 inode->i_op = &btrfs_special_inode_operations; 2632 init_special_inode(inode, inode->i_mode, rdev); 2633 break; 2634 } 2635 2636 btrfs_update_iflags(inode); 2637 return; 2638 2639 make_bad: 2640 btrfs_free_path(path); 2641 make_bad_inode(inode); 2642 } 2643 2644 /* 2645 * given a leaf and an inode, copy the inode fields into the leaf 2646 */ 2647 static void fill_inode_item(struct btrfs_trans_handle *trans, 2648 struct extent_buffer *leaf, 2649 struct btrfs_inode_item *item, 2650 struct inode *inode) 2651 { 2652 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2653 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2654 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2655 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2656 btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 2657 2658 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), 2659 inode->i_atime.tv_sec); 2660 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), 2661 inode->i_atime.tv_nsec); 2662 2663 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), 2664 inode->i_mtime.tv_sec); 2665 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), 2666 inode->i_mtime.tv_nsec); 2667 2668 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), 2669 inode->i_ctime.tv_sec); 2670 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), 2671 inode->i_ctime.tv_nsec); 2672 2673 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2674 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 2675 btrfs_set_inode_sequence(leaf, item, inode->i_version); 2676 btrfs_set_inode_transid(leaf, item, trans->transid); 2677 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2678 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2679 btrfs_set_inode_block_group(leaf, item, 0); 2680 } 2681 2682 /* 2683 * copy everything in the in-memory inode into the btree. 
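 * This is the synchronous variant; btrfs_update_inode() below sends most
 * inodes through the delayed-items code instead and only falls back to
 * this direct item update.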
2684 */ 2685 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, 2686 struct btrfs_root *root, struct inode *inode) 2687 { 2688 struct btrfs_inode_item *inode_item; 2689 struct btrfs_path *path; 2690 struct extent_buffer *leaf; 2691 int ret; 2692 2693 path = btrfs_alloc_path(); 2694 if (!path) 2695 return -ENOMEM; 2696 2697 path->leave_spinning = 1; 2698 ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 2699 1); 2700 if (ret) { 2701 if (ret > 0) 2702 ret = -ENOENT; 2703 goto failed; 2704 } 2705 2706 btrfs_unlock_up_safe(path, 1); 2707 leaf = path->nodes[0]; 2708 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2709 struct btrfs_inode_item); 2710 2711 fill_inode_item(trans, leaf, inode_item, inode); 2712 btrfs_mark_buffer_dirty(leaf); 2713 btrfs_set_inode_last_trans(trans, inode); 2714 ret = 0; 2715 failed: 2716 btrfs_free_path(path); 2717 return ret; 2718 } 2719 2720 /* 2721 * copy everything in the in-memory inode into the btree. 2722 */ 2723 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2724 struct btrfs_root *root, struct inode *inode) 2725 { 2726 int ret; 2727 2728 /* 2729 * If the inode is a free space inode, we can deadlock during commit 2730 * if we put it into the delayed code. 2731 * 2732 * The data relocation inode should also be directly updated 2733 * without delay 2734 */ 2735 if (!btrfs_is_free_space_inode(inode) 2736 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 2737 btrfs_update_root_times(trans, root); 2738 2739 ret = btrfs_delayed_update_inode(trans, root, inode); 2740 if (!ret) 2741 btrfs_set_inode_last_trans(trans, inode); 2742 return ret; 2743 } 2744 2745 return btrfs_update_inode_item(trans, root, inode); 2746 } 2747 2748 static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 2749 struct btrfs_root *root, struct inode *inode) 2750 { 2751 int ret; 2752 2753 ret = btrfs_update_inode(trans, root, inode); 2754 if (ret == -ENOSPC) 2755 return btrfs_update_inode_item(trans, root, inode); 2756 return ret; 2757 } 2758 2759 /* 2760 * unlink helper that gets used here in inode.c and in the tree logging 2761 * recovery code. 
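 * (Note the directory size bookkeeping near the end: btrfs counts each
 * name twice in a directory's i_size, once for the dir item and once for
 * the dir index, hence the "name_len * 2" below.)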
It removes a link in a directory with a given name, and 2762 * also drops the back refs in the inode to the directory 2763 */ 2764 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2765 struct btrfs_root *root, 2766 struct inode *dir, struct inode *inode, 2767 const char *name, int name_len) 2768 { 2769 struct btrfs_path *path; 2770 int ret = 0; 2771 struct extent_buffer *leaf; 2772 struct btrfs_dir_item *di; 2773 struct btrfs_key key; 2774 u64 index; 2775 u64 ino = btrfs_ino(inode); 2776 u64 dir_ino = btrfs_ino(dir); 2777 2778 path = btrfs_alloc_path(); 2779 if (!path) { 2780 ret = -ENOMEM; 2781 goto out; 2782 } 2783 2784 path->leave_spinning = 1; 2785 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 2786 name, name_len, -1); 2787 if (IS_ERR(di)) { 2788 ret = PTR_ERR(di); 2789 goto err; 2790 } 2791 if (!di) { 2792 ret = -ENOENT; 2793 goto err; 2794 } 2795 leaf = path->nodes[0]; 2796 btrfs_dir_item_key_to_cpu(leaf, di, &key); 2797 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2798 if (ret) 2799 goto err; 2800 btrfs_release_path(path); 2801 2802 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, 2803 dir_ino, &index); 2804 if (ret) { 2805 printk(KERN_INFO "btrfs failed to delete reference to %.*s, " 2806 "inode %llu parent %llu\n", name_len, name, 2807 (unsigned long long)ino, (unsigned long long)dir_ino); 2808 btrfs_abort_transaction(trans, root, ret); 2809 goto err; 2810 } 2811 2812 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 2813 if (ret) { 2814 btrfs_abort_transaction(trans, root, ret); 2815 goto err; 2816 } 2817 2818 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2819 inode, dir_ino); 2820 if (ret != 0 && ret != -ENOENT) { 2821 btrfs_abort_transaction(trans, root, ret); 2822 goto err; 2823 } 2824 2825 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2826 dir, index); 2827 if (ret == -ENOENT) 2828 ret = 0; 2829 err: 2830 btrfs_free_path(path); 2831 if (ret) 2832 goto out; 2833 2834 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2835 inode_inc_iversion(inode); 2836 inode_inc_iversion(dir); 2837 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2838 ret = btrfs_update_inode(trans, root, dir); 2839 out: 2840 return ret; 2841 } 2842 2843 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2844 struct btrfs_root *root, 2845 struct inode *dir, struct inode *inode, 2846 const char *name, int name_len) 2847 { 2848 int ret; 2849 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 2850 if (!ret) { 2851 btrfs_drop_nlink(inode); 2852 ret = btrfs_update_inode(trans, root, inode); 2853 } 2854 return ret; 2855 } 2856 2857 2858 /* helper to check if there is any shared block in the path */ 2859 static int check_path_shared(struct btrfs_root *root, 2860 struct btrfs_path *path) 2861 { 2862 struct extent_buffer *eb; 2863 int level; 2864 u64 refs = 1; 2865 2866 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2867 int ret; 2868 2869 if (!path->nodes[level]) 2870 break; 2871 eb = path->nodes[level]; 2872 if (!btrfs_block_can_be_shared(root, eb)) 2873 continue; 2874 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, 2875 &refs, NULL); 2876 if (refs > 1) 2877 return 1; 2878 } 2879 return 0; 2880 } 2881 2882 /* 2883 * helper to start transaction for unlink and rmdir. 2884 * 2885 * unlink and rmdir are special in btrfs: they do not always free space,
2886 * so in enospc case, we should make sure they will free space before 2887 * allowing them to use the global metadata reservation. 2888 */ 2889 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, 2890 struct dentry *dentry) 2891 { 2892 struct btrfs_trans_handle *trans; 2893 struct btrfs_root *root = BTRFS_I(dir)->root; 2894 struct btrfs_path *path; 2895 struct btrfs_inode_ref *ref; 2896 struct btrfs_dir_item *di; 2897 struct inode *inode = dentry->d_inode; 2898 u64 index; 2899 int check_link = 1; 2900 int err = -ENOSPC; 2901 int ret; 2902 u64 ino = btrfs_ino(inode); 2903 u64 dir_ino = btrfs_ino(dir); 2904 2905 /* 2906 * 1 for the possible orphan item 2907 * 1 for the dir item 2908 * 1 for the dir index 2909 * 1 for the inode ref 2910 * 1 for the inode ref in the tree log 2911 * 2 for the dir entries in the log 2912 * 1 for the inode 2913 */ 2914 trans = btrfs_start_transaction(root, 8); 2915 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2916 return trans; 2917 2918 if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 2919 return ERR_PTR(-ENOSPC); 2920 2921 /* check if there is someone else holds reference */ 2922 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) 2923 return ERR_PTR(-ENOSPC); 2924 2925 if (atomic_read(&inode->i_count) > 2) 2926 return ERR_PTR(-ENOSPC); 2927 2928 if (xchg(&root->fs_info->enospc_unlink, 1)) 2929 return ERR_PTR(-ENOSPC); 2930 2931 path = btrfs_alloc_path(); 2932 if (!path) { 2933 root->fs_info->enospc_unlink = 0; 2934 return ERR_PTR(-ENOMEM); 2935 } 2936 2937 /* 1 for the orphan item */ 2938 trans = btrfs_start_transaction(root, 1); 2939 if (IS_ERR(trans)) { 2940 btrfs_free_path(path); 2941 root->fs_info->enospc_unlink = 0; 2942 return trans; 2943 } 2944 2945 path->skip_locking = 1; 2946 path->search_commit_root = 1; 2947 2948 ret = btrfs_lookup_inode(trans, root, path, 2949 &BTRFS_I(dir)->location, 0); 2950 if (ret < 0) { 2951 err = ret; 2952 goto out; 2953 } 2954 if (ret == 0) { 2955 if (check_path_shared(root, path)) 2956 goto out; 2957 } else { 2958 check_link = 0; 2959 } 2960 btrfs_release_path(path); 2961 2962 ret = btrfs_lookup_inode(trans, root, path, 2963 &BTRFS_I(inode)->location, 0); 2964 if (ret < 0) { 2965 err = ret; 2966 goto out; 2967 } 2968 if (ret == 0) { 2969 if (check_path_shared(root, path)) 2970 goto out; 2971 } else { 2972 check_link = 0; 2973 } 2974 btrfs_release_path(path); 2975 2976 if (ret == 0 && S_ISREG(inode->i_mode)) { 2977 ret = btrfs_lookup_file_extent(trans, root, path, 2978 ino, (u64)-1, 0); 2979 if (ret < 0) { 2980 err = ret; 2981 goto out; 2982 } 2983 BUG_ON(ret == 0); /* Corruption */ 2984 if (check_path_shared(root, path)) 2985 goto out; 2986 btrfs_release_path(path); 2987 } 2988 2989 if (!check_link) { 2990 err = 0; 2991 goto out; 2992 } 2993 2994 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 2995 dentry->d_name.name, dentry->d_name.len, 0); 2996 if (IS_ERR(di)) { 2997 err = PTR_ERR(di); 2998 goto out; 2999 } 3000 if (di) { 3001 if (check_path_shared(root, path)) 3002 goto out; 3003 } else { 3004 err = 0; 3005 goto out; 3006 } 3007 btrfs_release_path(path); 3008 3009 ref = btrfs_lookup_inode_ref(trans, root, path, 3010 dentry->d_name.name, dentry->d_name.len, 3011 ino, dir_ino, 0); 3012 if (IS_ERR(ref)) { 3013 err = PTR_ERR(ref); 3014 goto out; 3015 } 3016 BUG_ON(!ref); /* Logic error */ 3017 if (check_path_shared(root, path)) 3018 goto out; 3019 index = btrfs_inode_ref_index(path->nodes[0], ref); 3020 btrfs_release_path(path); 3021 3022 /* 3023 * This is a commit root search, if 
we can lookup inode item and other 3024 * relative items in the commit root, it means the transaction of 3025 * dir/file creation has been committed, and the dir index item that we 3026 * delay to insert has also been inserted into the commit root. So 3027 * we needn't worry about the delayed insertion of the dir index item 3028 * here. 3029 */ 3030 di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index, 3031 dentry->d_name.name, dentry->d_name.len, 0); 3032 if (IS_ERR(di)) { 3033 err = PTR_ERR(di); 3034 goto out; 3035 } 3036 BUG_ON(ret == -ENOENT); 3037 if (check_path_shared(root, path)) 3038 goto out; 3039 3040 err = 0; 3041 out: 3042 btrfs_free_path(path); 3043 /* Migrate the orphan reservation over */ 3044 if (!err) 3045 err = btrfs_block_rsv_migrate(trans->block_rsv, 3046 &root->fs_info->global_block_rsv, 3047 trans->bytes_reserved); 3048 3049 if (err) { 3050 btrfs_end_transaction(trans, root); 3051 root->fs_info->enospc_unlink = 0; 3052 return ERR_PTR(err); 3053 } 3054 3055 trans->block_rsv = &root->fs_info->global_block_rsv; 3056 return trans; 3057 } 3058 3059 static void __unlink_end_trans(struct btrfs_trans_handle *trans, 3060 struct btrfs_root *root) 3061 { 3062 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 3063 btrfs_block_rsv_release(root, trans->block_rsv, 3064 trans->bytes_reserved); 3065 trans->block_rsv = &root->fs_info->trans_block_rsv; 3066 BUG_ON(!root->fs_info->enospc_unlink); 3067 root->fs_info->enospc_unlink = 0; 3068 } 3069 btrfs_end_transaction(trans, root); 3070 } 3071 3072 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 3073 { 3074 struct btrfs_root *root = BTRFS_I(dir)->root; 3075 struct btrfs_trans_handle *trans; 3076 struct inode *inode = dentry->d_inode; 3077 int ret; 3078 unsigned long nr = 0; 3079 3080 trans = __unlink_start_trans(dir, dentry); 3081 if (IS_ERR(trans)) 3082 return PTR_ERR(trans); 3083 3084 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 3085 3086 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3087 dentry->d_name.name, dentry->d_name.len); 3088 if (ret) 3089 goto out; 3090 3091 if (inode->i_nlink == 0) { 3092 ret = btrfs_orphan_add(trans, inode); 3093 if (ret) 3094 goto out; 3095 } 3096 3097 out: 3098 nr = trans->blocks_used; 3099 __unlink_end_trans(trans, root); 3100 btrfs_btree_balance_dirty(root, nr); 3101 return ret; 3102 } 3103 3104 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 3105 struct btrfs_root *root, 3106 struct inode *dir, u64 objectid, 3107 const char *name, int name_len) 3108 { 3109 struct btrfs_path *path; 3110 struct extent_buffer *leaf; 3111 struct btrfs_dir_item *di; 3112 struct btrfs_key key; 3113 u64 index; 3114 int ret; 3115 u64 dir_ino = btrfs_ino(dir); 3116 3117 path = btrfs_alloc_path(); 3118 if (!path) 3119 return -ENOMEM; 3120 3121 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 3122 name, name_len, -1); 3123 if (IS_ERR_OR_NULL(di)) { 3124 if (!di) 3125 ret = -ENOENT; 3126 else 3127 ret = PTR_ERR(di); 3128 goto out; 3129 } 3130 3131 leaf = path->nodes[0]; 3132 btrfs_dir_item_key_to_cpu(leaf, di, &key); 3133 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 3134 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3135 if (ret) { 3136 btrfs_abort_transaction(trans, root, ret); 3137 goto out; 3138 } 3139 btrfs_release_path(path); 3140 3141 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, 3142 objectid, root->root_key.objectid, 3143 dir_ino, &index, name, name_len); 3144 if (ret < 0) { 3145 if (ret != 
-ENOENT) { 3146 btrfs_abort_transaction(trans, root, ret); 3147 goto out; 3148 } 3149 di = btrfs_search_dir_index_item(root, path, dir_ino, 3150 name, name_len); 3151 if (IS_ERR_OR_NULL(di)) { 3152 if (!di) 3153 ret = -ENOENT; 3154 else 3155 ret = PTR_ERR(di); 3156 btrfs_abort_transaction(trans, root, ret); 3157 goto out; 3158 } 3159 3160 leaf = path->nodes[0]; 3161 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3162 btrfs_release_path(path); 3163 index = key.offset; 3164 } 3165 btrfs_release_path(path); 3166 3167 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 3168 if (ret) { 3169 btrfs_abort_transaction(trans, root, ret); 3170 goto out; 3171 } 3172 3173 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3174 inode_inc_iversion(dir); 3175 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3176 ret = btrfs_update_inode(trans, root, dir); 3177 if (ret) 3178 btrfs_abort_transaction(trans, root, ret); 3179 out: 3180 btrfs_free_path(path); 3181 return ret; 3182 } 3183 3184 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 3185 { 3186 struct inode *inode = dentry->d_inode; 3187 int err = 0; 3188 struct btrfs_root *root = BTRFS_I(dir)->root; 3189 struct btrfs_trans_handle *trans; 3190 unsigned long nr = 0; 3191 3192 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 3193 btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) 3194 return -ENOTEMPTY; 3195 3196 trans = __unlink_start_trans(dir, dentry); 3197 if (IS_ERR(trans)) 3198 return PTR_ERR(trans); 3199 3200 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 3201 err = btrfs_unlink_subvol(trans, root, dir, 3202 BTRFS_I(inode)->location.objectid, 3203 dentry->d_name.name, 3204 dentry->d_name.len); 3205 goto out; 3206 } 3207 3208 err = btrfs_orphan_add(trans, inode); 3209 if (err) 3210 goto out; 3211 3212 /* now the directory is empty */ 3213 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3214 dentry->d_name.name, dentry->d_name.len); 3215 if (!err) 3216 btrfs_i_size_write(inode, 0); 3217 out: 3218 nr = trans->blocks_used; 3219 __unlink_end_trans(trans, root); 3220 btrfs_btree_balance_dirty(root, nr); 3221 3222 return err; 3223 } 3224 3225 /* 3226 * this can truncate away extent items, csum items and directory items. 3227 * It starts at a high offset and removes keys until it can't find 3228 * any higher than new_size 3229 * 3230 * csum items that cross the new i_size are truncated to the new size 3231 * as well. 3232 * 3233 * min_type is the minimum key type to truncate down to. If set to 0, this 3234 * will kill all the items on this inode, including the INODE_ITEM_KEY. 
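 *
 * For example, inode eviction in this file effectively does
 *
 *	btrfs_truncate_inode_items(trans, root, inode, 0, 0);
 *
 * to drop every item, while a regular truncate passes the new size and
 * BTRFS_EXTENT_DATA_KEY so the inode item itself survives (see the
 * BUG_ON below).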
3235 */ 3236 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 3237 struct btrfs_root *root, 3238 struct inode *inode, 3239 u64 new_size, u32 min_type) 3240 { 3241 struct btrfs_path *path; 3242 struct extent_buffer *leaf; 3243 struct btrfs_file_extent_item *fi; 3244 struct btrfs_key key; 3245 struct btrfs_key found_key; 3246 u64 extent_start = 0; 3247 u64 extent_num_bytes = 0; 3248 u64 extent_offset = 0; 3249 u64 item_end = 0; 3250 u64 mask = root->sectorsize - 1; 3251 u32 found_type = (u8)-1; 3252 int found_extent; 3253 int del_item; 3254 int pending_del_nr = 0; 3255 int pending_del_slot = 0; 3256 int extent_type = -1; 3257 int ret; 3258 int err = 0; 3259 u64 ino = btrfs_ino(inode); 3260 3261 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3262 3263 path = btrfs_alloc_path(); 3264 if (!path) 3265 return -ENOMEM; 3266 path->reada = -1; 3267 3268 if (root->ref_cows || root == root->fs_info->tree_root) 3269 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3270 3271 /* 3272 * This function is also used to drop the items in the log tree before 3273 * we relog the inode, so if root != BTRFS_I(inode)->root, it means 3274 * it is used to drop the logged items. So we shouldn't kill the delayed 3275 * items. 3276 */ 3277 if (min_type == 0 && root == BTRFS_I(inode)->root) 3278 btrfs_kill_delayed_inode_items(inode); 3279 3280 key.objectid = ino; 3281 key.offset = (u64)-1; 3282 key.type = (u8)-1; 3283 3284 search_again: 3285 path->leave_spinning = 1; 3286 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3287 if (ret < 0) { 3288 err = ret; 3289 goto out; 3290 } 3291 3292 if (ret > 0) { 3293 /* there are no items in the tree for us to truncate, we're 3294 * done 3295 */ 3296 if (path->slots[0] == 0) 3297 goto out; 3298 path->slots[0]--; 3299 } 3300 3301 while (1) { 3302 fi = NULL; 3303 leaf = path->nodes[0]; 3304 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3305 found_type = btrfs_key_type(&found_key); 3306 3307 if (found_key.objectid != ino) 3308 break; 3309 3310 if (found_type < min_type) 3311 break; 3312 3313 item_end = found_key.offset; 3314 if (found_type == BTRFS_EXTENT_DATA_KEY) { 3315 fi = btrfs_item_ptr(leaf, path->slots[0], 3316 struct btrfs_file_extent_item); 3317 extent_type = btrfs_file_extent_type(leaf, fi); 3318 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3319 item_end += 3320 btrfs_file_extent_num_bytes(leaf, fi); 3321 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3322 item_end += btrfs_file_extent_inline_len(leaf, 3323 fi); 3324 } 3325 item_end--; 3326 } 3327 if (found_type > min_type) { 3328 del_item = 1; 3329 } else { 3330 if (item_end < new_size) 3331 break; 3332 if (found_key.offset >= new_size) 3333 del_item = 1; 3334 else 3335 del_item = 0; 3336 } 3337 found_extent = 0; 3338 /* FIXME, shrink the extent if the ref count is only 1 */ 3339 if (found_type != BTRFS_EXTENT_DATA_KEY) 3340 goto delete; 3341 3342 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3343 u64 num_dec; 3344 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 3345 if (!del_item) { 3346 u64 orig_num_bytes = 3347 btrfs_file_extent_num_bytes(leaf, fi); 3348 extent_num_bytes = new_size - 3349 found_key.offset + root->sectorsize - 1; 3350 extent_num_bytes = extent_num_bytes & 3351 ~((u64)root->sectorsize - 1); 3352 btrfs_set_file_extent_num_bytes(leaf, fi, 3353 extent_num_bytes); 3354 num_dec = (orig_num_bytes - 3355 extent_num_bytes); 3356 if (root->ref_cows && extent_start != 0) 3357 inode_sub_bytes(inode, num_dec); 3358
btrfs_mark_buffer_dirty(leaf); 3359 } else { 3360 extent_num_bytes = 3361 btrfs_file_extent_disk_num_bytes(leaf, 3362 fi); 3363 extent_offset = found_key.offset - 3364 btrfs_file_extent_offset(leaf, fi); 3365 3366 /* FIXME blocksize != 4096 */ 3367 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 3368 if (extent_start != 0) { 3369 found_extent = 1; 3370 if (root->ref_cows) 3371 inode_sub_bytes(inode, num_dec); 3372 } 3373 } 3374 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3375 /* 3376 * we can't truncate inline items that have had 3377 * special encodings 3378 */ 3379 if (!del_item && 3380 btrfs_file_extent_compression(leaf, fi) == 0 && 3381 btrfs_file_extent_encryption(leaf, fi) == 0 && 3382 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 3383 u32 size = new_size - found_key.offset; 3384 3385 if (root->ref_cows) { 3386 inode_sub_bytes(inode, item_end + 1 - 3387 new_size); 3388 } 3389 size = 3390 btrfs_file_extent_calc_inline_size(size); 3391 btrfs_truncate_item(trans, root, path, 3392 size, 1); 3393 } else if (root->ref_cows) { 3394 inode_sub_bytes(inode, item_end + 1 - 3395 found_key.offset); 3396 } 3397 } 3398 delete: 3399 if (del_item) { 3400 if (!pending_del_nr) { 3401 /* no pending yet, add ourselves */ 3402 pending_del_slot = path->slots[0]; 3403 pending_del_nr = 1; 3404 } else if (pending_del_nr && 3405 path->slots[0] + 1 == pending_del_slot) { 3406 /* hop on the pending chunk */ 3407 pending_del_nr++; 3408 pending_del_slot = path->slots[0]; 3409 } else { 3410 BUG(); 3411 } 3412 } else { 3413 break; 3414 } 3415 if (found_extent && (root->ref_cows || 3416 root == root->fs_info->tree_root)) { 3417 btrfs_set_path_blocking(path); 3418 ret = btrfs_free_extent(trans, root, extent_start, 3419 extent_num_bytes, 0, 3420 btrfs_header_owner(leaf), 3421 ino, extent_offset, 0); 3422 BUG_ON(ret); 3423 } 3424 3425 if (found_type == BTRFS_INODE_ITEM_KEY) 3426 break; 3427 3428 if (path->slots[0] == 0 || 3429 path->slots[0] != pending_del_slot) { 3430 if (root->ref_cows && 3431 BTRFS_I(inode)->location.objectid != 3432 BTRFS_FREE_INO_OBJECTID) { 3433 err = -EAGAIN; 3434 goto out; 3435 } 3436 if (pending_del_nr) { 3437 ret = btrfs_del_items(trans, root, path, 3438 pending_del_slot, 3439 pending_del_nr); 3440 if (ret) { 3441 btrfs_abort_transaction(trans, 3442 root, ret); 3443 goto error; 3444 } 3445 pending_del_nr = 0; 3446 } 3447 btrfs_release_path(path); 3448 goto search_again; 3449 } else { 3450 path->slots[0]--; 3451 } 3452 } 3453 out: 3454 if (pending_del_nr) { 3455 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3456 pending_del_nr); 3457 if (ret) 3458 btrfs_abort_transaction(trans, root, ret); 3459 } 3460 error: 3461 btrfs_free_path(path); 3462 return err; 3463 } 3464 3465 /* 3466 * taken from block_truncate_page, but does cow as it zeros out 3467 * any bytes left in the last page in the file. 
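 *
 * In outline: reserve delalloc space for the page, bring it uptodate,
 * wait out any ordered extent covering it, mark the range delalloc and
 * then zero from 'from' to the end of the page before dirtying it.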
3468 */ 3469 static int btrfs_truncate_page(struct address_space *mapping, loff_t from) 3470 { 3471 struct inode *inode = mapping->host; 3472 struct btrfs_root *root = BTRFS_I(inode)->root; 3473 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3474 struct btrfs_ordered_extent *ordered; 3475 struct extent_state *cached_state = NULL; 3476 char *kaddr; 3477 u32 blocksize = root->sectorsize; 3478 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3479 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3480 struct page *page; 3481 gfp_t mask = btrfs_alloc_write_mask(mapping); 3482 int ret = 0; 3483 u64 page_start; 3484 u64 page_end; 3485 3486 if ((offset & (blocksize - 1)) == 0) 3487 goto out; 3488 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 3489 if (ret) 3490 goto out; 3491 3492 ret = -ENOMEM; 3493 again: 3494 page = find_or_create_page(mapping, index, mask); 3495 if (!page) { 3496 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3497 goto out; 3498 } 3499 3500 page_start = page_offset(page); 3501 page_end = page_start + PAGE_CACHE_SIZE - 1; 3502 3503 if (!PageUptodate(page)) { 3504 ret = btrfs_readpage(NULL, page); 3505 lock_page(page); 3506 if (page->mapping != mapping) { 3507 unlock_page(page); 3508 page_cache_release(page); 3509 goto again; 3510 } 3511 if (!PageUptodate(page)) { 3512 ret = -EIO; 3513 goto out_unlock; 3514 } 3515 } 3516 wait_on_page_writeback(page); 3517 3518 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); 3519 set_page_extent_mapped(page); 3520 3521 ordered = btrfs_lookup_ordered_extent(inode, page_start); 3522 if (ordered) { 3523 unlock_extent_cached(io_tree, page_start, page_end, 3524 &cached_state, GFP_NOFS); 3525 unlock_page(page); 3526 page_cache_release(page); 3527 btrfs_start_ordered_extent(inode, ordered, 1); 3528 btrfs_put_ordered_extent(ordered); 3529 goto again; 3530 } 3531 3532 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 3533 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3534 0, 0, &cached_state, GFP_NOFS); 3535 3536 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 3537 &cached_state); 3538 if (ret) { 3539 unlock_extent_cached(io_tree, page_start, page_end, 3540 &cached_state, GFP_NOFS); 3541 goto out_unlock; 3542 } 3543 3544 ret = 0; 3545 if (offset != PAGE_CACHE_SIZE) { 3546 kaddr = kmap(page); 3547 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 3548 flush_dcache_page(page); 3549 kunmap(page); 3550 } 3551 ClearPageChecked(page); 3552 set_page_dirty(page); 3553 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, 3554 GFP_NOFS); 3555 3556 out_unlock: 3557 if (ret) 3558 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3559 unlock_page(page); 3560 page_cache_release(page); 3561 out: 3562 return ret; 3563 } 3564 3565 /* 3566 * This function puts in dummy file extents for the area we're creating a hole 3567 * for. 
So if we are truncating this file to a larger size we need to insert 3568 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for 3569 * the range between oldsize and size 3570 */ 3571 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 3572 { 3573 struct btrfs_trans_handle *trans; 3574 struct btrfs_root *root = BTRFS_I(inode)->root; 3575 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3576 struct extent_map *em = NULL; 3577 struct extent_state *cached_state = NULL; 3578 u64 mask = root->sectorsize - 1; 3579 u64 hole_start = (oldsize + mask) & ~mask; 3580 u64 block_end = (size + mask) & ~mask; 3581 u64 last_byte; 3582 u64 cur_offset; 3583 u64 hole_size; 3584 int err = 0; 3585 3586 if (size <= hole_start) 3587 return 0; 3588 3589 while (1) { 3590 struct btrfs_ordered_extent *ordered; 3591 btrfs_wait_ordered_range(inode, hole_start, 3592 block_end - hole_start); 3593 lock_extent_bits(io_tree, hole_start, block_end - 1, 0, 3594 &cached_state); 3595 ordered = btrfs_lookup_ordered_extent(inode, hole_start); 3596 if (!ordered) 3597 break; 3598 unlock_extent_cached(io_tree, hole_start, block_end - 1, 3599 &cached_state, GFP_NOFS); 3600 btrfs_put_ordered_extent(ordered); 3601 } 3602 3603 cur_offset = hole_start; 3604 while (1) { 3605 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 3606 block_end - cur_offset, 0); 3607 if (IS_ERR(em)) { 3608 err = PTR_ERR(em); 3609 break; 3610 } 3611 last_byte = min(extent_map_end(em), block_end); 3612 last_byte = (last_byte + mask) & ~mask; 3613 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3614 u64 hint_byte = 0; 3615 hole_size = last_byte - cur_offset; 3616 3617 trans = btrfs_start_transaction(root, 3); 3618 if (IS_ERR(trans)) { 3619 err = PTR_ERR(trans); 3620 break; 3621 } 3622 3623 err = btrfs_drop_extents(trans, inode, cur_offset, 3624 cur_offset + hole_size, 3625 &hint_byte, 1); 3626 if (err) { 3627 btrfs_abort_transaction(trans, root, err); 3628 btrfs_end_transaction(trans, root); 3629 break; 3630 } 3631 3632 err = btrfs_insert_file_extent(trans, root, 3633 btrfs_ino(inode), cur_offset, 0, 3634 0, hole_size, 0, hole_size, 3635 0, 0, 0); 3636 if (err) { 3637 btrfs_abort_transaction(trans, root, err); 3638 btrfs_end_transaction(trans, root); 3639 break; 3640 } 3641 3642 btrfs_drop_extent_cache(inode, hole_start, 3643 last_byte - 1, 0); 3644 3645 btrfs_update_inode(trans, root, inode); 3646 btrfs_end_transaction(trans, root); 3647 } 3648 free_extent_map(em); 3649 em = NULL; 3650 cur_offset = last_byte; 3651 if (cur_offset >= block_end) 3652 break; 3653 } 3654 3655 free_extent_map(em); 3656 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3657 GFP_NOFS); 3658 return err; 3659 } 3660 3661 static int btrfs_setsize(struct inode *inode, loff_t newsize) 3662 { 3663 struct btrfs_root *root = BTRFS_I(inode)->root; 3664 struct btrfs_trans_handle *trans; 3665 loff_t oldsize = i_size_read(inode); 3666 int ret; 3667 3668 if (newsize == oldsize) 3669 return 0; 3670 3671 if (newsize > oldsize) { 3672 truncate_pagecache(inode, oldsize, newsize); 3673 ret = btrfs_cont_expand(inode, oldsize, newsize); 3674 if (ret) 3675 return ret; 3676 3677 trans = btrfs_start_transaction(root, 1); 3678 if (IS_ERR(trans)) 3679 return PTR_ERR(trans); 3680 3681 i_size_write(inode, newsize); 3682 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 3683 ret = btrfs_update_inode(trans, root, inode); 3684 btrfs_end_transaction(trans, root); 3685 } else { 3686 3687 /* 3688 * We're truncating a file that 
used to have good data down to 3689 * zero. Make sure it gets into the ordered flush list so that 3690 * any new writes get down to disk quickly. 3691 */ 3692 if (newsize == 0) 3693 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 3694 &BTRFS_I(inode)->runtime_flags); 3695 3696 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 3697 truncate_setsize(inode, newsize); 3698 ret = btrfs_truncate(inode); 3699 } 3700 3701 return ret; 3702 } 3703 3704 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3705 { 3706 struct inode *inode = dentry->d_inode; 3707 struct btrfs_root *root = BTRFS_I(inode)->root; 3708 int err; 3709 3710 if (btrfs_root_readonly(root)) 3711 return -EROFS; 3712 3713 err = inode_change_ok(inode, attr); 3714 if (err) 3715 return err; 3716 3717 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3718 err = btrfs_setsize(inode, attr->ia_size); 3719 if (err) 3720 return err; 3721 } 3722 3723 if (attr->ia_valid) { 3724 setattr_copy(inode, attr); 3725 inode_inc_iversion(inode); 3726 err = btrfs_dirty_inode(inode); 3727 3728 if (!err && attr->ia_valid & ATTR_MODE) 3729 err = btrfs_acl_chmod(inode); 3730 } 3731 3732 return err; 3733 } 3734 3735 void btrfs_evict_inode(struct inode *inode) 3736 { 3737 struct btrfs_trans_handle *trans; 3738 struct btrfs_root *root = BTRFS_I(inode)->root; 3739 struct btrfs_block_rsv *rsv, *global_rsv; 3740 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 3741 unsigned long nr; 3742 int ret; 3743 3744 trace_btrfs_inode_evict(inode); 3745 3746 truncate_inode_pages(&inode->i_data, 0); 3747 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3748 btrfs_is_free_space_inode(inode))) 3749 goto no_delete; 3750 3751 if (is_bad_inode(inode)) { 3752 btrfs_orphan_del(NULL, inode); 3753 goto no_delete; 3754 } 3755 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ 3756 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3757 3758 if (root->fs_info->log_root_recovering) { 3759 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3760 &BTRFS_I(inode)->runtime_flags)); 3761 goto no_delete; 3762 } 3763 3764 if (inode->i_nlink > 0) { 3765 BUG_ON(btrfs_root_refs(&root->root_item) != 0); 3766 goto no_delete; 3767 } 3768 3769 rsv = btrfs_alloc_block_rsv(root); 3770 if (!rsv) { 3771 btrfs_orphan_del(NULL, inode); 3772 goto no_delete; 3773 } 3774 rsv->size = min_size; 3775 global_rsv = &root->fs_info->global_block_rsv; 3776 3777 btrfs_i_size_write(inode, 0); 3778 3779 /* 3780 * This is a bit simpler than btrfs_truncate since 3781 * 3782 * 1) We've already reserved our space for our orphan item in the 3783 * unlink. 3784 * 2) We're going to delete the inode item, so we don't need to update 3785 * it at all. 3786 * 3787 * So we just need to reserve some slack space in case we add bytes when 3788 * doing the truncate. 3789 */ 3790 while (1) { 3791 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3792 3793 /* 3794 * Try and steal from the global reserve since we will 3795 * likely not use this space anyway, we want to try as 3796 * hard as possible to get this to work. 
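 * The loop below keeps retrying for as long as btrfs_truncate_inode_items
 * returns -EAGAIN, ending the transaction and rebalancing dirty btree
 * pages between passes.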
3797 */ 3798 if (ret) 3799 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); 3800 3801 if (ret) { 3802 printk(KERN_WARNING "Could not get space for a " 3803 "delete, will truncate on mount %d\n", ret); 3804 btrfs_orphan_del(NULL, inode); 3805 btrfs_free_block_rsv(root, rsv); 3806 goto no_delete; 3807 } 3808 3809 trans = btrfs_start_transaction(root, 0); 3810 if (IS_ERR(trans)) { 3811 btrfs_orphan_del(NULL, inode); 3812 btrfs_free_block_rsv(root, rsv); 3813 goto no_delete; 3814 } 3815 3816 trans->block_rsv = rsv; 3817 3818 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3819 if (ret != -EAGAIN) 3820 break; 3821 3822 nr = trans->blocks_used; 3823 btrfs_end_transaction(trans, root); 3824 trans = NULL; 3825 btrfs_btree_balance_dirty(root, nr); 3826 } 3827 3828 btrfs_free_block_rsv(root, rsv); 3829 3830 if (ret == 0) { 3831 trans->block_rsv = root->orphan_block_rsv; 3832 ret = btrfs_orphan_del(trans, inode); 3833 BUG_ON(ret); 3834 } 3835 3836 trans->block_rsv = &root->fs_info->trans_block_rsv; 3837 if (!(root == root->fs_info->tree_root || 3838 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3839 btrfs_return_ino(root, btrfs_ino(inode)); 3840 3841 nr = trans->blocks_used; 3842 btrfs_end_transaction(trans, root); 3843 btrfs_btree_balance_dirty(root, nr); 3844 no_delete: 3845 clear_inode(inode); 3846 return; 3847 } 3848 3849 /* 3850 * this returns the key found in the dir entry in the location pointer. 3851 * If no dir entries were found, location->objectid is 0. 3852 */ 3853 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 3854 struct btrfs_key *location) 3855 { 3856 const char *name = dentry->d_name.name; 3857 int namelen = dentry->d_name.len; 3858 struct btrfs_dir_item *di; 3859 struct btrfs_path *path; 3860 struct btrfs_root *root = BTRFS_I(dir)->root; 3861 int ret = 0; 3862 3863 path = btrfs_alloc_path(); 3864 if (!path) 3865 return -ENOMEM; 3866 3867 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, 3868 namelen, 0); 3869 if (IS_ERR(di)) 3870 ret = PTR_ERR(di); 3871 3872 if (IS_ERR_OR_NULL(di)) 3873 goto out_err; 3874 3875 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 3876 out: 3877 btrfs_free_path(path); 3878 return ret; 3879 out_err: 3880 location->objectid = 0; 3881 goto out; 3882 } 3883 3884 /* 3885 * when we hit a tree root in a directory, the btrfs part of the inode 3886 * needs to be changed to reflect the root directory of the tree root. This 3887 * is kind of like crossing a mount point. 
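 * The dir entry we resolved names a subvolume root (a BTRFS_ROOT_ITEM_KEY
 * location) rather than an inode; once the root backref checks out we
 * read that subvolume's root and point the location at its root dirid
 * (an INODE_ITEM key) instead.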
3888 */ 3889 static int fixup_tree_root_location(struct btrfs_root *root, 3890 struct inode *dir, 3891 struct dentry *dentry, 3892 struct btrfs_key *location, 3893 struct btrfs_root **sub_root) 3894 { 3895 struct btrfs_path *path; 3896 struct btrfs_root *new_root; 3897 struct btrfs_root_ref *ref; 3898 struct extent_buffer *leaf; 3899 int ret; 3900 int err = 0; 3901 3902 path = btrfs_alloc_path(); 3903 if (!path) { 3904 err = -ENOMEM; 3905 goto out; 3906 } 3907 3908 err = -ENOENT; 3909 ret = btrfs_find_root_ref(root->fs_info->tree_root, path, 3910 BTRFS_I(dir)->root->root_key.objectid, 3911 location->objectid); 3912 if (ret) { 3913 if (ret < 0) 3914 err = ret; 3915 goto out; 3916 } 3917 3918 leaf = path->nodes[0]; 3919 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 3920 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || 3921 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 3922 goto out; 3923 3924 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 3925 (unsigned long)(ref + 1), 3926 dentry->d_name.len); 3927 if (ret) 3928 goto out; 3929 3930 btrfs_release_path(path); 3931 3932 new_root = btrfs_read_fs_root_no_name(root->fs_info, location); 3933 if (IS_ERR(new_root)) { 3934 err = PTR_ERR(new_root); 3935 goto out; 3936 } 3937 3938 if (btrfs_root_refs(&new_root->root_item) == 0) { 3939 err = -ENOENT; 3940 goto out; 3941 } 3942 3943 *sub_root = new_root; 3944 location->objectid = btrfs_root_dirid(&new_root->root_item); 3945 location->type = BTRFS_INODE_ITEM_KEY; 3946 location->offset = 0; 3947 err = 0; 3948 out: 3949 btrfs_free_path(path); 3950 return err; 3951 } 3952 3953 static void inode_tree_add(struct inode *inode) 3954 { 3955 struct btrfs_root *root = BTRFS_I(inode)->root; 3956 struct btrfs_inode *entry; 3957 struct rb_node **p; 3958 struct rb_node *parent; 3959 u64 ino = btrfs_ino(inode); 3960 again: 3961 p = &root->inode_tree.rb_node; 3962 parent = NULL; 3963 3964 if (inode_unhashed(inode)) 3965 return; 3966 3967 spin_lock(&root->inode_lock); 3968 while (*p) { 3969 parent = *p; 3970 entry = rb_entry(parent, struct btrfs_inode, rb_node); 3971 3972 if (ino < btrfs_ino(&entry->vfs_inode)) 3973 p = &parent->rb_left; 3974 else if (ino > btrfs_ino(&entry->vfs_inode)) 3975 p = &parent->rb_right; 3976 else { 3977 WARN_ON(!(entry->vfs_inode.i_state & 3978 (I_WILL_FREE | I_FREEING))); 3979 rb_erase(parent, &root->inode_tree); 3980 RB_CLEAR_NODE(parent); 3981 spin_unlock(&root->inode_lock); 3982 goto again; 3983 } 3984 } 3985 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); 3986 rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3987 spin_unlock(&root->inode_lock); 3988 } 3989 3990 static void inode_tree_del(struct inode *inode) 3991 { 3992 struct btrfs_root *root = BTRFS_I(inode)->root; 3993 int empty = 0; 3994 3995 spin_lock(&root->inode_lock); 3996 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 3997 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3998 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 3999 empty = RB_EMPTY_ROOT(&root->inode_tree); 4000 } 4001 spin_unlock(&root->inode_lock); 4002 4003 /* 4004 * Free space cache has inodes in the tree root, but the tree root has a 4005 * root_refs of 0, so this could end up dropping the tree root as a 4006 * snapshot, so we need the extra !root->fs_info->tree_root check to 4007 * make sure we don't drop it. 
4008 */ 4009 if (empty && btrfs_root_refs(&root->root_item) == 0 && 4010 root != root->fs_info->tree_root) { 4011 synchronize_srcu(&root->fs_info->subvol_srcu); 4012 spin_lock(&root->inode_lock); 4013 empty = RB_EMPTY_ROOT(&root->inode_tree); 4014 spin_unlock(&root->inode_lock); 4015 if (empty) 4016 btrfs_add_dead_root(root); 4017 } 4018 } 4019 4020 void btrfs_invalidate_inodes(struct btrfs_root *root) 4021 { 4022 struct rb_node *node; 4023 struct rb_node *prev; 4024 struct btrfs_inode *entry; 4025 struct inode *inode; 4026 u64 objectid = 0; 4027 4028 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4029 4030 spin_lock(&root->inode_lock); 4031 again: 4032 node = root->inode_tree.rb_node; 4033 prev = NULL; 4034 while (node) { 4035 prev = node; 4036 entry = rb_entry(node, struct btrfs_inode, rb_node); 4037 4038 if (objectid < btrfs_ino(&entry->vfs_inode)) 4039 node = node->rb_left; 4040 else if (objectid > btrfs_ino(&entry->vfs_inode)) 4041 node = node->rb_right; 4042 else 4043 break; 4044 } 4045 if (!node) { 4046 while (prev) { 4047 entry = rb_entry(prev, struct btrfs_inode, rb_node); 4048 if (objectid <= btrfs_ino(&entry->vfs_inode)) { 4049 node = prev; 4050 break; 4051 } 4052 prev = rb_next(prev); 4053 } 4054 } 4055 while (node) { 4056 entry = rb_entry(node, struct btrfs_inode, rb_node); 4057 objectid = btrfs_ino(&entry->vfs_inode) + 1; 4058 inode = igrab(&entry->vfs_inode); 4059 if (inode) { 4060 spin_unlock(&root->inode_lock); 4061 if (atomic_read(&inode->i_count) > 1) 4062 d_prune_aliases(inode); 4063 /* 4064 * btrfs_drop_inode will have it removed from 4065 * the inode cache when its usage count 4066 * hits zero. 4067 */ 4068 iput(inode); 4069 cond_resched(); 4070 spin_lock(&root->inode_lock); 4071 goto again; 4072 } 4073 4074 if (cond_resched_lock(&root->inode_lock)) 4075 goto again; 4076 4077 node = rb_next(node); 4078 } 4079 spin_unlock(&root->inode_lock); 4080 } 4081 4082 static int btrfs_init_locked_inode(struct inode *inode, void *p) 4083 { 4084 struct btrfs_iget_args *args = p; 4085 inode->i_ino = args->ino; 4086 BTRFS_I(inode)->root = args->root; 4087 return 0; 4088 } 4089 4090 static int btrfs_find_actor(struct inode *inode, void *opaque) 4091 { 4092 struct btrfs_iget_args *args = opaque; 4093 return args->ino == btrfs_ino(inode) && 4094 args->root == BTRFS_I(inode)->root; 4095 } 4096 4097 static struct inode *btrfs_iget_locked(struct super_block *s, 4098 u64 objectid, 4099 struct btrfs_root *root) 4100 { 4101 struct inode *inode; 4102 struct btrfs_iget_args args; 4103 args.ino = objectid; 4104 args.root = root; 4105 4106 inode = iget5_locked(s, objectid, btrfs_find_actor, 4107 btrfs_init_locked_inode, 4108 (void *)&args); 4109 return inode; 4110 } 4111 4112 /* Get an inode object given its location and corresponding root. 
4113 * Returns in *is_new if the inode was read from disk 4114 */ 4115 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 4116 struct btrfs_root *root, int *new) 4117 { 4118 struct inode *inode; 4119 4120 inode = btrfs_iget_locked(s, location->objectid, root); 4121 if (!inode) 4122 return ERR_PTR(-ENOMEM); 4123 4124 if (inode->i_state & I_NEW) { 4125 BTRFS_I(inode)->root = root; 4126 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 4127 btrfs_read_locked_inode(inode); 4128 if (!is_bad_inode(inode)) { 4129 inode_tree_add(inode); 4130 unlock_new_inode(inode); 4131 if (new) 4132 *new = 1; 4133 } else { 4134 unlock_new_inode(inode); 4135 iput(inode); 4136 inode = ERR_PTR(-ESTALE); 4137 } 4138 } 4139 4140 return inode; 4141 } 4142 4143 static struct inode *new_simple_dir(struct super_block *s, 4144 struct btrfs_key *key, 4145 struct btrfs_root *root) 4146 { 4147 struct inode *inode = new_inode(s); 4148 4149 if (!inode) 4150 return ERR_PTR(-ENOMEM); 4151 4152 BTRFS_I(inode)->root = root; 4153 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4154 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); 4155 4156 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 4157 inode->i_op = &btrfs_dir_ro_inode_operations; 4158 inode->i_fop = &simple_dir_operations; 4159 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 4160 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4161 4162 return inode; 4163 } 4164 4165 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 4166 { 4167 struct inode *inode; 4168 struct btrfs_root *root = BTRFS_I(dir)->root; 4169 struct btrfs_root *sub_root = root; 4170 struct btrfs_key location; 4171 int index; 4172 int ret = 0; 4173 4174 if (dentry->d_name.len > BTRFS_NAME_LEN) 4175 return ERR_PTR(-ENAMETOOLONG); 4176 4177 if (unlikely(d_need_lookup(dentry))) { 4178 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); 4179 kfree(dentry->d_fsdata); 4180 dentry->d_fsdata = NULL; 4181 /* This thing is hashed, drop it for now */ 4182 d_drop(dentry); 4183 } else { 4184 ret = btrfs_inode_by_name(dir, dentry, &location); 4185 } 4186 4187 if (ret < 0) 4188 return ERR_PTR(ret); 4189 4190 if (location.objectid == 0) 4191 return NULL; 4192 4193 if (location.type == BTRFS_INODE_ITEM_KEY) { 4194 inode = btrfs_iget(dir->i_sb, &location, root, NULL); 4195 return inode; 4196 } 4197 4198 BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); 4199 4200 index = srcu_read_lock(&root->fs_info->subvol_srcu); 4201 ret = fixup_tree_root_location(root, dir, dentry, 4202 &location, &sub_root); 4203 if (ret < 0) { 4204 if (ret != -ENOENT) 4205 inode = ERR_PTR(ret); 4206 else 4207 inode = new_simple_dir(dir->i_sb, &location, sub_root); 4208 } else { 4209 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); 4210 } 4211 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4212 4213 if (!IS_ERR(inode) && root != sub_root) { 4214 down_read(&root->fs_info->cleanup_work_sem); 4215 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4216 ret = btrfs_orphan_cleanup(sub_root); 4217 up_read(&root->fs_info->cleanup_work_sem); 4218 if (ret) 4219 inode = ERR_PTR(ret); 4220 } 4221 4222 return inode; 4223 } 4224 4225 static int btrfs_dentry_delete(const struct dentry *dentry) 4226 { 4227 struct btrfs_root *root; 4228 struct inode *inode = dentry->d_inode; 4229 4230 if (!inode && !IS_ROOT(dentry)) 4231 inode = dentry->d_parent->d_inode; 4232 4233 if (inode) { 4234 root = BTRFS_I(inode)->root; 4235 if (btrfs_root_refs(&root->root_item) == 0) 
4236 return 1; 4237 4238 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 4239 return 1; 4240 } 4241 return 0; 4242 } 4243 4244 static void btrfs_dentry_release(struct dentry *dentry) 4245 { 4246 if (dentry->d_fsdata) 4247 kfree(dentry->d_fsdata); 4248 } 4249 4250 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 4251 unsigned int flags) 4252 { 4253 struct dentry *ret; 4254 4255 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); 4256 if (unlikely(d_need_lookup(dentry))) { 4257 spin_lock(&dentry->d_lock); 4258 dentry->d_flags &= ~DCACHE_NEED_LOOKUP; 4259 spin_unlock(&dentry->d_lock); 4260 } 4261 return ret; 4262 } 4263 4264 unsigned char btrfs_filetype_table[] = { 4265 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 4266 }; 4267 4268 static int btrfs_real_readdir(struct file *filp, void *dirent, 4269 filldir_t filldir) 4270 { 4271 struct inode *inode = filp->f_dentry->d_inode; 4272 struct btrfs_root *root = BTRFS_I(inode)->root; 4273 struct btrfs_item *item; 4274 struct btrfs_dir_item *di; 4275 struct btrfs_key key; 4276 struct btrfs_key found_key; 4277 struct btrfs_path *path; 4278 struct list_head ins_list; 4279 struct list_head del_list; 4280 int ret; 4281 struct extent_buffer *leaf; 4282 int slot; 4283 unsigned char d_type; 4284 int over = 0; 4285 u32 di_cur; 4286 u32 di_total; 4287 u32 di_len; 4288 int key_type = BTRFS_DIR_INDEX_KEY; 4289 char tmp_name[32]; 4290 char *name_ptr; 4291 int name_len; 4292 int is_curr = 0; /* filp->f_pos points to the current index? */ 4293 4294 /* FIXME, use a real flag for deciding about the key type */ 4295 if (root->fs_info->tree_root == root) 4296 key_type = BTRFS_DIR_ITEM_KEY; 4297 4298 /* special case for "." */ 4299 if (filp->f_pos == 0) { 4300 over = filldir(dirent, ".", 1, 4301 filp->f_pos, btrfs_ino(inode), DT_DIR); 4302 if (over) 4303 return 0; 4304 filp->f_pos = 1; 4305 } 4306 /* special case for .., just use the back ref */ 4307 if (filp->f_pos == 1) { 4308 u64 pino = parent_ino(filp->f_path.dentry); 4309 over = filldir(dirent, "..", 2, 4310 filp->f_pos, pino, DT_DIR); 4311 if (over) 4312 return 0; 4313 filp->f_pos = 2; 4314 } 4315 path = btrfs_alloc_path(); 4316 if (!path) 4317 return -ENOMEM; 4318 4319 path->reada = 1; 4320 4321 if (key_type == BTRFS_DIR_INDEX_KEY) { 4322 INIT_LIST_HEAD(&ins_list); 4323 INIT_LIST_HEAD(&del_list); 4324 btrfs_get_delayed_items(inode, &ins_list, &del_list); 4325 } 4326 4327 btrfs_set_key_type(&key, key_type); 4328 key.offset = filp->f_pos; 4329 key.objectid = btrfs_ino(inode); 4330 4331 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4332 if (ret < 0) 4333 goto err; 4334 4335 while (1) { 4336 leaf = path->nodes[0]; 4337 slot = path->slots[0]; 4338 if (slot >= btrfs_header_nritems(leaf)) { 4339 ret = btrfs_next_leaf(root, path); 4340 if (ret < 0) 4341 goto err; 4342 else if (ret > 0) 4343 break; 4344 continue; 4345 } 4346 4347 item = btrfs_item_nr(leaf, slot); 4348 btrfs_item_key_to_cpu(leaf, &found_key, slot); 4349 4350 if (found_key.objectid != key.objectid) 4351 break; 4352 if (btrfs_key_type(&found_key) != key_type) 4353 break; 4354 if (found_key.offset < filp->f_pos) 4355 goto next; 4356 if (key_type == BTRFS_DIR_INDEX_KEY && 4357 btrfs_should_delete_dir_index(&del_list, 4358 found_key.offset)) 4359 goto next; 4360 4361 filp->f_pos = found_key.offset; 4362 is_curr = 1; 4363 4364 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 4365 di_cur = 0; 4366 di_total = btrfs_item_size(leaf, item); 4367 4368 while (di_cur < di_total) { 
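			/*
			 * One btree item can pack several dir items back to
			 * back; each entry occupies sizeof(*di) plus its name
			 * and data payload, so we advance di_cur by that
			 * amount (di_len below) until the item is consumed.
			 */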
4369 struct btrfs_key location; 4370 4371 if (verify_dir_item(root, leaf, di)) 4372 break; 4373 4374 name_len = btrfs_dir_name_len(leaf, di); 4375 if (name_len <= sizeof(tmp_name)) { 4376 name_ptr = tmp_name; 4377 } else { 4378 name_ptr = kmalloc(name_len, GFP_NOFS); 4379 if (!name_ptr) { 4380 ret = -ENOMEM; 4381 goto err; 4382 } 4383 } 4384 read_extent_buffer(leaf, name_ptr, 4385 (unsigned long)(di + 1), name_len); 4386 4387 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; 4388 btrfs_dir_item_key_to_cpu(leaf, di, &location); 4389 4390 4391 /* is this a reference to our own snapshot? If so 4392 * skip it. 4393 * 4394 * In contrast to old kernels, we insert the snapshot's 4395 * dir item and dir index after it has been created, so 4396 * we won't find a reference to our own snapshot. We 4397 * still keep the following code for backward 4398 * compatibility. 4399 */ 4400 if (location.type == BTRFS_ROOT_ITEM_KEY && 4401 location.objectid == root->root_key.objectid) { 4402 over = 0; 4403 goto skip; 4404 } 4405 over = filldir(dirent, name_ptr, name_len, 4406 found_key.offset, location.objectid, 4407 d_type); 4408 4409 skip: 4410 if (name_ptr != tmp_name) 4411 kfree(name_ptr); 4412 4413 if (over) 4414 goto nopos; 4415 di_len = btrfs_dir_name_len(leaf, di) + 4416 btrfs_dir_data_len(leaf, di) + sizeof(*di); 4417 di_cur += di_len; 4418 di = (struct btrfs_dir_item *)((char *)di + di_len); 4419 } 4420 next: 4421 path->slots[0]++; 4422 } 4423 4424 if (key_type == BTRFS_DIR_INDEX_KEY) { 4425 if (is_curr) 4426 filp->f_pos++; 4427 ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir, 4428 &ins_list); 4429 if (ret) 4430 goto nopos; 4431 } 4432 4433 /* Reached end of directory/root. Bump pos past the last item. */ 4434 if (key_type == BTRFS_DIR_INDEX_KEY) 4435 /* 4436 * 32-bit glibc will use getdents64, but then strtol - 4437 * so the last number we can serve is this. 4438 */ 4439 filp->f_pos = 0x7fffffff; 4440 else 4441 filp->f_pos++; 4442 nopos: 4443 ret = 0; 4444 err: 4445 if (key_type == BTRFS_DIR_INDEX_KEY) 4446 btrfs_put_delayed_items(&ins_list, &del_list); 4447 btrfs_free_path(path); 4448 return ret; 4449 } 4450 4451 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) 4452 { 4453 struct btrfs_root *root = BTRFS_I(inode)->root; 4454 struct btrfs_trans_handle *trans; 4455 int ret = 0; 4456 bool nolock = false; 4457 4458 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 4459 return 0; 4460 4461 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode)) 4462 nolock = true; 4463 4464 if (wbc->sync_mode == WB_SYNC_ALL) { 4465 if (nolock) 4466 trans = btrfs_join_transaction_nolock(root); 4467 else 4468 trans = btrfs_join_transaction(root); 4469 if (IS_ERR(trans)) 4470 return PTR_ERR(trans); 4471 if (nolock) 4472 ret = btrfs_end_transaction_nolock(trans, root); 4473 else 4474 ret = btrfs_commit_transaction(trans, root); 4475 } 4476 return ret; 4477 } 4478 4479 /* 4480 * This is somewhat expensive, updating the tree every time the 4481 * inode changes. But, it is most likely to find the inode in cache. 4482 * FIXME, needs more benchmarking...there are no reasons other than performance 4483 * to keep or drop this code. 
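 *
 * A typical call site (illustrative, mirroring btrfs_setattr above):
 *
 *	setattr_copy(inode, attr);
 *	inode_inc_iversion(inode);
 *	err = btrfs_dirty_inode(inode);
 *
 * On -ENOSPC from the cheap joined transaction, the code below
 * retries once with a full transaction that is allowed to flush
 * space.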
 */
int btrfs_dirty_inode(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	int ret;

	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
		return 0;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_update_inode(trans, root, inode);
	if (ret == -ENOSPC) {
		/* whoops, let's try again with the full transaction */
		btrfs_end_transaction(trans, root);
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		ret = btrfs_update_inode(trans, root, inode);
	}
	btrfs_end_transaction(trans, root);
	if (BTRFS_I(inode)->delayed_node)
		btrfs_balance_delayed_items(root);

	return ret;
}

/*
 * This is a copy of file_update_time. We need this so we can return error on
 * ENOSPC for updating the inode in the case of file write and mmap writes.
 */
static int btrfs_update_time(struct inode *inode, struct timespec *now,
			     int flags)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (btrfs_root_readonly(root))
		return -EROFS;

	if (flags & S_VERSION)
		inode_inc_iversion(inode);
	if (flags & S_CTIME)
		inode->i_ctime = *now;
	if (flags & S_MTIME)
		inode->i_mtime = *now;
	if (flags & S_ATIME)
		inode->i_atime = *now;
	return btrfs_dirty_inode(inode);
}

/*
 * find the highest existing sequence number in a directory
 * and then set the in-memory index_cnt variable to reflect
 * free sequence numbers
 */
static int btrfs_set_inode_index_count(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key key, found_key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int ret;

	key.objectid = btrfs_ino(inode);
	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
	key.offset = (u64)-1;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	/* FIXME: we should be able to handle this */
	if (ret == 0)
		goto out;
	ret = 0;

	/*
	 * MAGIC NUMBER EXPLANATION:
	 * since we search a directory based on f_pos we have to start at 2
	 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
	 * else has to start at 2
	 */
	if (path->slots[0] == 0) {
		BTRFS_I(inode)->index_cnt = 2;
		goto out;
	}

	path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

	if (found_key.objectid != btrfs_ino(inode) ||
	    btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
		BTRFS_I(inode)->index_cnt = 2;
		goto out;
	}

	BTRFS_I(inode)->index_cnt = found_key.offset + 1;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * helper to find a free sequence number in a given directory. This
 * current code is very simple; later versions will do smarter things
 * in the btree.
 */
int btrfs_set_inode_index(struct inode *dir, u64 *index)
{
	int ret = 0;

	if (BTRFS_I(dir)->index_cnt == (u64)-1) {
		ret = btrfs_inode_delayed_dir_index_count(dir);
		if (ret) {
			ret = btrfs_set_inode_index_count(dir);
			if (ret)
				return ret;
		}
	}

	*index = BTRFS_I(dir)->index_cnt;
	BTRFS_I(dir)->index_cnt++;

	return ret;
}

static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct inode *dir,
				     const char *name, int name_len,
				     u64 ref_objectid, u64 objectid,
				     umode_t mode, u64 *index)
{
	struct inode *inode;
	struct btrfs_inode_item *inode_item;
	struct btrfs_key *location;
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	struct btrfs_key key[2];
	u32 sizes[2];
	unsigned long ptr;
	int ret;
	int owner;

	path = btrfs_alloc_path();
	if (!path)
		return ERR_PTR(-ENOMEM);

	inode = new_inode(root->fs_info->sb);
	if (!inode) {
		btrfs_free_path(path);
		return ERR_PTR(-ENOMEM);
	}

	/*
	 * we have to initialize this early, so we can reclaim the inode
	 * number if we fail afterwards in this function.
	 */
	inode->i_ino = objectid;

	if (dir) {
		trace_btrfs_inode_request(dir);

		ret = btrfs_set_inode_index(dir, index);
		if (ret) {
			btrfs_free_path(path);
			iput(inode);
			return ERR_PTR(ret);
		}
	}
	/*
	 * index_cnt is ignored for everything but a dir,
	 * btrfs_set_inode_index_count has an explanation for the magic
	 * number
	 */
	BTRFS_I(inode)->index_cnt = 2;
	BTRFS_I(inode)->root = root;
	BTRFS_I(inode)->generation = trans->transid;
	inode->i_generation = BTRFS_I(inode)->generation;

	if (S_ISDIR(mode))
		owner = 0;
	else
		owner = 1;

	key[0].objectid = objectid;
	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
	key[0].offset = 0;

	key[1].objectid = objectid;
	btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
	key[1].offset = ref_objectid;

	sizes[0] = sizeof(struct btrfs_inode_item);
	sizes[1] = name_len + sizeof(*ref);

	path->leave_spinning = 1;
	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
	if (ret != 0)
		goto fail;

	inode_init_owner(inode, dir, mode);
	inode_set_bytes(inode, 0);
	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_inode_item);
	memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
			     sizeof(*inode_item));
	fill_inode_item(trans, path->nodes[0], inode_item, inode);

	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
			     struct btrfs_inode_ref);
	btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
	btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
	ptr = (unsigned long)(ref + 1);
	write_extent_buffer(path->nodes[0], name, ptr, name_len);

	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	location = &BTRFS_I(inode)->location;
	location->objectid = objectid;
	location->offset = 0;
	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);

	btrfs_inherit_iflags(inode, dir);

	if (S_ISREG(mode)) {
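		/*
		 * Checksum and cow behaviour only make sense for regular
		 * file data; pick the flags up from the mount options or
		 * inherit NODATACOW from the parent directory.
		 */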
		if (btrfs_test_opt(root, NODATASUM))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
		if (btrfs_test_opt(root, NODATACOW) ||
		    (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
	}

	insert_inode_hash(inode);
	inode_tree_add(inode);

	trace_btrfs_inode_new(inode);
	btrfs_set_inode_last_trans(trans, inode);

	btrfs_update_root_times(trans, root);

	return inode;
fail:
	if (dir)
		BTRFS_I(dir)->index_cnt--;
	btrfs_free_path(path);
	iput(inode);
	return ERR_PTR(ret);
}

static inline u8 btrfs_inode_type(struct inode *inode)
{
	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
}

/*
 * utility function to add 'inode' into 'parent_inode' with
 * a given name and a given sequence number.
 * if 'add_backref' is true, also insert a backref from the
 * inode to the parent directory.
 */
int btrfs_add_link(struct btrfs_trans_handle *trans,
		   struct inode *parent_inode, struct inode *inode,
		   const char *name, int name_len, int add_backref, u64 index)
{
	int ret = 0;
	struct btrfs_key key;
	struct btrfs_root *root = BTRFS_I(parent_inode)->root;
	u64 ino = btrfs_ino(inode);
	u64 parent_ino = btrfs_ino(parent_inode);

	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
	} else {
		key.objectid = ino;
		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
		key.offset = 0;
	}

	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
					 key.objectid, root->root_key.objectid,
					 parent_ino, index, name, name_len);
	} else if (add_backref) {
		ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
					     parent_ino, index);
	}

	/* Nothing to clean up yet */
	if (ret)
		return ret;

	ret = btrfs_insert_dir_item(trans, root, name, name_len,
				    parent_inode, &key,
				    btrfs_inode_type(inode), index);
	if (ret == -EEXIST)
		goto fail_dir_item;
	else if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}

	btrfs_i_size_write(parent_inode, parent_inode->i_size +
			   name_len * 2);
	inode_inc_iversion(parent_inode);
	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
	ret = btrfs_update_inode(trans, root, parent_inode);
	if (ret)
		btrfs_abort_transaction(trans, root, ret);
	return ret;

fail_dir_item:
	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		u64 local_index;
		int err;
		err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
					 key.objectid, root->root_key.objectid,
					 parent_ino, &local_index, name, name_len);

	} else if (add_backref) {
		u64 local_index;
		int err;

		err = btrfs_del_inode_ref(trans, root, name, name_len,
					  ino, parent_ino, &local_index);
	}
	return ret;
}

static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
			    struct inode *dir, struct dentry *dentry,
			    struct inode *inode, int backref, u64 index)
{
	int err = btrfs_add_link(trans, dir, inode,
				 dentry->d_name.name, dentry->d_name.len,
				 backref, index);
	if (err > 0)
		err = -EEXIST;
	return err;
}

static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
		       umode_t mode, dev_t
rdev) 4837 { 4838 struct btrfs_trans_handle *trans; 4839 struct btrfs_root *root = BTRFS_I(dir)->root; 4840 struct inode *inode = NULL; 4841 int err; 4842 int drop_inode = 0; 4843 u64 objectid; 4844 unsigned long nr = 0; 4845 u64 index = 0; 4846 4847 if (!new_valid_dev(rdev)) 4848 return -EINVAL; 4849 4850 /* 4851 * 2 for inode item and ref 4852 * 2 for dir items 4853 * 1 for xattr if selinux is on 4854 */ 4855 trans = btrfs_start_transaction(root, 5); 4856 if (IS_ERR(trans)) 4857 return PTR_ERR(trans); 4858 4859 err = btrfs_find_free_ino(root, &objectid); 4860 if (err) 4861 goto out_unlock; 4862 4863 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4864 dentry->d_name.len, btrfs_ino(dir), objectid, 4865 mode, &index); 4866 if (IS_ERR(inode)) { 4867 err = PTR_ERR(inode); 4868 goto out_unlock; 4869 } 4870 4871 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 4872 if (err) { 4873 drop_inode = 1; 4874 goto out_unlock; 4875 } 4876 4877 /* 4878 * If the active LSM wants to access the inode during 4879 * d_instantiate it needs these. Smack checks to see 4880 * if the filesystem supports xattrs by looking at the 4881 * ops vector. 4882 */ 4883 4884 inode->i_op = &btrfs_special_inode_operations; 4885 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4886 if (err) 4887 drop_inode = 1; 4888 else { 4889 init_special_inode(inode, inode->i_mode, rdev); 4890 btrfs_update_inode(trans, root, inode); 4891 d_instantiate(dentry, inode); 4892 } 4893 out_unlock: 4894 nr = trans->blocks_used; 4895 btrfs_end_transaction(trans, root); 4896 btrfs_btree_balance_dirty(root, nr); 4897 if (drop_inode) { 4898 inode_dec_link_count(inode); 4899 iput(inode); 4900 } 4901 return err; 4902 } 4903 4904 static int btrfs_create(struct inode *dir, struct dentry *dentry, 4905 umode_t mode, bool excl) 4906 { 4907 struct btrfs_trans_handle *trans; 4908 struct btrfs_root *root = BTRFS_I(dir)->root; 4909 struct inode *inode = NULL; 4910 int drop_inode = 0; 4911 int err; 4912 unsigned long nr = 0; 4913 u64 objectid; 4914 u64 index = 0; 4915 4916 /* 4917 * 2 for inode item and ref 4918 * 2 for dir items 4919 * 1 for xattr if selinux is on 4920 */ 4921 trans = btrfs_start_transaction(root, 5); 4922 if (IS_ERR(trans)) 4923 return PTR_ERR(trans); 4924 4925 err = btrfs_find_free_ino(root, &objectid); 4926 if (err) 4927 goto out_unlock; 4928 4929 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4930 dentry->d_name.len, btrfs_ino(dir), objectid, 4931 mode, &index); 4932 if (IS_ERR(inode)) { 4933 err = PTR_ERR(inode); 4934 goto out_unlock; 4935 } 4936 4937 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 4938 if (err) { 4939 drop_inode = 1; 4940 goto out_unlock; 4941 } 4942 4943 /* 4944 * If the active LSM wants to access the inode during 4945 * d_instantiate it needs these. Smack checks to see 4946 * if the filesystem supports xattrs by looking at the 4947 * ops vector. 
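 *
 * That is why i_op and i_fop are assigned below before
 * btrfs_add_nondir() and d_instantiate(), while the address_space
 * ops can wait until the link has actually been added.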
4948 */ 4949 inode->i_fop = &btrfs_file_operations; 4950 inode->i_op = &btrfs_file_inode_operations; 4951 4952 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4953 if (err) 4954 drop_inode = 1; 4955 else { 4956 inode->i_mapping->a_ops = &btrfs_aops; 4957 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 4958 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4959 d_instantiate(dentry, inode); 4960 } 4961 out_unlock: 4962 nr = trans->blocks_used; 4963 btrfs_end_transaction(trans, root); 4964 if (drop_inode) { 4965 inode_dec_link_count(inode); 4966 iput(inode); 4967 } 4968 btrfs_btree_balance_dirty(root, nr); 4969 return err; 4970 } 4971 4972 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 4973 struct dentry *dentry) 4974 { 4975 struct btrfs_trans_handle *trans; 4976 struct btrfs_root *root = BTRFS_I(dir)->root; 4977 struct inode *inode = old_dentry->d_inode; 4978 u64 index; 4979 unsigned long nr = 0; 4980 int err; 4981 int drop_inode = 0; 4982 4983 /* do not allow sys_link's with other subvols of the same device */ 4984 if (root->objectid != BTRFS_I(inode)->root->objectid) 4985 return -EXDEV; 4986 4987 if (inode->i_nlink == ~0U) 4988 return -EMLINK; 4989 4990 err = btrfs_set_inode_index(dir, &index); 4991 if (err) 4992 goto fail; 4993 4994 /* 4995 * 2 items for inode and inode ref 4996 * 2 items for dir items 4997 * 1 item for parent inode 4998 */ 4999 trans = btrfs_start_transaction(root, 5); 5000 if (IS_ERR(trans)) { 5001 err = PTR_ERR(trans); 5002 goto fail; 5003 } 5004 5005 btrfs_inc_nlink(inode); 5006 inode_inc_iversion(inode); 5007 inode->i_ctime = CURRENT_TIME; 5008 ihold(inode); 5009 5010 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 5011 5012 if (err) { 5013 drop_inode = 1; 5014 } else { 5015 struct dentry *parent = dentry->d_parent; 5016 err = btrfs_update_inode(trans, root, inode); 5017 if (err) 5018 goto fail; 5019 d_instantiate(dentry, inode); 5020 btrfs_log_new_name(trans, inode, NULL, parent); 5021 } 5022 5023 nr = trans->blocks_used; 5024 btrfs_end_transaction(trans, root); 5025 fail: 5026 if (drop_inode) { 5027 inode_dec_link_count(inode); 5028 iput(inode); 5029 } 5030 btrfs_btree_balance_dirty(root, nr); 5031 return err; 5032 } 5033 5034 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 5035 { 5036 struct inode *inode = NULL; 5037 struct btrfs_trans_handle *trans; 5038 struct btrfs_root *root = BTRFS_I(dir)->root; 5039 int err = 0; 5040 int drop_on_err = 0; 5041 u64 objectid = 0; 5042 u64 index = 0; 5043 unsigned long nr = 1; 5044 5045 /* 5046 * 2 items for inode and ref 5047 * 2 items for dir items 5048 * 1 for xattr if selinux is on 5049 */ 5050 trans = btrfs_start_transaction(root, 5); 5051 if (IS_ERR(trans)) 5052 return PTR_ERR(trans); 5053 5054 err = btrfs_find_free_ino(root, &objectid); 5055 if (err) 5056 goto out_fail; 5057 5058 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 5059 dentry->d_name.len, btrfs_ino(dir), objectid, 5060 S_IFDIR | mode, &index); 5061 if (IS_ERR(inode)) { 5062 err = PTR_ERR(inode); 5063 goto out_fail; 5064 } 5065 5066 drop_on_err = 1; 5067 5068 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5069 if (err) 5070 goto out_fail; 5071 5072 inode->i_op = &btrfs_dir_inode_operations; 5073 inode->i_fop = &btrfs_dir_file_operations; 5074 5075 btrfs_i_size_write(inode, 0); 5076 err = btrfs_update_inode(trans, root, inode); 5077 if (err) 5078 goto out_fail; 5079 5080 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, 5081 
			      dentry->d_name.len, 0, index);
	if (err)
		goto out_fail;

	d_instantiate(dentry, inode);
	drop_on_err = 0;

out_fail:
	nr = trans->blocks_used;
	btrfs_end_transaction(trans, root);
	if (drop_on_err)
		iput(inode);
	btrfs_btree_balance_dirty(root, nr);
	return err;
}

/* helper for btrfs_get_extent. Given an existing extent in the tree,
 * and an extent that you want to insert, deal with overlap and insert
 * the new extent into the tree.
 */
static int merge_extent_mapping(struct extent_map_tree *em_tree,
				struct extent_map *existing,
				struct extent_map *em,
				u64 map_start, u64 map_len)
{
	u64 start_diff;

	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
	start_diff = map_start - em->start;
	em->start = map_start;
	em->len = map_len;
	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
		em->block_start += start_diff;
		em->block_len -= start_diff;
	}
	return add_extent_mapping(em_tree, em);
}

static noinline int uncompress_inline(struct btrfs_path *path,
				      struct inode *inode, struct page *page,
				      size_t pg_offset, u64 extent_offset,
				      struct btrfs_file_extent_item *item)
{
	int ret;
	struct extent_buffer *leaf = path->nodes[0];
	char *tmp;
	size_t max_size;
	unsigned long inline_size;
	unsigned long ptr;
	int compress_type;

	WARN_ON(pg_offset != 0);
	compress_type = btrfs_file_extent_compression(leaf, item);
	max_size = btrfs_file_extent_ram_bytes(leaf, item);
	inline_size = btrfs_file_extent_inline_item_len(leaf,
					btrfs_item_nr(leaf, path->slots[0]));
	tmp = kmalloc(inline_size, GFP_NOFS);
	if (!tmp)
		return -ENOMEM;
	ptr = btrfs_file_extent_inline_start(item);

	read_extent_buffer(leaf, tmp, ptr, inline_size);

	max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
	ret = btrfs_decompress(compress_type, tmp, page,
			       extent_offset, inline_size, max_size);
	if (ret) {
		char *kaddr = kmap_atomic(page);
		unsigned long copy_size = min_t(u64,
				PAGE_CACHE_SIZE - pg_offset,
				max_size - extent_offset);
		memset(kaddr + pg_offset, 0, copy_size);
		kunmap_atomic(kaddr);
	}
	kfree(tmp);
	return 0;
}

/*
 * a bit scary, this does extent mapping from logical file offset to the disk.
 * the ugly parts come from merging extents from the disk with the in-ram
 * representation. This gets more complex because of the data=ordered code,
 * where the in-ram extents might be locked pending data=ordered completion.
 *
 * This also copies inline extents directly into the page.
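 *
 * A minimal read-side caller looks like this (illustrative only):
 *
 *	em = btrfs_get_extent(inode, page, 0, start, len, 0);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	... use em->block_start, em->len ...
 *	free_extent_map(em);
 *
 * On success the caller always gets a referenced map covering the
 * requested start; holes are reported with block_start set to
 * EXTENT_MAP_HOLE rather than by returning NULL.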
5167 */ 5168 5169 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 5170 size_t pg_offset, u64 start, u64 len, 5171 int create) 5172 { 5173 int ret; 5174 int err = 0; 5175 u64 bytenr; 5176 u64 extent_start = 0; 5177 u64 extent_end = 0; 5178 u64 objectid = btrfs_ino(inode); 5179 u32 found_type; 5180 struct btrfs_path *path = NULL; 5181 struct btrfs_root *root = BTRFS_I(inode)->root; 5182 struct btrfs_file_extent_item *item; 5183 struct extent_buffer *leaf; 5184 struct btrfs_key found_key; 5185 struct extent_map *em = NULL; 5186 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5187 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5188 struct btrfs_trans_handle *trans = NULL; 5189 int compress_type; 5190 5191 again: 5192 read_lock(&em_tree->lock); 5193 em = lookup_extent_mapping(em_tree, start, len); 5194 if (em) 5195 em->bdev = root->fs_info->fs_devices->latest_bdev; 5196 read_unlock(&em_tree->lock); 5197 5198 if (em) { 5199 if (em->start > start || em->start + em->len <= start) 5200 free_extent_map(em); 5201 else if (em->block_start == EXTENT_MAP_INLINE && page) 5202 free_extent_map(em); 5203 else 5204 goto out; 5205 } 5206 em = alloc_extent_map(); 5207 if (!em) { 5208 err = -ENOMEM; 5209 goto out; 5210 } 5211 em->bdev = root->fs_info->fs_devices->latest_bdev; 5212 em->start = EXTENT_MAP_HOLE; 5213 em->orig_start = EXTENT_MAP_HOLE; 5214 em->len = (u64)-1; 5215 em->block_len = (u64)-1; 5216 5217 if (!path) { 5218 path = btrfs_alloc_path(); 5219 if (!path) { 5220 err = -ENOMEM; 5221 goto out; 5222 } 5223 /* 5224 * Chances are we'll be called again, so go ahead and do 5225 * readahead 5226 */ 5227 path->reada = 1; 5228 } 5229 5230 ret = btrfs_lookup_file_extent(trans, root, path, 5231 objectid, start, trans != NULL); 5232 if (ret < 0) { 5233 err = ret; 5234 goto out; 5235 } 5236 5237 if (ret != 0) { 5238 if (path->slots[0] == 0) 5239 goto not_found; 5240 path->slots[0]--; 5241 } 5242 5243 leaf = path->nodes[0]; 5244 item = btrfs_item_ptr(leaf, path->slots[0], 5245 struct btrfs_file_extent_item); 5246 /* are we inside the extent that was found? 
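 * btrfs_lookup_file_extent() left us at (or just past) the last
 * EXTENT_DATA key <= start, so after stepping back one slot we still
 * have to verify the item belongs to this inode and actually covers
 * the offset we were asked for.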
*/ 5247 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5248 found_type = btrfs_key_type(&found_key); 5249 if (found_key.objectid != objectid || 5250 found_type != BTRFS_EXTENT_DATA_KEY) { 5251 goto not_found; 5252 } 5253 5254 found_type = btrfs_file_extent_type(leaf, item); 5255 extent_start = found_key.offset; 5256 compress_type = btrfs_file_extent_compression(leaf, item); 5257 if (found_type == BTRFS_FILE_EXTENT_REG || 5258 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5259 extent_end = extent_start + 5260 btrfs_file_extent_num_bytes(leaf, item); 5261 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 5262 size_t size; 5263 size = btrfs_file_extent_inline_len(leaf, item); 5264 extent_end = (extent_start + size + root->sectorsize - 1) & 5265 ~((u64)root->sectorsize - 1); 5266 } 5267 5268 if (start >= extent_end) { 5269 path->slots[0]++; 5270 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 5271 ret = btrfs_next_leaf(root, path); 5272 if (ret < 0) { 5273 err = ret; 5274 goto out; 5275 } 5276 if (ret > 0) 5277 goto not_found; 5278 leaf = path->nodes[0]; 5279 } 5280 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5281 if (found_key.objectid != objectid || 5282 found_key.type != BTRFS_EXTENT_DATA_KEY) 5283 goto not_found; 5284 if (start + len <= found_key.offset) 5285 goto not_found; 5286 em->start = start; 5287 em->len = found_key.offset - start; 5288 goto not_found_em; 5289 } 5290 5291 if (found_type == BTRFS_FILE_EXTENT_REG || 5292 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5293 em->start = extent_start; 5294 em->len = extent_end - extent_start; 5295 em->orig_start = extent_start - 5296 btrfs_file_extent_offset(leaf, item); 5297 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 5298 if (bytenr == 0) { 5299 em->block_start = EXTENT_MAP_HOLE; 5300 goto insert; 5301 } 5302 if (compress_type != BTRFS_COMPRESS_NONE) { 5303 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5304 em->compress_type = compress_type; 5305 em->block_start = bytenr; 5306 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5307 item); 5308 } else { 5309 bytenr += btrfs_file_extent_offset(leaf, item); 5310 em->block_start = bytenr; 5311 em->block_len = em->len; 5312 if (found_type == BTRFS_FILE_EXTENT_PREALLOC) 5313 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 5314 } 5315 goto insert; 5316 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 5317 unsigned long ptr; 5318 char *map; 5319 size_t size; 5320 size_t extent_offset; 5321 size_t copy_size; 5322 5323 em->block_start = EXTENT_MAP_INLINE; 5324 if (!page || create) { 5325 em->start = extent_start; 5326 em->len = extent_end - extent_start; 5327 goto out; 5328 } 5329 5330 size = btrfs_file_extent_inline_len(leaf, item); 5331 extent_offset = page_offset(page) + pg_offset - extent_start; 5332 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 5333 size - extent_offset); 5334 em->start = extent_start + extent_offset; 5335 em->len = (copy_size + root->sectorsize - 1) & 5336 ~((u64)root->sectorsize - 1); 5337 em->orig_start = EXTENT_MAP_INLINE; 5338 if (compress_type) { 5339 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5340 em->compress_type = compress_type; 5341 } 5342 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 5343 if (create == 0 && !PageUptodate(page)) { 5344 if (btrfs_file_extent_compression(leaf, item) != 5345 BTRFS_COMPRESS_NONE) { 5346 ret = uncompress_inline(path, inode, page, 5347 pg_offset, 5348 extent_offset, item); 5349 BUG_ON(ret); /* -ENOMEM */ 5350 } else { 5351 map = kmap(page); 5352 read_extent_buffer(leaf, map + 
pg_offset, ptr, 5353 copy_size); 5354 if (pg_offset + copy_size < PAGE_CACHE_SIZE) { 5355 memset(map + pg_offset + copy_size, 0, 5356 PAGE_CACHE_SIZE - pg_offset - 5357 copy_size); 5358 } 5359 kunmap(page); 5360 } 5361 flush_dcache_page(page); 5362 } else if (create && PageUptodate(page)) { 5363 BUG(); 5364 if (!trans) { 5365 kunmap(page); 5366 free_extent_map(em); 5367 em = NULL; 5368 5369 btrfs_release_path(path); 5370 trans = btrfs_join_transaction(root); 5371 5372 if (IS_ERR(trans)) 5373 return ERR_CAST(trans); 5374 goto again; 5375 } 5376 map = kmap(page); 5377 write_extent_buffer(leaf, map + pg_offset, ptr, 5378 copy_size); 5379 kunmap(page); 5380 btrfs_mark_buffer_dirty(leaf); 5381 } 5382 set_extent_uptodate(io_tree, em->start, 5383 extent_map_end(em) - 1, NULL, GFP_NOFS); 5384 goto insert; 5385 } else { 5386 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5387 WARN_ON(1); 5388 } 5389 not_found: 5390 em->start = start; 5391 em->len = len; 5392 not_found_em: 5393 em->block_start = EXTENT_MAP_HOLE; 5394 set_bit(EXTENT_FLAG_VACANCY, &em->flags); 5395 insert: 5396 btrfs_release_path(path); 5397 if (em->start > start || extent_map_end(em) <= start) { 5398 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " 5399 "[%llu %llu]\n", (unsigned long long)em->start, 5400 (unsigned long long)em->len, 5401 (unsigned long long)start, 5402 (unsigned long long)len); 5403 err = -EIO; 5404 goto out; 5405 } 5406 5407 err = 0; 5408 write_lock(&em_tree->lock); 5409 ret = add_extent_mapping(em_tree, em); 5410 /* it is possible that someone inserted the extent into the tree 5411 * while we had the lock dropped. It is also possible that 5412 * an overlapping map exists in the tree 5413 */ 5414 if (ret == -EEXIST) { 5415 struct extent_map *existing; 5416 5417 ret = 0; 5418 5419 existing = lookup_extent_mapping(em_tree, start, len); 5420 if (existing && (existing->start > start || 5421 existing->start + existing->len <= start)) { 5422 free_extent_map(existing); 5423 existing = NULL; 5424 } 5425 if (!existing) { 5426 existing = lookup_extent_mapping(em_tree, em->start, 5427 em->len); 5428 if (existing) { 5429 err = merge_extent_mapping(em_tree, existing, 5430 em, start, 5431 root->sectorsize); 5432 free_extent_map(existing); 5433 if (err) { 5434 free_extent_map(em); 5435 em = NULL; 5436 } 5437 } else { 5438 err = -EIO; 5439 free_extent_map(em); 5440 em = NULL; 5441 } 5442 } else { 5443 free_extent_map(em); 5444 em = existing; 5445 err = 0; 5446 } 5447 } 5448 write_unlock(&em_tree->lock); 5449 out: 5450 5451 trace_btrfs_get_extent(root, em); 5452 5453 if (path) 5454 btrfs_free_path(path); 5455 if (trans) { 5456 ret = btrfs_end_transaction(trans, root); 5457 if (!err) 5458 err = ret; 5459 } 5460 if (err) { 5461 free_extent_map(em); 5462 return ERR_PTR(err); 5463 } 5464 BUG_ON(!em); /* Error is always set */ 5465 return em; 5466 } 5467 5468 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 5469 size_t pg_offset, u64 start, u64 len, 5470 int create) 5471 { 5472 struct extent_map *em; 5473 struct extent_map *hole_em = NULL; 5474 u64 range_start = start; 5475 u64 end; 5476 u64 found; 5477 u64 found_end; 5478 int err = 0; 5479 5480 em = btrfs_get_extent(inode, page, pg_offset, start, len, create); 5481 if (IS_ERR(em)) 5482 return em; 5483 if (em) { 5484 /* 5485 * if our em maps to a hole, there might 5486 * actually be delalloc bytes behind it 5487 */ 5488 if (em->block_start != EXTENT_MAP_HOLE) 5489 return em; 5490 else 5491 hole_em = em; 5492 } 5493 5494 /* 
check to see if we've wrapped (len == -1 or similar) */ 5495 end = start + len; 5496 if (end < start) 5497 end = (u64)-1; 5498 else 5499 end -= 1; 5500 5501 em = NULL; 5502 5503 /* ok, we didn't find anything, lets look for delalloc */ 5504 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, 5505 end, len, EXTENT_DELALLOC, 1); 5506 found_end = range_start + found; 5507 if (found_end < range_start) 5508 found_end = (u64)-1; 5509 5510 /* 5511 * we didn't find anything useful, return 5512 * the original results from get_extent() 5513 */ 5514 if (range_start > end || found_end <= start) { 5515 em = hole_em; 5516 hole_em = NULL; 5517 goto out; 5518 } 5519 5520 /* adjust the range_start to make sure it doesn't 5521 * go backwards from the start they passed in 5522 */ 5523 range_start = max(start,range_start); 5524 found = found_end - range_start; 5525 5526 if (found > 0) { 5527 u64 hole_start = start; 5528 u64 hole_len = len; 5529 5530 em = alloc_extent_map(); 5531 if (!em) { 5532 err = -ENOMEM; 5533 goto out; 5534 } 5535 /* 5536 * when btrfs_get_extent can't find anything it 5537 * returns one huge hole 5538 * 5539 * make sure what it found really fits our range, and 5540 * adjust to make sure it is based on the start from 5541 * the caller 5542 */ 5543 if (hole_em) { 5544 u64 calc_end = extent_map_end(hole_em); 5545 5546 if (calc_end <= start || (hole_em->start > end)) { 5547 free_extent_map(hole_em); 5548 hole_em = NULL; 5549 } else { 5550 hole_start = max(hole_em->start, start); 5551 hole_len = calc_end - hole_start; 5552 } 5553 } 5554 em->bdev = NULL; 5555 if (hole_em && range_start > hole_start) { 5556 /* our hole starts before our delalloc, so we 5557 * have to return just the parts of the hole 5558 * that go until the delalloc starts 5559 */ 5560 em->len = min(hole_len, 5561 range_start - hole_start); 5562 em->start = hole_start; 5563 em->orig_start = hole_start; 5564 /* 5565 * don't adjust block start at all, 5566 * it is fixed at EXTENT_MAP_HOLE 5567 */ 5568 em->block_start = hole_em->block_start; 5569 em->block_len = hole_len; 5570 } else { 5571 em->start = range_start; 5572 em->len = found; 5573 em->orig_start = range_start; 5574 em->block_start = EXTENT_MAP_DELALLOC; 5575 em->block_len = found; 5576 } 5577 } else if (hole_em) { 5578 return hole_em; 5579 } 5580 out: 5581 5582 free_extent_map(hole_em); 5583 if (err) { 5584 free_extent_map(em); 5585 return ERR_PTR(err); 5586 } 5587 return em; 5588 } 5589 5590 static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5591 struct extent_map *em, 5592 u64 start, u64 len) 5593 { 5594 struct btrfs_root *root = BTRFS_I(inode)->root; 5595 struct btrfs_trans_handle *trans; 5596 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5597 struct btrfs_key ins; 5598 u64 alloc_hint; 5599 int ret; 5600 bool insert = false; 5601 5602 /* 5603 * Ok if the extent map we looked up is a hole and is for the exact 5604 * range we want, there is no reason to allocate a new one, however if 5605 * it is not right then we need to free this one and drop the cache for 5606 * our range. 
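 *
 * In short (illustrative):
 *
 *	exact hole em for [start, start + len)  -> reuse it below
 *	anything else                           -> free it, drop the
 *						   cached range and
 *						   insert a fresh map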
5607 */ 5608 if (em->block_start != EXTENT_MAP_HOLE || em->start != start || 5609 em->len != len) { 5610 free_extent_map(em); 5611 em = NULL; 5612 insert = true; 5613 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5614 } 5615 5616 trans = btrfs_join_transaction(root); 5617 if (IS_ERR(trans)) 5618 return ERR_CAST(trans); 5619 5620 if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) 5621 btrfs_add_inode_defrag(trans, inode); 5622 5623 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5624 5625 alloc_hint = get_extent_allocation_hint(inode, start, len); 5626 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, 5627 alloc_hint, &ins, 1); 5628 if (ret) { 5629 em = ERR_PTR(ret); 5630 goto out; 5631 } 5632 5633 if (!em) { 5634 em = alloc_extent_map(); 5635 if (!em) { 5636 em = ERR_PTR(-ENOMEM); 5637 goto out; 5638 } 5639 } 5640 5641 em->start = start; 5642 em->orig_start = em->start; 5643 em->len = ins.offset; 5644 5645 em->block_start = ins.objectid; 5646 em->block_len = ins.offset; 5647 em->bdev = root->fs_info->fs_devices->latest_bdev; 5648 5649 /* 5650 * We need to do this because if we're using the original em we searched 5651 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. 5652 */ 5653 em->flags = 0; 5654 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5655 5656 while (insert) { 5657 write_lock(&em_tree->lock); 5658 ret = add_extent_mapping(em_tree, em); 5659 write_unlock(&em_tree->lock); 5660 if (ret != -EEXIST) 5661 break; 5662 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); 5663 } 5664 5665 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 5666 ins.offset, ins.offset, 0); 5667 if (ret) { 5668 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 5669 em = ERR_PTR(ret); 5670 } 5671 out: 5672 btrfs_end_transaction(trans, root); 5673 return em; 5674 } 5675 5676 /* 5677 * returns 1 when the nocow is safe, < 1 on error, 0 if the 5678 * block must be cow'd 5679 */ 5680 static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, 5681 struct inode *inode, u64 offset, u64 len) 5682 { 5683 struct btrfs_path *path; 5684 int ret; 5685 struct extent_buffer *leaf; 5686 struct btrfs_root *root = BTRFS_I(inode)->root; 5687 struct btrfs_file_extent_item *fi; 5688 struct btrfs_key key; 5689 u64 disk_bytenr; 5690 u64 backref_offset; 5691 u64 extent_end; 5692 u64 num_bytes; 5693 int slot; 5694 int found_type; 5695 5696 path = btrfs_alloc_path(); 5697 if (!path) 5698 return -ENOMEM; 5699 5700 ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), 5701 offset, 0); 5702 if (ret < 0) 5703 goto out; 5704 5705 slot = path->slots[0]; 5706 if (ret == 1) { 5707 if (slot == 0) { 5708 /* can't find the item, must cow */ 5709 ret = 0; 5710 goto out; 5711 } 5712 slot--; 5713 } 5714 ret = 0; 5715 leaf = path->nodes[0]; 5716 btrfs_item_key_to_cpu(leaf, &key, slot); 5717 if (key.objectid != btrfs_ino(inode) || 5718 key.type != BTRFS_EXTENT_DATA_KEY) { 5719 /* not our file or wrong item type, must cow */ 5720 goto out; 5721 } 5722 5723 if (key.offset > offset) { 5724 /* Wrong offset, must cow */ 5725 goto out; 5726 } 5727 5728 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 5729 found_type = btrfs_file_extent_type(leaf, fi); 5730 if (found_type != BTRFS_FILE_EXTENT_REG && 5731 found_type != BTRFS_FILE_EXTENT_PREALLOC) { 5732 /* not a regular extent, must cow */ 5733 goto out; 5734 } 5735 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 5736 backref_offset = btrfs_file_extent_offset(leaf, 
fi); 5737 5738 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 5739 if (extent_end < offset + len) { 5740 /* extent doesn't include our full range, must cow */ 5741 goto out; 5742 } 5743 5744 if (btrfs_extent_readonly(root, disk_bytenr)) 5745 goto out; 5746 5747 /* 5748 * look for other files referencing this extent, if we 5749 * find any we must cow 5750 */ 5751 if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), 5752 key.offset - backref_offset, disk_bytenr)) 5753 goto out; 5754 5755 /* 5756 * adjust disk_bytenr and num_bytes to cover just the bytes 5757 * in this extent we are about to write. If there 5758 * are any csums in that range we have to cow in order 5759 * to keep the csums correct 5760 */ 5761 disk_bytenr += backref_offset; 5762 disk_bytenr += offset - key.offset; 5763 num_bytes = min(offset + len, extent_end) - offset; 5764 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 5765 goto out; 5766 /* 5767 * all of the above have passed, it is safe to overwrite this extent 5768 * without cow 5769 */ 5770 ret = 1; 5771 out: 5772 btrfs_free_path(path); 5773 return ret; 5774 } 5775 5776 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 5777 struct buffer_head *bh_result, int create) 5778 { 5779 struct extent_map *em; 5780 struct btrfs_root *root = BTRFS_I(inode)->root; 5781 u64 start = iblock << inode->i_blkbits; 5782 u64 len = bh_result->b_size; 5783 struct btrfs_trans_handle *trans; 5784 5785 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 5786 if (IS_ERR(em)) 5787 return PTR_ERR(em); 5788 5789 /* 5790 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered 5791 * io. INLINE is special, and we could probably kludge it in here, but 5792 * it's still buffered so for safety lets just fall back to the generic 5793 * buffered path. 5794 * 5795 * For COMPRESSED we _have_ to read the entire extent in so we can 5796 * decompress it, so there will be buffering required no matter what we 5797 * do, so go ahead and fallback to buffered. 5798 * 5799 * We return -ENOTBLK because thats what makes DIO go ahead and go back 5800 * to buffered IO. Don't blame me, this is the price we pay for using 5801 * the generic code. 5802 */ 5803 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 5804 em->block_start == EXTENT_MAP_INLINE) { 5805 free_extent_map(em); 5806 return -ENOTBLK; 5807 } 5808 5809 /* Just a good old fashioned hole, return */ 5810 if (!create && (em->block_start == EXTENT_MAP_HOLE || 5811 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 5812 free_extent_map(em); 5813 /* DIO will do one hole at a time, so just unlock a sector */ 5814 unlock_extent(&BTRFS_I(inode)->io_tree, start, 5815 start + root->sectorsize - 1); 5816 return 0; 5817 } 5818 5819 /* 5820 * We don't allocate a new extent in the following cases 5821 * 5822 * 1) The inode is marked as NODATACOW. In this case we'll just use the 5823 * existing extent. 5824 * 2) The extent is marked as PREALLOC. We're good to go here and can 5825 * just use the extent. 
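 *
 * In both cases can_nocow_odirect() still has to prove the extent
 * is not shared, not read-only and has no csums in the range before
 * the cow is actually skipped.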
5826 * 5827 */ 5828 if (!create) { 5829 len = em->len - (start - em->start); 5830 goto map; 5831 } 5832 5833 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 5834 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 5835 em->block_start != EXTENT_MAP_HOLE)) { 5836 int type; 5837 int ret; 5838 u64 block_start; 5839 5840 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 5841 type = BTRFS_ORDERED_PREALLOC; 5842 else 5843 type = BTRFS_ORDERED_NOCOW; 5844 len = min(len, em->len - (start - em->start)); 5845 block_start = em->block_start + (start - em->start); 5846 5847 /* 5848 * we're not going to log anything, but we do need 5849 * to make sure the current transaction stays open 5850 * while we look for nocow cross refs 5851 */ 5852 trans = btrfs_join_transaction(root); 5853 if (IS_ERR(trans)) 5854 goto must_cow; 5855 5856 if (can_nocow_odirect(trans, inode, start, len) == 1) { 5857 ret = btrfs_add_ordered_extent_dio(inode, start, 5858 block_start, len, len, type); 5859 btrfs_end_transaction(trans, root); 5860 if (ret) { 5861 free_extent_map(em); 5862 return ret; 5863 } 5864 goto unlock; 5865 } 5866 btrfs_end_transaction(trans, root); 5867 } 5868 must_cow: 5869 /* 5870 * this will cow the extent, reset the len in case we changed 5871 * it above 5872 */ 5873 len = bh_result->b_size; 5874 em = btrfs_new_extent_direct(inode, em, start, len); 5875 if (IS_ERR(em)) 5876 return PTR_ERR(em); 5877 len = min(len, em->len - (start - em->start)); 5878 unlock: 5879 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, 5880 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, 5881 0, NULL, GFP_NOFS); 5882 map: 5883 bh_result->b_blocknr = (em->block_start + (start - em->start)) >> 5884 inode->i_blkbits; 5885 bh_result->b_size = len; 5886 bh_result->b_bdev = em->bdev; 5887 set_buffer_mapped(bh_result); 5888 if (create) { 5889 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 5890 set_buffer_new(bh_result); 5891 5892 /* 5893 * Need to update the i_size under the extent lock so buffered 5894 * readers will get the updated i_size when we unlock. 
		if (start + len > i_size_read(inode))
			i_size_write(inode, start + len);
	}

	free_extent_map(em);

	return 0;
}

struct btrfs_dio_private {
	struct inode *inode;
	u64 logical_offset;
	u64 disk_bytenr;
	u64 bytes;
	u32 *csums;
	void *private;

	/* number of bios pending for this dio */
	atomic_t pending_bios;

	/* IO errors */
	int errors;

	struct bio *orig_bio;
};

static void btrfs_endio_direct_read(struct bio *bio, int err)
{
	struct btrfs_dio_private *dip = bio->bi_private;
	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct bio_vec *bvec = bio->bi_io_vec;
	struct inode *inode = dip->inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 start;
	u32 *private = dip->csums;

	start = dip->logical_offset;
	do {
		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
			struct page *page = bvec->bv_page;
			char *kaddr;
			u32 csum = ~(u32)0;
			unsigned long flags;

			local_irq_save(flags);
			kaddr = kmap_atomic(page);
			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
					       csum, bvec->bv_len);
			btrfs_csum_final(csum, (char *)&csum);
			kunmap_atomic(kaddr);
			local_irq_restore(flags);

			flush_dcache_page(bvec->bv_page);
			if (csum != *private) {
				printk(KERN_ERR "btrfs csum failed ino %llu off"
				       " %llu csum %u private %u\n",
				       (unsigned long long)btrfs_ino(inode),
				       (unsigned long long)start,
				       csum, *private);
				err = -EIO;
			}
		}

		start += bvec->bv_len;
		private++;
		bvec++;
	} while (bvec <= bvec_end);

	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
		      dip->logical_offset + dip->bytes - 1);
	bio->bi_private = dip->private;

	kfree(dip->csums);
	kfree(dip);

	/* If we had a csum failure make sure to clear the uptodate flag */
	if (err)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	dio_end_io(bio, err);
}

static void btrfs_endio_direct_write(struct bio *bio, int err)
{
	struct btrfs_dio_private *dip = bio->bi_private;
	struct inode *inode = dip->inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ordered_extent *ordered = NULL;
	u64 ordered_offset = dip->logical_offset;
	u64 ordered_bytes = dip->bytes;
	int ret;

	if (err)
		goto out_done;
again:
	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
						   &ordered_offset,
						   ordered_bytes, !err);
	if (!ret)
		goto out_test;

	ordered->work.func = finish_ordered_fn;
	ordered->work.flags = 0;
	btrfs_queue_worker(&root->fs_info->endio_write_workers,
			   &ordered->work);
out_test:
	/*
	 * our bio might span multiple ordered extents.  If we haven't
	 * completed the accounting for the whole dio, go back and try again
	 */
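	/*
	 * Example (illustrative): a 256K dio backed by two 128K ordered
	 * extents.  The first pass completes the accounting for the first
	 * 128K and advances ordered_offset by 128K, which is still short
	 * of logical_offset + 256K, so we jump back and finish the second
	 * ordered extent as well.
	 */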
	if (ordered_offset < dip->logical_offset + dip->bytes) {
		ordered_bytes = dip->logical_offset + dip->bytes -
			ordered_offset;
		ordered = NULL;
		goto again;
	}
out_done:
	bio->bi_private = dip->private;

	kfree(dip);

	/* If we had an error make sure to clear the uptodate flag */
	if (err)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	dio_end_io(bio, err);
}

static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
				struct bio *bio, int mirror_num,
				unsigned long bio_flags, u64 offset)
{
	int ret;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
	BUG_ON(ret); /* -ENOMEM */
	return 0;
}

static void btrfs_end_dio_bio(struct bio *bio, int err)
{
	struct btrfs_dio_private *dip = bio->bi_private;

	if (err) {
		printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
		       "sector %#Lx len %u errno %d\n",
		       (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw,
		       (unsigned long long)bio->bi_sector, bio->bi_size, err);
		dip->errors = 1;

		/*
		 * before the atomic variable goes to zero, we must make sure
		 * dip->errors is perceived to be set.
		 */
		smp_mb__before_atomic_dec();
	}

	/* if there are more bios still pending for this dio, just exit */
	if (!atomic_dec_and_test(&dip->pending_bios))
		goto out;

	if (dip->errors)
		bio_io_error(dip->orig_bio);
	else {
		set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
		bio_endio(dip->orig_bio, 0);
	}
out:
	bio_put(bio);
}

static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
				       u64 first_sector, gfp_t gfp_flags)
{
	int nr_vecs = bio_get_nr_vecs(bdev);
	return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
}

static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
					 int rw, u64 file_offset, int skip_sum,
					 u32 *csums, int async_submit)
{
	int write = rw & REQ_WRITE;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;

	bio_get(bio);

	if (!write) {
		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
		if (ret)
			goto err;
	}

	if (skip_sum)
		goto map;

	if (write && async_submit) {
		ret = btrfs_wq_submit_bio(root->fs_info,
				inode, rw, bio, 0, 0,
				file_offset,
				__btrfs_submit_bio_start_direct_io,
				__btrfs_submit_bio_done);
		goto err;
	} else if (write) {
		/*
		 * If we aren't doing async submit, calculate the csum of the
		 * bio now.
		 */
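		/*
		 * Summary of the cases handled here (illustrative): async
		 * writes hand the bio to a worker which csums it via
		 * __btrfs_submit_bio_start_direct_io; synchronous writes
		 * csum inline below; reads (the !skip_sum branch further
		 * down) look up the expected csums so the read endio can
		 * verify them; skip_sum jumps straight to the mapping.
		 */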
		ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
		if (ret)
			goto err;
	} else if (!skip_sum) {
		ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
						file_offset, csums);
		if (ret)
			goto err;
	}

map:
	ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
err:
	bio_put(bio);
	return ret;
}

static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
				    int skip_sum)
{
	struct inode *inode = dip->inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
	struct bio *bio;
	struct bio *orig_bio = dip->orig_bio;
	struct bio_vec *bvec = orig_bio->bi_io_vec;
	u64 start_sector = orig_bio->bi_sector;
	u64 file_offset = dip->logical_offset;
	u64 submit_len = 0;
	u64 map_length;
	int nr_pages = 0;
	u32 *csums = dip->csums;
	int ret = 0;
	int async_submit = 0;
	int write = rw & REQ_WRITE;

	map_length = orig_bio->bi_size;
	ret = btrfs_map_block(map_tree, READ, start_sector << 9,
			      &map_length, NULL, 0);
	if (ret) {
		bio_put(orig_bio);
		return -EIO;
	}

	if (map_length >= orig_bio->bi_size) {
		bio = orig_bio;
		goto submit;
	}

	async_submit = 1;
	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
	if (!bio)
		return -ENOMEM;
	bio->bi_private = dip;
	bio->bi_end_io = btrfs_end_dio_bio;
	atomic_inc(&dip->pending_bios);

	while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
		if (unlikely(map_length < submit_len + bvec->bv_len ||
		    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
				 bvec->bv_offset) < bvec->bv_len)) {
			/*
			 * inc the count before we submit the bio so
			 * we know the end IO handler won't happen before
			 * we inc the count.  Otherwise, the dip might get
			 * freed before we're done setting it up
			 */
			atomic_inc(&dip->pending_bios);
			ret = __btrfs_submit_dio_bio(bio, inode, rw,
						     file_offset, skip_sum,
						     csums, async_submit);
			if (ret) {
				bio_put(bio);
				atomic_dec(&dip->pending_bios);
				goto out_err;
			}

			/* Writes use the ordered csums */
			if (!write && !skip_sum)
				csums = csums + nr_pages;
			start_sector += submit_len >> 9;
			file_offset += submit_len;

			submit_len = 0;
			nr_pages = 0;

			bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
						  start_sector, GFP_NOFS);
			if (!bio)
				goto out_err;
			bio->bi_private = dip;
			bio->bi_end_io = btrfs_end_dio_bio;

			map_length = orig_bio->bi_size;
			ret = btrfs_map_block(map_tree, READ, start_sector << 9,
					      &map_length, NULL, 0);
			if (ret) {
				bio_put(bio);
				goto out_err;
			}
		} else {
			submit_len += bvec->bv_len;
			nr_pages++;
			bvec++;
		}
	}

submit:
	ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
				     csums, async_submit);
	if (!ret)
		return 0;

	bio_put(bio);
out_err:
	dip->errors = 1;
	/*
	 * before the atomic variable goes to zero, we must
	 * make sure dip->errors is perceived to be set.
	 */
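	/*
	 * The store to dip->errors must be visible before this CPU's
	 * decrement of pending_bios; otherwise the context performing the
	 * final decrement could complete orig_bio without seeing the error.
	 */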
	smp_mb__before_atomic_dec();
	if (atomic_dec_and_test(&dip->pending_bios))
		bio_io_error(dip->orig_bio);

	/* bio_end_io() will handle error, so we needn't return it */
	return 0;
}

static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
				loff_t file_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_dio_private *dip;
	struct bio_vec *bvec = bio->bi_io_vec;
	int skip_sum;
	int write = rw & REQ_WRITE;
	int ret = 0;

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	dip = kmalloc(sizeof(*dip), GFP_NOFS);
	if (!dip) {
		ret = -ENOMEM;
		goto free_ordered;
	}
	dip->csums = NULL;

	/* Writes use the ordered csum stuff, so we don't need dip->csums */
	if (!write && !skip_sum) {
		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
		if (!dip->csums) {
			kfree(dip);
			ret = -ENOMEM;
			goto free_ordered;
		}
	}

	dip->private = bio->bi_private;
	dip->inode = inode;
	dip->logical_offset = file_offset;

	dip->bytes = 0;
	do {
		dip->bytes += bvec->bv_len;
		bvec++;
	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));

	dip->disk_bytenr = (u64)bio->bi_sector << 9;
	bio->bi_private = dip;
	dip->errors = 0;
	dip->orig_bio = bio;
	atomic_set(&dip->pending_bios, 0);

	if (write)
		bio->bi_end_io = btrfs_endio_direct_write;
	else
		bio->bi_end_io = btrfs_endio_direct_read;

	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
	if (!ret)
		return;
free_ordered:
	/*
	 * If this is a write, we need to clean up the reserved space and kill
	 * the ordered extent.
	 */
	if (write) {
		struct btrfs_ordered_extent *ordered;
		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
			btrfs_free_reserved_extent(root, ordered->start,
						   ordered->disk_len);
		btrfs_put_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
	}
	bio_endio(bio, ret);
}

static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
			const struct iovec *iov, loff_t offset,
			unsigned long nr_segs)
{
	int seg;
	int i;
	size_t size;
	unsigned long addr;
	unsigned blocksize_mask = root->sectorsize - 1;
	ssize_t retval = -EINVAL;
	loff_t end = offset;

	if (offset & blocksize_mask)
		goto out;

	/* Check the memory alignment.  Blocks cannot straddle pages */
	for (seg = 0; seg < nr_segs; seg++) {
		addr = (unsigned long)iov[seg].iov_base;
		size = iov[seg].iov_len;
		end += size;
		if ((addr & blocksize_mask) || (size & blocksize_mask))
			goto out;

		/* If this is a write we don't need to check anymore */
		if (rw & WRITE)
			continue;

		/*
		 * Check to make sure we don't have duplicate iov_base's in
		 * this iovec, if so return EINVAL, otherwise we'll get csum
		 * errors when reading back.
		 */
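		/*
		 * Example (illustrative): with sectorsize == 4096 the mask
		 * is 0xfff, so offset == 8192 and iov_len == 16384 pass,
		 * while offset == 8195 or a user buffer at address 0x1003
		 * fails and makes the caller fall back to buffered IO.
		 */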
		for (i = seg + 1; i < nr_segs; i++) {
			if (iov[seg].iov_base == iov[i].iov_base)
				goto out;
		}
	}
	retval = 0;
out:
	return retval;
}

static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
			const struct iovec *iov, loff_t offset,
			unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	u64 lockstart, lockend;
	ssize_t ret;
	int writing = rw & WRITE;
	int write_bits = 0;
	size_t count = iov_length(iov, nr_segs);

	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
			    offset, nr_segs)) {
		return 0;
	}

	lockstart = offset;
	lockend = offset + count - 1;

	if (writing) {
		ret = btrfs_delalloc_reserve_space(inode, count);
		if (ret)
			goto out;
	}

	while (1) {
		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				 0, &cached_state);
		/*
		 * We're concerned with the entire range that we're going to
		 * be doing DIO to, so we need to make sure there are no
		 * ordered extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(inode, lockstart,
						     lockend - lockstart + 1);

		/*
		 * We need to make sure there are no buffered pages in this
		 * range either, we could have raced between the invalidate in
		 * generic_file_direct_write and locking the extent.  The
		 * invalidate needs to happen so that reads after a write do
		 * not get stale data.
		 */
		if (!ordered && (!writing ||
		    !test_range_bit(&BTRFS_I(inode)->io_tree,
				    lockstart, lockend, EXTENT_UPTODATE, 0,
				    cached_state)))
			break;

		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				     &cached_state, GFP_NOFS);

		if (ordered) {
			btrfs_start_ordered_extent(inode, ordered, 1);
			btrfs_put_ordered_extent(ordered);
		} else {
			/* Screw you mmap */
			ret = filemap_write_and_wait_range(file->f_mapping,
							   lockstart,
							   lockend);
			if (ret)
				goto out;

			/*
			 * If we found a page that couldn't be invalidated
			 * just fall back to buffered.
			 */
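			/*
			 * Example (illustrative): lockstart == 0 and
			 * lockend == 1M - 1 with 4K pages invalidate page
			 * indexes 0 through 255 below; -EBUSY means some
			 * page is still pinned, so we return 0 and let the
			 * request be serviced through the page cache.
			 */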
			ret = invalidate_inode_pages2_range(file->f_mapping,
					lockstart >> PAGE_CACHE_SHIFT,
					lockend >> PAGE_CACHE_SHIFT);
			if (ret) {
				if (ret == -EBUSY)
					ret = 0;
				goto out;
			}
		}

		cond_resched();
	}

	/*
	 * we don't use btrfs_set_extent_delalloc because we don't want
	 * the dirty or uptodate bits
	 */
	if (writing) {
		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				     EXTENT_DELALLOC, NULL, &cached_state,
				     GFP_NOFS);
		if (ret) {
			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
					 lockend, EXTENT_LOCKED | write_bits,
					 1, 0, &cached_state, GFP_NOFS);
			goto out;
		}
	}

	free_extent_state(cached_state);
	cached_state = NULL;

	ret = __blockdev_direct_IO(rw, iocb, inode,
			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
			iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
			btrfs_submit_direct, 0);

	if (ret < 0 && ret != -EIOCBQUEUED) {
		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
				 offset + iov_length(iov, nr_segs) - 1,
				 EXTENT_LOCKED | write_bits, 1, 0,
				 &cached_state, GFP_NOFS);
	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
		/*
		 * We're falling back to buffered, unlock the section we didn't
		 * do IO on.
		 */
		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
				 offset + iov_length(iov, nr_segs) - 1,
				 EXTENT_LOCKED | write_bits, 1, 0,
				 &cached_state, GFP_NOFS);
	}
out:
	free_extent_state(cached_state);
	return ret;
}

static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		__u64 start, __u64 len)
{
	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
}

int btrfs_readpage(struct file *file, struct page *page)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(page->mapping->host)->io_tree;
	return extent_read_full_page(tree, page, btrfs_get_extent, 0);
}

static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct extent_io_tree *tree;

	if (current->flags & PF_MEMALLOC) {
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return 0;
	}
	tree = &BTRFS_I(page->mapping->host)->io_tree;
	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
}

int btrfs_writepages(struct address_space *mapping,
		     struct writeback_control *wbc)
{
	struct extent_io_tree *tree;

	tree = &BTRFS_I(mapping->host)->io_tree;
	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
}

static int
btrfs_readpages(struct file *file, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(mapping->host)->io_tree;
	return extent_readpages(tree, mapping, pages, nr_pages,
				btrfs_get_extent);
}
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
	struct extent_io_tree *tree;
	struct extent_map_tree *map;
	int ret;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	map = &BTRFS_I(page->mapping->host)->extent_tree;
	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
	if (ret == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		page_cache_release(page);
	}
	return ret;
}

static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
	if (PageWriteback(page) || PageDirty(page))
		return 0;
	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
}

static void btrfs_invalidatepage(struct page *page, unsigned long offset)
{
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	u64 page_start = page_offset(page);
	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;

	/*
	 * we have the page locked, so new writeback can't start,
	 * and the dirty bit won't be cleared while we are here.
	 *
	 * Wait for IO on this page so that we can safely clear
	 * the PagePrivate2 bit and do ordered accounting
	 */
	wait_on_page_writeback(page);

	tree = &BTRFS_I(inode)->io_tree;
	if (offset) {
		btrfs_releasepage(page, GFP_NOFS);
		return;
	}
	lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
	ordered = btrfs_lookup_ordered_extent(inode,
					      page_offset(page));
	if (ordered) {
		/*
		 * IO on this page will never be started, so we need
		 * to account for any ordered extents now
		 */
		clear_extent_bit(tree, page_start, page_end,
				 EXTENT_DIRTY | EXTENT_DELALLOC |
				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
				 &cached_state, GFP_NOFS);
		/*
		 * whoever cleared the private bit is responsible
		 * for the finish_ordered_io
		 */
		if (TestClearPagePrivate2(page) &&
		    btrfs_dec_test_ordered_pending(inode, &ordered, page_start,
						   PAGE_CACHE_SIZE, 1)) {
			btrfs_finish_ordered_io(ordered);
		}
		btrfs_put_ordered_extent(ordered);
		cached_state = NULL;
		lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
	}
	clear_extent_bit(tree, page_start, page_end,
			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
			 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
	__btrfs_releasepage(page, GFP_NOFS);

	ClearPageChecked(page);
	if (PagePrivate(page)) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		page_cache_release(page);
	}
}
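/*
 * Note on the offset argument above (illustrative): a partial invalidate,
 * e.g. a truncate to the middle of a page, passes a non-zero offset, and
 * btrfs only attempts a releasepage in that case; the full ordered extent
 * accounting only runs when the whole page (offset == 0) is thrown away.
 */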
/*
 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied.  Hence we
 * must be careful to check for EOF conditions here.  We set the page up
 * correctly for a written page which means we get ENOSPC checking when
 * writing into holes and correct delalloc and unwritten extent mapping on
 * filesystems that support these features.
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.
 * Because vmtruncate() writes the inode size before removing pages, once we
 * have the page lock we can determine safely if the page is beyond EOF.  If
 * it is not beyond EOF, then the page is guaranteed safe against truncation
 * until we unlock the page.
 */
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = fdentry(vma->vm_file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	char *kaddr;
	unsigned long zero_start;
	loff_t size;
	int ret;
	int reserved = 0;
	u64 page_start;
	u64 page_end;

	sb_start_pagefault(inode->i_sb);
	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
	if (!ret) {
		ret = file_update_time(vma->vm_file);
		reserved = 1;
	}
	if (ret) {
		if (ret == -ENOMEM)
			ret = VM_FAULT_OOM;
		else /* -ENOSPC, -EIO, etc */
			ret = VM_FAULT_SIGBUS;
		if (reserved)
			goto out;
		goto out_noreserve;
	}

	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
	lock_page(page);
	size = i_size_read(inode);
	page_start = page_offset(page);
	page_end = page_start + PAGE_CACHE_SIZE - 1;

	if ((page->mapping != inode->i_mapping) ||
	    (page_start >= size)) {
		/* page got truncated out from underneath us */
		goto out_unlock;
	}
	wait_on_page_writeback(page);

	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
	set_page_extent_mapped(page);

	/*
	 * we can't set the delalloc bits if there are pending ordered
	 * extents.  Drop our locks and wait for them to finish
	 */
	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered) {
		unlock_extent_cached(io_tree, page_start, page_end,
				     &cached_state, GFP_NOFS);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	/*
	 * XXX - page_mkwrite gets called every time the page is dirtied, even
	 * if it was already dirty, so for space accounting reasons we need to
	 * clear any delalloc bits for the range we are fixing to save.  There
	 * is probably a better way to do this, but for now keep consistent
	 * with prepare_pages in the normal write path.
	 */
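	/*
	 * Example (illustrative): the page is dirtied twice without being
	 * written back in between.  The second fault would reserve space
	 * for the same range again; clearing the delalloc bits below first
	 * keeps the accounting at exactly one reservation for this page.
	 */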
	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
			 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
			 0, 0, &cached_state, GFP_NOFS);

	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
					&cached_state);
	if (ret) {
		unlock_extent_cached(io_tree, page_start, page_end,
				     &cached_state, GFP_NOFS);
		ret = VM_FAULT_SIGBUS;
		goto out_unlock;
	}
	ret = 0;

	/* page is wholly or partially inside EOF */
	if (page_start + PAGE_CACHE_SIZE > size)
		zero_start = size & ~PAGE_CACHE_MASK;
	else
		zero_start = PAGE_CACHE_SIZE;

	if (zero_start != PAGE_CACHE_SIZE) {
		kaddr = kmap(page);
		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
		flush_dcache_page(page);
		kunmap(page);
	}
	ClearPageChecked(page);
	set_page_dirty(page);
	SetPageUptodate(page);

	BTRFS_I(inode)->last_trans = root->fs_info->generation;
	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;

	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);

out_unlock:
	if (!ret) {
		sb_end_pagefault(inode->i_sb);
		return VM_FAULT_LOCKED;
	}
	unlock_page(page);
out:
	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
out_noreserve:
	sb_end_pagefault(inode->i_sb);
	return ret;
}

static int btrfs_truncate(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *rsv;
	int ret;
	int err = 0;
	struct btrfs_trans_handle *trans;
	unsigned long nr;
	u64 mask = root->sectorsize - 1;
	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);

	ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
	if (ret)
		return ret;

	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);

	/*
	 * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
	 * 3 things going on here
	 *
	 * 1) We need to reserve space for our orphan item and the space to
	 * delete our orphan item.  Lord knows we don't want to have a
	 * dangling orphan item because we didn't reserve space to remove it.
	 *
	 * 2) We need to reserve space to update our inode.
	 *
	 * 3) We need to have something to cache all the space that is going
	 * to be free'd up by the truncate operation, but also have some slack
	 * space reserved in case it uses space during the truncate (thank you
	 * very much snapshotting).
	 *
	 * And we need these to all be separate.  The fact is we can use a lot
	 * of space doing the truncate, and we have no earthly idea how much
	 * space we will use, so we need the truncate reservation to be
	 * separate so it doesn't end up using space reserved for updating the
	 * inode or removing the orphan item.  We also need to be able to stop
	 * the transaction and start a new one, which means we need to be able
	 * to update the inode several times, and we have no way of knowing
	 * how many times that will be, so we can't just reserve 1 item for
	 * the entirety of the operation, so that has to be done separately as
	 * well.  Then there is the orphan item, which does indeed need to be
	 * held on to for the whole operation, and we need nobody to touch
	 * this reserved space except the orphan code.
	 *
	 * So that leaves us with
	 *
	 * 1) root->orphan_block_rsv - for the orphan deletion.
	 * 2) rsv - for the truncate reservation, which we will steal from the
	 * transaction reservation.
	 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left
	 * for updating the inode.
	 */
	rsv = btrfs_alloc_block_rsv(root);
	if (!rsv)
		return -ENOMEM;
	rsv->size = min_size;

	/*
	 * 1 for the truncate slack space
	 * 1 for the orphan item we're going to add
	 * 1 for the orphan item deletion
	 * 1 for updating the inode.
	 */
	trans = btrfs_start_transaction(root, 4);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out;
	}

	/* Migrate the slack space for the truncate to our reserve */
	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
				      min_size);
	BUG_ON(ret);

	ret = btrfs_orphan_add(trans, inode);
	if (ret) {
		btrfs_end_transaction(trans, root);
		goto out;
	}

	/*
	 * setattr is responsible for setting the ordered_data_close flag,
	 * but that is only tested during the last file release.  That
	 * could happen well after the next commit, leaving a great big
	 * window where new writes may get lost if someone chooses to write
	 * to this file after truncating to zero
	 *
	 * The inode doesn't have any dirty data here, and so if we commit
	 * this is a noop.  If someone immediately starts writing to the inode
	 * it is very likely we'll catch some of their writes in this
	 * transaction, and the commit will find this file on the ordered
	 * data list with good things to send down.
	 *
	 * This is a best effort solution, there is still a window where
	 * using truncate to replace the contents of the file will
	 * end up with a zero length file after a crash.
	 */
	if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
					   &BTRFS_I(inode)->runtime_flags))
		btrfs_add_ordered_operation(trans, root, inode);

	while (1) {
		ret = btrfs_block_rsv_refill(root, rsv, min_size);
		if (ret) {
			/*
			 * This can only happen with the original transaction
			 * we started above; every other time we shouldn't
			 * have a transaction started yet.
			 */
			if (ret == -EAGAIN)
				goto end_trans;
			err = ret;
			break;
		}

		if (!trans) {
			/* Just need the 1 for updating the inode */
			trans = btrfs_start_transaction(root, 1);
			if (IS_ERR(trans)) {
				ret = err = PTR_ERR(trans);
				trans = NULL;
				break;
			}
		}

		trans->block_rsv = rsv;

		ret = btrfs_truncate_inode_items(trans, root, inode,
						 inode->i_size,
						 BTRFS_EXTENT_DATA_KEY);
		if (ret != -EAGAIN) {
			err = ret;
			break;
		}

		trans->block_rsv = &root->fs_info->trans_block_rsv;
		ret = btrfs_update_inode(trans, root, inode);
		if (ret) {
			err = ret;
			break;
		}
end_trans:
		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		trans = NULL;
		btrfs_btree_balance_dirty(root, nr);
	}

	if (ret == 0 && inode->i_nlink > 0) {
		trans->block_rsv = root->orphan_block_rsv;
		ret = btrfs_orphan_del(trans, inode);
		if (ret)
			err = ret;
	} else if (ret && inode->i_nlink > 0) {
		/*
		 * Failed to do the truncate, remove us from the in memory
		 * orphan list.
		 */
		ret = btrfs_orphan_del(NULL, inode);
	}

	if (trans) {
		trans->block_rsv = &root->fs_info->trans_block_rsv;
		ret = btrfs_update_inode(trans, root, inode);
		if (ret && !err)
			err = ret;

		nr = trans->blocks_used;
		ret = btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(root, nr);
	}

out:
	btrfs_free_block_rsv(root, rsv);

	if (ret && !err)
		err = ret;

	return err;
}

/*
 * create a new subvolume directory/inode (helper for the ioctl).
 */
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
			     struct btrfs_root *new_root, u64 new_dirid)
{
	struct inode *inode;
	int err;
	u64 index = 0;

	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
				new_dirid, new_dirid,
				S_IFDIR | (~current_umask() & S_IRWXUGO),
				&index);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	inode->i_op = &btrfs_dir_inode_operations;
	inode->i_fop = &btrfs_dir_file_operations;

	set_nlink(inode, 1);
	btrfs_i_size_write(inode, 0);

	err = btrfs_update_inode(trans, new_root, inode);

	iput(inode);
	return err;
}

struct inode *btrfs_alloc_inode(struct super_block *sb)
{
	struct btrfs_inode *ei;
	struct inode *inode;

	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
	if (!ei)
		return NULL;

	ei->root = NULL;
	ei->generation = 0;
	ei->last_trans = 0;
	ei->last_sub_trans = 0;
	ei->logged_trans = 0;
	ei->delalloc_bytes = 0;
	ei->disk_i_size = 0;
	ei->flags = 0;
	ei->csum_bytes = 0;
	ei->index_cnt = (u64)-1;
	ei->last_unlink_trans = 0;

	spin_lock_init(&ei->lock);
	ei->outstanding_extents = 0;
	ei->reserved_extents = 0;

	ei->runtime_flags = 0;
	ei->force_compress = BTRFS_COMPRESS_NONE;

	ei->delayed_node = NULL;

	inode = &ei->vfs_inode;
	extent_map_tree_init(&ei->extent_tree);
	extent_io_tree_init(&ei->io_tree, &inode->i_data);
	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
	ei->io_tree.track_uptodate = 1;
	ei->io_failure_tree.track_uptodate = 1;
	mutex_init(&ei->log_mutex);
	mutex_init(&ei->delalloc_mutex);
	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
	INIT_LIST_HEAD(&ei->delalloc_inodes);
	INIT_LIST_HEAD(&ei->ordered_operations);
	RB_CLEAR_NODE(&ei->rb_node);

	return inode;
}

static void btrfs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}

void btrfs_destroy_inode(struct inode *inode)
{
	struct btrfs_ordered_extent *ordered;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	WARN_ON(!hlist_empty(&inode->i_dentry));
	WARN_ON(inode->i_data.nrpages);
	WARN_ON(BTRFS_I(inode)->outstanding_extents);
	WARN_ON(BTRFS_I(inode)->reserved_extents);
	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
	WARN_ON(BTRFS_I(inode)->csum_bytes);

	/*
	 * This can happen where we create an inode, but somebody else also
	 * created the same inode and we need to destroy the one we already
	 * created.
	 */
	if (!root)
		goto free;

	/*
	 * Make sure we're properly removed from the ordered operation
	 * lists.
	 */
	smp_mb();
	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
		spin_lock(&root->fs_info->ordered_extent_lock);
		list_del_init(&BTRFS_I(inode)->ordered_operations);
		spin_unlock(&root->fs_info->ordered_extent_lock);
	}

	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
		     &BTRFS_I(inode)->runtime_flags)) {
		printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
		       (unsigned long long)btrfs_ino(inode));
		atomic_dec(&root->orphan_inodes);
	}

	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
		if (!ordered)
			break;
		else {
			printk(KERN_ERR "btrfs found ordered "
			       "extent %llu %llu on inode cleanup\n",
			       (unsigned long long)ordered->file_offset,
			       (unsigned long long)ordered->len);
			btrfs_remove_ordered_extent(inode, ordered);
			btrfs_put_ordered_extent(ordered);
			btrfs_put_ordered_extent(ordered);
		}
	}
	inode_tree_del(inode);
	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
free:
	btrfs_remove_delayed_node(inode);
	call_rcu(&inode->i_rcu, btrfs_i_callback);
}

int btrfs_drop_inode(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (btrfs_root_refs(&root->root_item) == 0 &&
	    !btrfs_is_free_space_inode(inode))
		return 1;
	else
		return generic_drop_inode(inode);
}

static void init_once(void *foo)
{
	struct btrfs_inode *ei = (struct btrfs_inode *) foo;

	inode_init_once(&ei->vfs_inode);
}

void btrfs_destroy_cachep(void)
{
	if (btrfs_inode_cachep)
		kmem_cache_destroy(btrfs_inode_cachep);
	if (btrfs_trans_handle_cachep)
		kmem_cache_destroy(btrfs_trans_handle_cachep);
	if (btrfs_transaction_cachep)
		kmem_cache_destroy(btrfs_transaction_cachep);
	if (btrfs_path_cachep)
		kmem_cache_destroy(btrfs_path_cachep);
	if (btrfs_free_space_cachep)
		kmem_cache_destroy(btrfs_free_space_cachep);
}

int btrfs_init_cachep(void)
{
	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
			sizeof(struct btrfs_inode), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
	if (!btrfs_inode_cachep)
		goto fail;

	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
			sizeof(struct btrfs_trans_handle), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_trans_handle_cachep)
		goto fail;

	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
			sizeof(struct btrfs_transaction), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_transaction_cachep)
		goto fail;

	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
			sizeof(struct btrfs_path), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_path_cachep)
		goto fail;

	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
			sizeof(struct btrfs_free_space), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_cachep)
		goto fail;

	return 0;
fail:
	btrfs_destroy_cachep();
	return -ENOMEM;
}
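/*
 * Worked example (illustrative numbers) for the stat fields filled in
 * below: with a 4K blocksize, inode_get_bytes() == 10000 and 4096 bytes
 * of outstanding delalloc, ALIGN() rounds the two values to 12288 and
 * 4096, so stat->blocks == (12288 + 4096) >> 9 == 32 512-byte blocks.
 */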
static int btrfs_getattr(struct vfsmount *mnt,
			 struct dentry *dentry, struct kstat *stat)
{
	struct inode *inode = dentry->d_inode;
	u32 blocksize = inode->i_sb->s_blocksize;

	generic_fillattr(inode, stat);
	stat->dev = BTRFS_I(inode)->root->anon_dev;
	stat->blksize = PAGE_CACHE_SIZE;
	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
		ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
	return 0;
}

/*
 * If a file is moved, it will inherit the cow and compression flags of the
 * new directory.
 */
static void fixup_inode_flags(struct inode *dir, struct inode *inode)
{
	struct btrfs_inode *b_dir = BTRFS_I(dir);
	struct btrfs_inode *b_inode = BTRFS_I(inode);

	if (b_dir->flags & BTRFS_INODE_NODATACOW)
		b_inode->flags |= BTRFS_INODE_NODATACOW;
	else
		b_inode->flags &= ~BTRFS_INODE_NODATACOW;

	if (b_dir->flags & BTRFS_INODE_COMPRESS) {
		b_inode->flags |= BTRFS_INODE_COMPRESS;
		b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
	} else {
		b_inode->flags &= ~(BTRFS_INODE_COMPRESS |
				    BTRFS_INODE_NOCOMPRESS);
	}
}

static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
			struct inode *new_dir, struct dentry *new_dentry)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = new_dentry->d_inode;
	struct inode *old_inode = old_dentry->d_inode;
	struct timespec ctime = CURRENT_TIME;
	u64 index = 0;
	u64 root_objectid;
	int ret;
	u64 old_ino = btrfs_ino(old_inode);

	if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
		return -EPERM;

	/* we only allow rename subvolume link between subvolumes */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
		return -EXDEV;

	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
	    (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
		return -ENOTEMPTY;

	if (S_ISDIR(old_inode->i_mode) && new_inode &&
	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
		return -ENOTEMPTY;
	/*
	 * we're using rename to replace one file with another, and the
	 * replacement file is large.  Start IO on it now so we don't add
	 * too much work to the end of the transaction
	 */
	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
		filemap_flush(old_inode->i_mapping);

	/* close the racy window with snapshot create/destroy ioctl */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		down_read(&root->fs_info->subvol_sem);
	/*
	 * We want to reserve the absolute worst case amount of items.  So if
	 * both inodes are subvols and we need to unlink them then that would
	 * require 4 item modifications, but if they are both normal inodes it
	 * would require 5 item modifications, so we'll assume they're normal
	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
	 * should cover the worst case number of items we'll modify.
	 */
	trans = btrfs_start_transaction(root, 20);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root)
		btrfs_record_root_in_trans(trans, dest);

	ret = btrfs_set_inode_index(new_dir, &index);
	if (ret)
		goto out_fail;

	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		/* force full log commit if subvolume involved. */
		root->fs_info->last_trans_log_full_commit = trans->transid;
	} else {
		ret = btrfs_insert_inode_ref(trans, dest,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len,
					     old_ino,
					     btrfs_ino(new_dir), index);
		if (ret)
			goto out_fail;
		/*
		 * this is an ugly little race, but the rename is required
		 * to make sure that if we crash, the inode is either at the
		 * old name or the new one.  pinning the log transaction lets
		 * us make sure we don't allow a log commit to come in after
		 * we unlink the name but before we add the new name back in.
		 */
		btrfs_pin_log_trans(root);
	}
	/*
	 * make sure the inode gets flushed if it is replacing
	 * something.
	 */
	if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
		btrfs_add_ordered_operation(trans, root, old_inode);

	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
	old_dir->i_ctime = old_dir->i_mtime = ctime;
	new_dir->i_ctime = new_dir->i_mtime = ctime;
	old_inode->i_ctime = ctime;

	if (old_dentry->d_parent != new_dentry->d_parent)
		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);

	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
					  old_dentry->d_name.name,
					  old_dentry->d_name.len);
	} else {
		ret = __btrfs_unlink_inode(trans, root, old_dir,
					   old_dentry->d_inode,
					   old_dentry->d_name.name,
					   old_dentry->d_name.len);
		if (!ret)
			ret = btrfs_update_inode(trans, root, old_inode);
	}
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out_fail;
	}

	if (new_inode) {
		inode_inc_iversion(new_inode);
		new_inode->i_ctime = CURRENT_TIME;
		if (unlikely(btrfs_ino(new_inode) ==
			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
			root_objectid = BTRFS_I(new_inode)->location.objectid;
			ret = btrfs_unlink_subvol(trans, dest, new_dir,
						  root_objectid,
						  new_dentry->d_name.name,
						  new_dentry->d_name.len);
			BUG_ON(new_inode->i_nlink == 0);
		} else {
			ret = btrfs_unlink_inode(trans, dest, new_dir,
						 new_dentry->d_inode,
						 new_dentry->d_name.name,
						 new_dentry->d_name.len);
		}
		if (!ret && new_inode->i_nlink == 0) {
			ret = btrfs_orphan_add(trans, new_dentry->d_inode);
			BUG_ON(ret);
		}
		if (ret) {
			btrfs_abort_transaction(trans, root, ret);
			goto out_fail;
		}
	}

	fixup_inode_flags(new_dir, old_inode);

	ret = btrfs_add_link(trans, new_dir, old_inode,
			     new_dentry->d_name.name,
			     new_dentry->d_name.len, 0, index);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out_fail;
	}

	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
		struct dentry *parent = new_dentry->d_parent;
		btrfs_log_new_name(trans, old_inode, old_dir, parent);
		btrfs_end_log_trans(root);
	}
out_fail:
	btrfs_end_transaction(trans, root);
out_notrans:
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&root->fs_info->subvol_sem);

	return ret;
}

/*
 * some fairly slow code that needs optimization.  This walks the list
 * of all the inodes with pending delalloc and forces them to disk.
 */
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
	struct list_head *head = &root->fs_info->delalloc_inodes;
	struct btrfs_inode *binode;
	struct inode *inode;

	if (root->fs_info->sb->s_flags & MS_RDONLY)
		return -EROFS;

	spin_lock(&root->fs_info->delalloc_lock);
	while (!list_empty(head)) {
		binode = list_entry(head->next, struct btrfs_inode,
				    delalloc_inodes);
		inode = igrab(&binode->vfs_inode);
		if (!inode)
			list_del_init(&binode->delalloc_inodes);
		spin_unlock(&root->fs_info->delalloc_lock);
		if (inode) {
			filemap_flush(inode->i_mapping);
			if (delay_iput)
				btrfs_add_delayed_iput(inode);
			else
				iput(inode);
		}
		cond_resched();
		spin_lock(&root->fs_info->delalloc_lock);
	}
	spin_unlock(&root->fs_info->delalloc_lock);

	/* the filemap_flush will queue IO into the worker threads, but
	 * we have to make sure the IO is actually started and that
	 * ordered extents get created before we return
	 */
	atomic_inc(&root->fs_info->async_submit_draining);
	while (atomic_read(&root->fs_info->nr_async_submits) ||
	       atomic_read(&root->fs_info->async_delalloc_pages)) {
		wait_event(root->fs_info->async_submit_wait,
		   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
	}
	atomic_dec(&root->fs_info->async_submit_draining);
	return 0;
}
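/*
 * Illustrative caller pattern (hypothetical, not taken from this file):
 * a sync-style path that wants everything durable would first call
 * btrfs_start_delalloc_inodes(root, 0) to turn delalloc into ordered
 * extents, then wait for those ordered extents to complete separately.
 */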
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
			 const char *symname)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct inode *inode = NULL;
	int err;
	int drop_inode = 0;
	u64 objectid;
	u64 index = 0;
	int name_len;
	int datasize;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	struct extent_buffer *leaf;
	unsigned long nr = 0;

	name_len = strlen(symname) + 1;
	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
		return -ENAMETOOLONG;

	/*
	 * 2 items for inode item and ref
	 * 2 items for dir items
	 * 1 item for xattr if selinux is on
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_unlock;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
				dentry->d_name.len, btrfs_ino(dir), objectid,
				S_IFLNK|S_IRWXUGO, &index);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto out_unlock;
	}

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err) {
		drop_inode = 1;
		goto out_unlock;
	}

	/*
	 * If the active LSM wants to access the inode during
	 * d_instantiate it needs these.  Smack checks to see
	 * if the filesystem supports xattrs by looking at the
	 * ops vector.
	 */
	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;

	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
	if (err)
		drop_inode = 1;
	else {
		inode->i_mapping->a_ops = &btrfs_aops;
		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
	}
	if (drop_inode)
		goto out_unlock;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		drop_inode = 1;
		goto out_unlock;
	}
	key.objectid = btrfs_ino(inode);
	key.offset = 0;
	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
	datasize = btrfs_file_extent_calc_inline_size(name_len);
	err = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (err) {
		drop_inode = 1;
		btrfs_free_path(path);
		goto out_unlock;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei,
				   BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_compression(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);

	ptr = btrfs_file_extent_inline_start(ei);
	write_extent_buffer(leaf, symname, ptr, name_len);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	inode->i_op = &btrfs_symlink_inode_operations;
	inode->i_mapping->a_ops = &btrfs_symlink_aops;
	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
	inode_set_bytes(inode, name_len);
	btrfs_i_size_write(inode, name_len - 1);
	err = btrfs_update_inode(trans, root, inode);
	if (err)
		drop_inode = 1;

out_unlock:
	if (!err)
		d_instantiate(dentry, inode);
	nr = trans->blocks_used;
	btrfs_end_transaction(trans, root);
	if (drop_inode) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_btree_balance_dirty(root, nr);
	return err;
}
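/*
 * Example (illustrative) for the loop below: preallocating num_bytes ==
 * 8M with min_size == 1M might get a 5M extent on the first pass and a
 * 3M extent on the second; each pass inserts a PREALLOC file extent,
 * advances cur_offset and *alloc_hint, and (when own_trans) commits its
 * own transaction so a huge fallocate doesn't pin one transaction open.
 */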
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
				       u64 start, u64 num_bytes, u64 min_size,
				       loff_t actual_len, u64 *alloc_hint,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key ins;
	u64 cur_offset = start;
	u64 i_size;
	int ret = 0;
	bool own_trans = true;

	if (trans)
		own_trans = false;
	while (num_bytes > 0) {
		if (own_trans) {
			trans = btrfs_start_transaction(root, 3);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
		}

		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
					   0, *alloc_hint, &ins, 1);
		if (ret) {
			if (own_trans)
				btrfs_end_transaction(trans, root);
			break;
		}

		ret = insert_reserved_file_extent(trans, inode,
						  cur_offset, ins.objectid,
						  ins.offset, ins.offset,
						  ins.offset, 0, 0, 0,
						  BTRFS_FILE_EXTENT_PREALLOC);
		if (ret) {
			btrfs_abort_transaction(trans, root, ret);
			if (own_trans)
				btrfs_end_transaction(trans, root);
			break;
		}
		btrfs_drop_extent_cache(inode, cur_offset,
					cur_offset + ins.offset - 1, 0);

		num_bytes -= ins.offset;
		cur_offset += ins.offset;
		*alloc_hint = ins.objectid + ins.offset;

		inode_inc_iversion(inode);
		inode->i_ctime = CURRENT_TIME;
		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    (actual_len > inode->i_size) &&
		    (cur_offset > inode->i_size)) {
			if (cur_offset > actual_len)
				i_size = actual_len;
			else
				i_size = cur_offset;
			i_size_write(inode, i_size);
			btrfs_ordered_update_i_size(inode, i_size, NULL);
		}

		ret = btrfs_update_inode(trans, root, inode);

		if (ret) {
			btrfs_abort_transaction(trans, root, ret);
			if (own_trans)
				btrfs_end_transaction(trans, root);
			break;
		}

		if (own_trans)
			btrfs_end_transaction(trans, root);
	}
	return ret;
}

int btrfs_prealloc_file_range(struct inode *inode, int mode,
			      u64 start, u64 num_bytes, u64 min_size,
			      loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint,
					   NULL);
}

int btrfs_prealloc_file_range_trans(struct inode *inode,
				    struct btrfs_trans_handle *trans, int mode,
				    u64 start, u64 num_bytes, u64 min_size,
				    loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint,
					   trans);
}

static int btrfs_set_page_dirty(struct page *page)
{
	return __set_page_dirty_nobuffers(page);
}

static int btrfs_permission(struct inode *inode, int mask)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	umode_t mode = inode->i_mode;

	if (mask & MAY_WRITE &&
	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
		if (btrfs_root_readonly(root))
			return -EROFS;
		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
			return -EACCES;
	}
	return generic_permission(inode, mask);
}
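/*
 * Example (illustrative): a MAY_WRITE check on a file in a read-only
 * subvolume fails with -EROFS above before generic_permission() ever
 * looks at the mode bits, so even the owner of a 0644 file cannot open
 * it for writing there.
 */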
static const struct inode_operations btrfs_dir_inode_operations = {
	.getattr = btrfs_getattr,
	.lookup = btrfs_lookup,
	.create = btrfs_create,
	.unlink = btrfs_unlink,
	.link = btrfs_link,
	.mkdir = btrfs_mkdir,
	.rmdir = btrfs_rmdir,
	.rename = btrfs_rename,
	.symlink = btrfs_symlink,
	.setattr = btrfs_setattr,
	.mknod = btrfs_mknod,
	.setxattr = btrfs_setxattr,
	.getxattr = btrfs_getxattr,
	.listxattr = btrfs_listxattr,
	.removexattr = btrfs_removexattr,
	.permission = btrfs_permission,
	.get_acl = btrfs_get_acl,
};
static const struct inode_operations btrfs_dir_ro_inode_operations = {
	.lookup = btrfs_lookup,
	.permission = btrfs_permission,
	.get_acl = btrfs_get_acl,
};

static const struct file_operations btrfs_dir_file_operations = {
	.llseek = generic_file_llseek,
	.read = generic_read_dir,
	.readdir = btrfs_real_readdir,
	.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = btrfs_ioctl,
#endif
	.release = btrfs_release_file,
	.fsync = btrfs_sync_file,
};

static struct extent_io_ops btrfs_extent_io_ops = {
	.fill_delalloc = run_delalloc_range,
	.submit_bio_hook = btrfs_submit_bio_hook,
	.merge_bio_hook = btrfs_merge_bio_hook,
	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
	.writepage_start_hook = btrfs_writepage_start_hook,
	.set_bit_hook = btrfs_set_bit_hook,
	.clear_bit_hook = btrfs_clear_bit_hook,
	.merge_extent_hook = btrfs_merge_extent_hook,
	.split_extent_hook = btrfs_split_extent_hook,
};

/*
 * btrfs doesn't support the bmap operation because swapfiles
 * use bmap to make a mapping of extents in the file.  They assume
 * these extents won't change over the life of the file and they
 * use the bmap result to do IO directly to the drive.
 *
 * the btrfs bmap call would return logical addresses that aren't
 * suitable for IO and they also will change frequently as COW
 * operations happen.  So, swapfile + btrfs == corruption.
 *
 * For now we're avoiding this by dropping bmap.
 */
static const struct address_space_operations btrfs_aops = {
	.readpage = btrfs_readpage,
	.writepage = btrfs_writepage,
	.writepages = btrfs_writepages,
	.readpages = btrfs_readpages,
	.direct_IO = btrfs_direct_IO,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage = btrfs_releasepage,
	.set_page_dirty = btrfs_set_page_dirty,
	.error_remove_page = generic_error_remove_page,
};

static const struct address_space_operations btrfs_symlink_aops = {
	.readpage = btrfs_readpage,
	.writepage = btrfs_writepage,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage = btrfs_releasepage,
};

static const struct inode_operations btrfs_file_inode_operations = {
	.getattr = btrfs_getattr,
	.setattr = btrfs_setattr,
	.setxattr = btrfs_setxattr,
	.getxattr = btrfs_getxattr,
	.listxattr = btrfs_listxattr,
	.removexattr = btrfs_removexattr,
	.permission = btrfs_permission,
	.fiemap = btrfs_fiemap,
	.get_acl = btrfs_get_acl,
	.update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_special_inode_operations = {
	.getattr = btrfs_getattr,
	.setattr = btrfs_setattr,
	.permission = btrfs_permission,
	.setxattr = btrfs_setxattr,
	.getxattr = btrfs_getxattr,
	.listxattr = btrfs_listxattr,
	.removexattr = btrfs_removexattr,
	.get_acl = btrfs_get_acl,
	.update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
	.readlink = generic_readlink,
	.follow_link = page_follow_link_light,
	.put_link = page_put_link,
	.getattr = btrfs_getattr,
	.setattr = btrfs_setattr,
	.permission = btrfs_permission,
	.setxattr = btrfs_setxattr,
	.getxattr = btrfs_getxattr,
	.listxattr = btrfs_listxattr,
	.removexattr = btrfs_removexattr,
	.get_acl = btrfs_get_acl,
	.update_time = btrfs_update_time,
};

const struct dentry_operations btrfs_dentry_operations = {
	.d_delete = btrfs_dentry_delete,
	.d_release = btrfs_dentry_release,
};