1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/kernel.h> 20 #include <linux/bio.h> 21 #include <linux/buffer_head.h> 22 #include <linux/file.h> 23 #include <linux/fs.h> 24 #include <linux/pagemap.h> 25 #include <linux/highmem.h> 26 #include <linux/time.h> 27 #include <linux/init.h> 28 #include <linux/string.h> 29 #include <linux/backing-dev.h> 30 #include <linux/mpage.h> 31 #include <linux/swap.h> 32 #include <linux/writeback.h> 33 #include <linux/statfs.h> 34 #include <linux/compat.h> 35 #include <linux/bit_spinlock.h> 36 #include <linux/xattr.h> 37 #include <linux/posix_acl.h> 38 #include <linux/falloc.h> 39 #include <linux/slab.h> 40 #include <linux/ratelimit.h> 41 #include <linux/mount.h> 42 #include "compat.h" 43 #include "ctree.h" 44 #include "disk-io.h" 45 #include "transaction.h" 46 #include "btrfs_inode.h" 47 #include "ioctl.h" 48 #include "print-tree.h" 49 #include "ordered-data.h" 50 #include "xattr.h" 51 #include "tree-log.h" 52 #include "volumes.h" 53 #include "compression.h" 54 #include "locking.h" 55 #include "free-space-cache.h" 56 #include "inode-map.h" 57 58 struct btrfs_iget_args { 59 u64 ino; 60 struct btrfs_root *root; 61 }; 62 63 static const struct inode_operations btrfs_dir_inode_operations; 64 static const struct inode_operations btrfs_symlink_inode_operations; 65 static const struct inode_operations btrfs_dir_ro_inode_operations; 66 static const struct inode_operations btrfs_special_inode_operations; 67 static const struct inode_operations btrfs_file_inode_operations; 68 static const struct address_space_operations btrfs_aops; 69 static const struct address_space_operations btrfs_symlink_aops; 70 static const struct file_operations btrfs_dir_file_operations; 71 static struct extent_io_ops btrfs_extent_io_ops; 72 73 static struct kmem_cache *btrfs_inode_cachep; 74 struct kmem_cache *btrfs_trans_handle_cachep; 75 struct kmem_cache *btrfs_transaction_cachep; 76 struct kmem_cache *btrfs_path_cachep; 77 struct kmem_cache *btrfs_free_space_cachep; 78 79 #define S_SHIFT 12 80 static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { 81 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, 82 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, 83 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, 84 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, 85 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, 86 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, 87 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, 88 }; 89 90 static int btrfs_setsize(struct inode *inode, loff_t newsize); 91 static int btrfs_truncate(struct inode *inode); 92 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); 93 static noinline int cow_file_range(struct inode *inode, 94 struct page *locked_page, 95 u64 start, u64 end, int *page_started, 96 unsigned long *nr_written, int unlock); 97 static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 98 struct btrfs_root *root, struct inode *inode); 99 100 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 101 struct inode *inode, struct inode *dir, 102 const struct qstr *qstr) 103 { 104 int err; 105 106 err = btrfs_init_acl(trans, inode, dir); 107 if (!err) 108 err = btrfs_xattr_security_init(trans, inode, dir, qstr); 109 return err; 110 } 111 112 /* 113 * this does all the hard work for inserting an inline extent into 114 * the btree. The caller should have done a btrfs_drop_extents so that 115 * no overlapping inline items exist in the btree 116 */ 117 static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, 118 struct btrfs_root *root, struct inode *inode, 119 u64 start, size_t size, size_t compressed_size, 120 int compress_type, 121 struct page **compressed_pages) 122 { 123 struct btrfs_key key; 124 struct btrfs_path *path; 125 struct extent_buffer *leaf; 126 struct page *page = NULL; 127 char *kaddr; 128 unsigned long ptr; 129 struct btrfs_file_extent_item *ei; 130 int err = 0; 131 int ret; 132 size_t cur_size = size; 133 size_t datasize; 134 unsigned long offset; 135 136 if (compressed_size && compressed_pages) 137 cur_size = compressed_size; 138 139 path = btrfs_alloc_path(); 140 if (!path) 141 return -ENOMEM; 142 143 path->leave_spinning = 1; 144 145 key.objectid = btrfs_ino(inode); 146 key.offset = start; 147 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 148 datasize = btrfs_file_extent_calc_inline_size(cur_size); 149 150 inode_add_bytes(inode, size); 151 ret = btrfs_insert_empty_item(trans, root, path, &key, 152 datasize); 153 if (ret) { 154 err = ret; 155 goto fail; 156 } 157 leaf = path->nodes[0]; 158 ei = btrfs_item_ptr(leaf, path->slots[0], 159 struct btrfs_file_extent_item); 160 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 161 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); 162 btrfs_set_file_extent_encryption(leaf, ei, 0); 163 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 164 btrfs_set_file_extent_ram_bytes(leaf, ei, size); 165 ptr = btrfs_file_extent_inline_start(ei); 166 167 if (compress_type != BTRFS_COMPRESS_NONE) { 168 struct page *cpage; 169 int i = 0; 170 while (compressed_size > 0) { 171 cpage = compressed_pages[i]; 172 cur_size = min_t(unsigned long, compressed_size, 173 PAGE_CACHE_SIZE); 174 175 kaddr = kmap_atomic(cpage); 176 write_extent_buffer(leaf, kaddr, ptr, cur_size); 177 kunmap_atomic(kaddr); 178 179 i++; 180 ptr += cur_size; 181 compressed_size -= cur_size; 182 } 183 btrfs_set_file_extent_compression(leaf, ei, 184 compress_type); 185 } else { 186 page = find_get_page(inode->i_mapping, 187 start >> PAGE_CACHE_SHIFT); 188 btrfs_set_file_extent_compression(leaf, ei, 0); 189 kaddr = kmap_atomic(page); 190 offset = start & (PAGE_CACHE_SIZE - 1); 191 write_extent_buffer(leaf, kaddr + offset, ptr, size); 192 kunmap_atomic(kaddr); 193 page_cache_release(page); 194 } 195 btrfs_mark_buffer_dirty(leaf); 196 btrfs_free_path(path); 197 198 /* 199 * we're an inline extent, so nobody can 200 * extend the file past i_size without locking 201 * a page we already have locked. 202 * 203 * We must do any isize and inode updates 204 * before we unlock the pages. Otherwise we 205 * could end up racing with unlink. 206 */ 207 BTRFS_I(inode)->disk_i_size = inode->i_size; 208 ret = btrfs_update_inode(trans, root, inode); 209 210 return ret; 211 fail: 212 btrfs_free_path(path); 213 return err; 214 } 215 216 217 /* 218 * conditionally insert an inline extent into the file. This 219 * does the checks required to make sure the data is small enough 220 * to fit as an inline extent. 221 */ 222 static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, 223 struct btrfs_root *root, 224 struct inode *inode, u64 start, u64 end, 225 size_t compressed_size, int compress_type, 226 struct page **compressed_pages) 227 { 228 u64 isize = i_size_read(inode); 229 u64 actual_end = min(end + 1, isize); 230 u64 inline_len = actual_end - start; 231 u64 aligned_end = (end + root->sectorsize - 1) & 232 ~((u64)root->sectorsize - 1); 233 u64 hint_byte; 234 u64 data_len = inline_len; 235 int ret; 236 237 if (compressed_size) 238 data_len = compressed_size; 239 240 if (start > 0 || 241 actual_end >= PAGE_CACHE_SIZE || 242 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || 243 (!compressed_size && 244 (actual_end & (root->sectorsize - 1)) == 0) || 245 end + 1 < isize || 246 data_len > root->fs_info->max_inline) { 247 return 1; 248 } 249 250 ret = btrfs_drop_extents(trans, inode, start, aligned_end, 251 &hint_byte, 1); 252 if (ret) 253 return ret; 254 255 if (isize > actual_end) 256 inline_len = min_t(u64, isize, actual_end); 257 ret = insert_inline_extent(trans, root, inode, start, 258 inline_len, compressed_size, 259 compress_type, compressed_pages); 260 if (ret && ret != -ENOSPC) { 261 btrfs_abort_transaction(trans, root, ret); 262 return ret; 263 } else if (ret == -ENOSPC) { 264 return 1; 265 } 266 267 btrfs_delalloc_release_metadata(inode, end + 1 - start); 268 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 269 return 0; 270 } 271 272 struct async_extent { 273 u64 start; 274 u64 ram_size; 275 u64 compressed_size; 276 struct page **pages; 277 unsigned long nr_pages; 278 int compress_type; 279 struct list_head list; 280 }; 281 282 struct async_cow { 283 struct inode *inode; 284 struct btrfs_root *root; 285 struct page *locked_page; 286 u64 start; 287 u64 end; 288 struct list_head extents; 289 struct btrfs_work work; 290 }; 291 292 static noinline int add_async_extent(struct async_cow *cow, 293 u64 start, u64 ram_size, 294 u64 compressed_size, 295 struct page **pages, 296 unsigned long nr_pages, 297 int compress_type) 298 { 299 struct async_extent *async_extent; 300 301 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); 302 BUG_ON(!async_extent); /* -ENOMEM */ 303 async_extent->start = start; 304 async_extent->ram_size = ram_size; 305 async_extent->compressed_size = compressed_size; 306 async_extent->pages = pages; 307 async_extent->nr_pages = nr_pages; 308 async_extent->compress_type = compress_type; 309 list_add_tail(&async_extent->list, &cow->extents); 310 return 0; 311 } 312 313 /* 314 * we create compressed extents in two phases. The first 315 * phase compresses a range of pages that have already been 316 * locked (both pages and state bits are locked). 317 * 318 * This is done inside an ordered work queue, and the compression 319 * is spread across many cpus. The actual IO submission is step 320 * two, and the ordered work queue takes care of making sure that 321 * happens in the same order things were put onto the queue by 322 * writepages and friends. 323 * 324 * If this code finds it can't get good compression, it puts an 325 * entry onto the work queue to write the uncompressed bytes. This 326 * makes sure that both compressed inodes and uncompressed inodes 327 * are written in the same order that pdflush sent them down. 328 */ 329 static noinline int compress_file_range(struct inode *inode, 330 struct page *locked_page, 331 u64 start, u64 end, 332 struct async_cow *async_cow, 333 int *num_added) 334 { 335 struct btrfs_root *root = BTRFS_I(inode)->root; 336 struct btrfs_trans_handle *trans; 337 u64 num_bytes; 338 u64 blocksize = root->sectorsize; 339 u64 actual_end; 340 u64 isize = i_size_read(inode); 341 int ret = 0; 342 struct page **pages = NULL; 343 unsigned long nr_pages; 344 unsigned long nr_pages_ret = 0; 345 unsigned long total_compressed = 0; 346 unsigned long total_in = 0; 347 unsigned long max_compressed = 128 * 1024; 348 unsigned long max_uncompressed = 128 * 1024; 349 int i; 350 int will_compress; 351 int compress_type = root->fs_info->compress_type; 352 353 /* if this is a small write inside eof, kick off a defrag */ 354 if ((end - start + 1) < 16 * 1024 && 355 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) 356 btrfs_add_inode_defrag(NULL, inode); 357 358 actual_end = min_t(u64, isize, end + 1); 359 again: 360 will_compress = 0; 361 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; 362 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); 363 364 /* 365 * we don't want to send crud past the end of i_size through 366 * compression, that's just a waste of CPU time. So, if the 367 * end of the file is before the start of our current 368 * requested range of bytes, we bail out to the uncompressed 369 * cleanup code that can deal with all of this. 370 * 371 * It isn't really the fastest way to fix things, but this is a 372 * very uncommon corner. 373 */ 374 if (actual_end <= start) 375 goto cleanup_and_bail_uncompressed; 376 377 total_compressed = actual_end - start; 378 379 /* we want to make sure that amount of ram required to uncompress 380 * an extent is reasonable, so we limit the total size in ram 381 * of a compressed extent to 128k. This is a crucial number 382 * because it also controls how easily we can spread reads across 383 * cpus for decompression. 384 * 385 * We also want to make sure the amount of IO required to do 386 * a random read is reasonably small, so we limit the size of 387 * a compressed extent to 128k. 388 */ 389 total_compressed = min(total_compressed, max_uncompressed); 390 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 391 num_bytes = max(blocksize, num_bytes); 392 total_in = 0; 393 ret = 0; 394 395 /* 396 * we do compression for mount -o compress and when the 397 * inode has not been flagged as nocompress. This flag can 398 * change at any time if we discover bad compression ratios. 399 */ 400 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 401 (btrfs_test_opt(root, COMPRESS) || 402 (BTRFS_I(inode)->force_compress) || 403 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { 404 WARN_ON(pages); 405 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 406 if (!pages) { 407 /* just bail out to the uncompressed code */ 408 goto cont; 409 } 410 411 if (BTRFS_I(inode)->force_compress) 412 compress_type = BTRFS_I(inode)->force_compress; 413 414 ret = btrfs_compress_pages(compress_type, 415 inode->i_mapping, start, 416 total_compressed, pages, 417 nr_pages, &nr_pages_ret, 418 &total_in, 419 &total_compressed, 420 max_compressed); 421 422 if (!ret) { 423 unsigned long offset = total_compressed & 424 (PAGE_CACHE_SIZE - 1); 425 struct page *page = pages[nr_pages_ret - 1]; 426 char *kaddr; 427 428 /* zero the tail end of the last page, we might be 429 * sending it down to disk 430 */ 431 if (offset) { 432 kaddr = kmap_atomic(page); 433 memset(kaddr + offset, 0, 434 PAGE_CACHE_SIZE - offset); 435 kunmap_atomic(kaddr); 436 } 437 will_compress = 1; 438 } 439 } 440 cont: 441 if (start == 0) { 442 trans = btrfs_join_transaction(root); 443 if (IS_ERR(trans)) { 444 ret = PTR_ERR(trans); 445 trans = NULL; 446 goto cleanup_and_out; 447 } 448 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 449 450 /* lets try to make an inline extent */ 451 if (ret || total_in < (actual_end - start)) { 452 /* we didn't compress the entire range, try 453 * to make an uncompressed inline extent. 454 */ 455 ret = cow_file_range_inline(trans, root, inode, 456 start, end, 0, 0, NULL); 457 } else { 458 /* try making a compressed inline extent */ 459 ret = cow_file_range_inline(trans, root, inode, 460 start, end, 461 total_compressed, 462 compress_type, pages); 463 } 464 if (ret <= 0) { 465 /* 466 * inline extent creation worked or returned error, 467 * we don't need to create any more async work items. 468 * Unlock and free up our temp pages. 469 */ 470 extent_clear_unlock_delalloc(inode, 471 &BTRFS_I(inode)->io_tree, 472 start, end, NULL, 473 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 474 EXTENT_CLEAR_DELALLOC | 475 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); 476 477 btrfs_end_transaction(trans, root); 478 goto free_pages_out; 479 } 480 btrfs_end_transaction(trans, root); 481 } 482 483 if (will_compress) { 484 /* 485 * we aren't doing an inline extent round the compressed size 486 * up to a block size boundary so the allocator does sane 487 * things 488 */ 489 total_compressed = (total_compressed + blocksize - 1) & 490 ~(blocksize - 1); 491 492 /* 493 * one last check to make sure the compression is really a 494 * win, compare the page count read with the blocks on disk 495 */ 496 total_in = (total_in + PAGE_CACHE_SIZE - 1) & 497 ~(PAGE_CACHE_SIZE - 1); 498 if (total_compressed >= total_in) { 499 will_compress = 0; 500 } else { 501 num_bytes = total_in; 502 } 503 } 504 if (!will_compress && pages) { 505 /* 506 * the compression code ran but failed to make things smaller, 507 * free any pages it allocated and our page pointer array 508 */ 509 for (i = 0; i < nr_pages_ret; i++) { 510 WARN_ON(pages[i]->mapping); 511 page_cache_release(pages[i]); 512 } 513 kfree(pages); 514 pages = NULL; 515 total_compressed = 0; 516 nr_pages_ret = 0; 517 518 /* flag the file so we don't compress in the future */ 519 if (!btrfs_test_opt(root, FORCE_COMPRESS) && 520 !(BTRFS_I(inode)->force_compress)) { 521 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 522 } 523 } 524 if (will_compress) { 525 *num_added += 1; 526 527 /* the async work queues will take care of doing actual 528 * allocation on disk for these compressed pages, 529 * and will submit them to the elevator. 530 */ 531 add_async_extent(async_cow, start, num_bytes, 532 total_compressed, pages, nr_pages_ret, 533 compress_type); 534 535 if (start + num_bytes < end) { 536 start += num_bytes; 537 pages = NULL; 538 cond_resched(); 539 goto again; 540 } 541 } else { 542 cleanup_and_bail_uncompressed: 543 /* 544 * No compression, but we still need to write the pages in 545 * the file we've been given so far. redirty the locked 546 * page if it corresponds to our extent and set things up 547 * for the async work queue to run cow_file_range to do 548 * the normal delalloc dance 549 */ 550 if (page_offset(locked_page) >= start && 551 page_offset(locked_page) <= end) { 552 __set_page_dirty_nobuffers(locked_page); 553 /* unlocked later on in the async handlers */ 554 } 555 add_async_extent(async_cow, start, end - start + 1, 556 0, NULL, 0, BTRFS_COMPRESS_NONE); 557 *num_added += 1; 558 } 559 560 out: 561 return ret; 562 563 free_pages_out: 564 for (i = 0; i < nr_pages_ret; i++) { 565 WARN_ON(pages[i]->mapping); 566 page_cache_release(pages[i]); 567 } 568 kfree(pages); 569 570 goto out; 571 572 cleanup_and_out: 573 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 574 start, end, NULL, 575 EXTENT_CLEAR_UNLOCK_PAGE | 576 EXTENT_CLEAR_DIRTY | 577 EXTENT_CLEAR_DELALLOC | 578 EXTENT_SET_WRITEBACK | 579 EXTENT_END_WRITEBACK); 580 if (!trans || IS_ERR(trans)) 581 btrfs_error(root->fs_info, ret, "Failed to join transaction"); 582 else 583 btrfs_abort_transaction(trans, root, ret); 584 goto free_pages_out; 585 } 586 587 /* 588 * phase two of compressed writeback. This is the ordered portion 589 * of the code, which only gets called in the order the work was 590 * queued. We walk all the async extents created by compress_file_range 591 * and send them down to the disk. 592 */ 593 static noinline int submit_compressed_extents(struct inode *inode, 594 struct async_cow *async_cow) 595 { 596 struct async_extent *async_extent; 597 u64 alloc_hint = 0; 598 struct btrfs_trans_handle *trans; 599 struct btrfs_key ins; 600 struct extent_map *em; 601 struct btrfs_root *root = BTRFS_I(inode)->root; 602 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 603 struct extent_io_tree *io_tree; 604 int ret = 0; 605 606 if (list_empty(&async_cow->extents)) 607 return 0; 608 609 610 while (!list_empty(&async_cow->extents)) { 611 async_extent = list_entry(async_cow->extents.next, 612 struct async_extent, list); 613 list_del(&async_extent->list); 614 615 io_tree = &BTRFS_I(inode)->io_tree; 616 617 retry: 618 /* did the compression code fall back to uncompressed IO? */ 619 if (!async_extent->pages) { 620 int page_started = 0; 621 unsigned long nr_written = 0; 622 623 lock_extent(io_tree, async_extent->start, 624 async_extent->start + 625 async_extent->ram_size - 1); 626 627 /* allocate blocks */ 628 ret = cow_file_range(inode, async_cow->locked_page, 629 async_extent->start, 630 async_extent->start + 631 async_extent->ram_size - 1, 632 &page_started, &nr_written, 0); 633 634 /* JDM XXX */ 635 636 /* 637 * if page_started, cow_file_range inserted an 638 * inline extent and took care of all the unlocking 639 * and IO for us. Otherwise, we need to submit 640 * all those pages down to the drive. 641 */ 642 if (!page_started && !ret) 643 extent_write_locked_range(io_tree, 644 inode, async_extent->start, 645 async_extent->start + 646 async_extent->ram_size - 1, 647 btrfs_get_extent, 648 WB_SYNC_ALL); 649 kfree(async_extent); 650 cond_resched(); 651 continue; 652 } 653 654 lock_extent(io_tree, async_extent->start, 655 async_extent->start + async_extent->ram_size - 1); 656 657 trans = btrfs_join_transaction(root); 658 if (IS_ERR(trans)) { 659 ret = PTR_ERR(trans); 660 } else { 661 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 662 ret = btrfs_reserve_extent(trans, root, 663 async_extent->compressed_size, 664 async_extent->compressed_size, 665 0, alloc_hint, &ins, 1); 666 if (ret) 667 btrfs_abort_transaction(trans, root, ret); 668 btrfs_end_transaction(trans, root); 669 } 670 671 if (ret) { 672 int i; 673 for (i = 0; i < async_extent->nr_pages; i++) { 674 WARN_ON(async_extent->pages[i]->mapping); 675 page_cache_release(async_extent->pages[i]); 676 } 677 kfree(async_extent->pages); 678 async_extent->nr_pages = 0; 679 async_extent->pages = NULL; 680 unlock_extent(io_tree, async_extent->start, 681 async_extent->start + 682 async_extent->ram_size - 1); 683 if (ret == -ENOSPC) 684 goto retry; 685 goto out_free; /* JDM: Requeue? */ 686 } 687 688 /* 689 * here we're doing allocation and writeback of the 690 * compressed pages 691 */ 692 btrfs_drop_extent_cache(inode, async_extent->start, 693 async_extent->start + 694 async_extent->ram_size - 1, 0); 695 696 em = alloc_extent_map(); 697 BUG_ON(!em); /* -ENOMEM */ 698 em->start = async_extent->start; 699 em->len = async_extent->ram_size; 700 em->orig_start = em->start; 701 702 em->block_start = ins.objectid; 703 em->block_len = ins.offset; 704 em->bdev = root->fs_info->fs_devices->latest_bdev; 705 em->compress_type = async_extent->compress_type; 706 set_bit(EXTENT_FLAG_PINNED, &em->flags); 707 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 708 709 while (1) { 710 write_lock(&em_tree->lock); 711 ret = add_extent_mapping(em_tree, em); 712 write_unlock(&em_tree->lock); 713 if (ret != -EEXIST) { 714 free_extent_map(em); 715 break; 716 } 717 btrfs_drop_extent_cache(inode, async_extent->start, 718 async_extent->start + 719 async_extent->ram_size - 1, 0); 720 } 721 722 ret = btrfs_add_ordered_extent_compress(inode, 723 async_extent->start, 724 ins.objectid, 725 async_extent->ram_size, 726 ins.offset, 727 BTRFS_ORDERED_COMPRESSED, 728 async_extent->compress_type); 729 BUG_ON(ret); /* -ENOMEM */ 730 731 /* 732 * clear dirty, set writeback and unlock the pages. 733 */ 734 extent_clear_unlock_delalloc(inode, 735 &BTRFS_I(inode)->io_tree, 736 async_extent->start, 737 async_extent->start + 738 async_extent->ram_size - 1, 739 NULL, EXTENT_CLEAR_UNLOCK_PAGE | 740 EXTENT_CLEAR_UNLOCK | 741 EXTENT_CLEAR_DELALLOC | 742 EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); 743 744 ret = btrfs_submit_compressed_write(inode, 745 async_extent->start, 746 async_extent->ram_size, 747 ins.objectid, 748 ins.offset, async_extent->pages, 749 async_extent->nr_pages); 750 751 BUG_ON(ret); /* -ENOMEM */ 752 alloc_hint = ins.objectid + ins.offset; 753 kfree(async_extent); 754 cond_resched(); 755 } 756 ret = 0; 757 out: 758 return ret; 759 out_free: 760 kfree(async_extent); 761 goto out; 762 } 763 764 static u64 get_extent_allocation_hint(struct inode *inode, u64 start, 765 u64 num_bytes) 766 { 767 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 768 struct extent_map *em; 769 u64 alloc_hint = 0; 770 771 read_lock(&em_tree->lock); 772 em = search_extent_mapping(em_tree, start, num_bytes); 773 if (em) { 774 /* 775 * if block start isn't an actual block number then find the 776 * first block in this inode and use that as a hint. If that 777 * block is also bogus then just don't worry about it. 778 */ 779 if (em->block_start >= EXTENT_MAP_LAST_BYTE) { 780 free_extent_map(em); 781 em = search_extent_mapping(em_tree, 0, 0); 782 if (em && em->block_start < EXTENT_MAP_LAST_BYTE) 783 alloc_hint = em->block_start; 784 if (em) 785 free_extent_map(em); 786 } else { 787 alloc_hint = em->block_start; 788 free_extent_map(em); 789 } 790 } 791 read_unlock(&em_tree->lock); 792 793 return alloc_hint; 794 } 795 796 /* 797 * when extent_io.c finds a delayed allocation range in the file, 798 * the call backs end up in this code. The basic idea is to 799 * allocate extents on disk for the range, and create ordered data structs 800 * in ram to track those extents. 801 * 802 * locked_page is the page that writepage had locked already. We use 803 * it to make sure we don't do extra locks or unlocks. 804 * 805 * *page_started is set to one if we unlock locked_page and do everything 806 * required to start IO on it. It may be clean and already done with 807 * IO when we return. 808 */ 809 static noinline int cow_file_range(struct inode *inode, 810 struct page *locked_page, 811 u64 start, u64 end, int *page_started, 812 unsigned long *nr_written, 813 int unlock) 814 { 815 struct btrfs_root *root = BTRFS_I(inode)->root; 816 struct btrfs_trans_handle *trans; 817 u64 alloc_hint = 0; 818 u64 num_bytes; 819 unsigned long ram_size; 820 u64 disk_num_bytes; 821 u64 cur_alloc_size; 822 u64 blocksize = root->sectorsize; 823 struct btrfs_key ins; 824 struct extent_map *em; 825 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 826 int ret = 0; 827 828 BUG_ON(btrfs_is_free_space_inode(root, inode)); 829 trans = btrfs_join_transaction(root); 830 if (IS_ERR(trans)) { 831 extent_clear_unlock_delalloc(inode, 832 &BTRFS_I(inode)->io_tree, 833 start, end, locked_page, 834 EXTENT_CLEAR_UNLOCK_PAGE | 835 EXTENT_CLEAR_UNLOCK | 836 EXTENT_CLEAR_DELALLOC | 837 EXTENT_CLEAR_DIRTY | 838 EXTENT_SET_WRITEBACK | 839 EXTENT_END_WRITEBACK); 840 return PTR_ERR(trans); 841 } 842 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 843 844 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 845 num_bytes = max(blocksize, num_bytes); 846 disk_num_bytes = num_bytes; 847 ret = 0; 848 849 /* if this is a small write inside eof, kick off defrag */ 850 if (num_bytes < 64 * 1024 && 851 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) 852 btrfs_add_inode_defrag(trans, inode); 853 854 if (start == 0) { 855 /* lets try to make an inline extent */ 856 ret = cow_file_range_inline(trans, root, inode, 857 start, end, 0, 0, NULL); 858 if (ret == 0) { 859 extent_clear_unlock_delalloc(inode, 860 &BTRFS_I(inode)->io_tree, 861 start, end, NULL, 862 EXTENT_CLEAR_UNLOCK_PAGE | 863 EXTENT_CLEAR_UNLOCK | 864 EXTENT_CLEAR_DELALLOC | 865 EXTENT_CLEAR_DIRTY | 866 EXTENT_SET_WRITEBACK | 867 EXTENT_END_WRITEBACK); 868 869 *nr_written = *nr_written + 870 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; 871 *page_started = 1; 872 goto out; 873 } else if (ret < 0) { 874 btrfs_abort_transaction(trans, root, ret); 875 goto out_unlock; 876 } 877 } 878 879 BUG_ON(disk_num_bytes > 880 btrfs_super_total_bytes(root->fs_info->super_copy)); 881 882 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 883 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 884 885 while (disk_num_bytes > 0) { 886 unsigned long op; 887 888 cur_alloc_size = disk_num_bytes; 889 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 890 root->sectorsize, 0, alloc_hint, 891 &ins, 1); 892 if (ret < 0) { 893 btrfs_abort_transaction(trans, root, ret); 894 goto out_unlock; 895 } 896 897 em = alloc_extent_map(); 898 BUG_ON(!em); /* -ENOMEM */ 899 em->start = start; 900 em->orig_start = em->start; 901 ram_size = ins.offset; 902 em->len = ins.offset; 903 904 em->block_start = ins.objectid; 905 em->block_len = ins.offset; 906 em->bdev = root->fs_info->fs_devices->latest_bdev; 907 set_bit(EXTENT_FLAG_PINNED, &em->flags); 908 909 while (1) { 910 write_lock(&em_tree->lock); 911 ret = add_extent_mapping(em_tree, em); 912 write_unlock(&em_tree->lock); 913 if (ret != -EEXIST) { 914 free_extent_map(em); 915 break; 916 } 917 btrfs_drop_extent_cache(inode, start, 918 start + ram_size - 1, 0); 919 } 920 921 cur_alloc_size = ins.offset; 922 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 923 ram_size, cur_alloc_size, 0); 924 BUG_ON(ret); /* -ENOMEM */ 925 926 if (root->root_key.objectid == 927 BTRFS_DATA_RELOC_TREE_OBJECTID) { 928 ret = btrfs_reloc_clone_csums(inode, start, 929 cur_alloc_size); 930 if (ret) { 931 btrfs_abort_transaction(trans, root, ret); 932 goto out_unlock; 933 } 934 } 935 936 if (disk_num_bytes < cur_alloc_size) 937 break; 938 939 /* we're not doing compressed IO, don't unlock the first 940 * page (which the caller expects to stay locked), don't 941 * clear any dirty bits and don't set any writeback bits 942 * 943 * Do set the Private2 bit so we know this page was properly 944 * setup for writepage 945 */ 946 op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0; 947 op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | 948 EXTENT_SET_PRIVATE2; 949 950 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 951 start, start + ram_size - 1, 952 locked_page, op); 953 disk_num_bytes -= cur_alloc_size; 954 num_bytes -= cur_alloc_size; 955 alloc_hint = ins.objectid + ins.offset; 956 start += cur_alloc_size; 957 } 958 ret = 0; 959 out: 960 btrfs_end_transaction(trans, root); 961 962 return ret; 963 out_unlock: 964 extent_clear_unlock_delalloc(inode, 965 &BTRFS_I(inode)->io_tree, 966 start, end, locked_page, 967 EXTENT_CLEAR_UNLOCK_PAGE | 968 EXTENT_CLEAR_UNLOCK | 969 EXTENT_CLEAR_DELALLOC | 970 EXTENT_CLEAR_DIRTY | 971 EXTENT_SET_WRITEBACK | 972 EXTENT_END_WRITEBACK); 973 974 goto out; 975 } 976 977 /* 978 * work queue call back to started compression on a file and pages 979 */ 980 static noinline void async_cow_start(struct btrfs_work *work) 981 { 982 struct async_cow *async_cow; 983 int num_added = 0; 984 async_cow = container_of(work, struct async_cow, work); 985 986 compress_file_range(async_cow->inode, async_cow->locked_page, 987 async_cow->start, async_cow->end, async_cow, 988 &num_added); 989 if (num_added == 0) { 990 btrfs_add_delayed_iput(async_cow->inode); 991 async_cow->inode = NULL; 992 } 993 } 994 995 /* 996 * work queue call back to submit previously compressed pages 997 */ 998 static noinline void async_cow_submit(struct btrfs_work *work) 999 { 1000 struct async_cow *async_cow; 1001 struct btrfs_root *root; 1002 unsigned long nr_pages; 1003 1004 async_cow = container_of(work, struct async_cow, work); 1005 1006 root = async_cow->root; 1007 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> 1008 PAGE_CACHE_SHIFT; 1009 1010 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); 1011 1012 if (atomic_read(&root->fs_info->async_delalloc_pages) < 1013 5 * 1042 * 1024 && 1014 waitqueue_active(&root->fs_info->async_submit_wait)) 1015 wake_up(&root->fs_info->async_submit_wait); 1016 1017 if (async_cow->inode) 1018 submit_compressed_extents(async_cow->inode, async_cow); 1019 } 1020 1021 static noinline void async_cow_free(struct btrfs_work *work) 1022 { 1023 struct async_cow *async_cow; 1024 async_cow = container_of(work, struct async_cow, work); 1025 if (async_cow->inode) 1026 btrfs_add_delayed_iput(async_cow->inode); 1027 kfree(async_cow); 1028 } 1029 1030 static int cow_file_range_async(struct inode *inode, struct page *locked_page, 1031 u64 start, u64 end, int *page_started, 1032 unsigned long *nr_written) 1033 { 1034 struct async_cow *async_cow; 1035 struct btrfs_root *root = BTRFS_I(inode)->root; 1036 unsigned long nr_pages; 1037 u64 cur_end; 1038 int limit = 10 * 1024 * 1042; 1039 1040 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1041 1, 0, NULL, GFP_NOFS); 1042 while (start < end) { 1043 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); 1044 BUG_ON(!async_cow); /* -ENOMEM */ 1045 async_cow->inode = igrab(inode); 1046 async_cow->root = root; 1047 async_cow->locked_page = locked_page; 1048 async_cow->start = start; 1049 1050 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) 1051 cur_end = end; 1052 else 1053 cur_end = min(end, start + 512 * 1024 - 1); 1054 1055 async_cow->end = cur_end; 1056 INIT_LIST_HEAD(&async_cow->extents); 1057 1058 async_cow->work.func = async_cow_start; 1059 async_cow->work.ordered_func = async_cow_submit; 1060 async_cow->work.ordered_free = async_cow_free; 1061 async_cow->work.flags = 0; 1062 1063 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1064 PAGE_CACHE_SHIFT; 1065 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); 1066 1067 btrfs_queue_worker(&root->fs_info->delalloc_workers, 1068 &async_cow->work); 1069 1070 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { 1071 wait_event(root->fs_info->async_submit_wait, 1072 (atomic_read(&root->fs_info->async_delalloc_pages) < 1073 limit)); 1074 } 1075 1076 while (atomic_read(&root->fs_info->async_submit_draining) && 1077 atomic_read(&root->fs_info->async_delalloc_pages)) { 1078 wait_event(root->fs_info->async_submit_wait, 1079 (atomic_read(&root->fs_info->async_delalloc_pages) == 1080 0)); 1081 } 1082 1083 *nr_written += nr_pages; 1084 start = cur_end + 1; 1085 } 1086 *page_started = 1; 1087 return 0; 1088 } 1089 1090 static noinline int csum_exist_in_range(struct btrfs_root *root, 1091 u64 bytenr, u64 num_bytes) 1092 { 1093 int ret; 1094 struct btrfs_ordered_sum *sums; 1095 LIST_HEAD(list); 1096 1097 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, 1098 bytenr + num_bytes - 1, &list, 0); 1099 if (ret == 0 && list_empty(&list)) 1100 return 0; 1101 1102 while (!list_empty(&list)) { 1103 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 1104 list_del(&sums->list); 1105 kfree(sums); 1106 } 1107 return 1; 1108 } 1109 1110 /* 1111 * when nowcow writeback call back. This checks for snapshots or COW copies 1112 * of the extents that exist in the file, and COWs the file as required. 1113 * 1114 * If no cow copies or snapshots exist, we write directly to the existing 1115 * blocks on disk 1116 */ 1117 static noinline int run_delalloc_nocow(struct inode *inode, 1118 struct page *locked_page, 1119 u64 start, u64 end, int *page_started, int force, 1120 unsigned long *nr_written) 1121 { 1122 struct btrfs_root *root = BTRFS_I(inode)->root; 1123 struct btrfs_trans_handle *trans; 1124 struct extent_buffer *leaf; 1125 struct btrfs_path *path; 1126 struct btrfs_file_extent_item *fi; 1127 struct btrfs_key found_key; 1128 u64 cow_start; 1129 u64 cur_offset; 1130 u64 extent_end; 1131 u64 extent_offset; 1132 u64 disk_bytenr; 1133 u64 num_bytes; 1134 int extent_type; 1135 int ret, err; 1136 int type; 1137 int nocow; 1138 int check_prev = 1; 1139 bool nolock; 1140 u64 ino = btrfs_ino(inode); 1141 1142 path = btrfs_alloc_path(); 1143 if (!path) { 1144 extent_clear_unlock_delalloc(inode, 1145 &BTRFS_I(inode)->io_tree, 1146 start, end, locked_page, 1147 EXTENT_CLEAR_UNLOCK_PAGE | 1148 EXTENT_CLEAR_UNLOCK | 1149 EXTENT_CLEAR_DELALLOC | 1150 EXTENT_CLEAR_DIRTY | 1151 EXTENT_SET_WRITEBACK | 1152 EXTENT_END_WRITEBACK); 1153 return -ENOMEM; 1154 } 1155 1156 nolock = btrfs_is_free_space_inode(root, inode); 1157 1158 if (nolock) 1159 trans = btrfs_join_transaction_nolock(root); 1160 else 1161 trans = btrfs_join_transaction(root); 1162 1163 if (IS_ERR(trans)) { 1164 extent_clear_unlock_delalloc(inode, 1165 &BTRFS_I(inode)->io_tree, 1166 start, end, locked_page, 1167 EXTENT_CLEAR_UNLOCK_PAGE | 1168 EXTENT_CLEAR_UNLOCK | 1169 EXTENT_CLEAR_DELALLOC | 1170 EXTENT_CLEAR_DIRTY | 1171 EXTENT_SET_WRITEBACK | 1172 EXTENT_END_WRITEBACK); 1173 btrfs_free_path(path); 1174 return PTR_ERR(trans); 1175 } 1176 1177 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1178 1179 cow_start = (u64)-1; 1180 cur_offset = start; 1181 while (1) { 1182 ret = btrfs_lookup_file_extent(trans, root, path, ino, 1183 cur_offset, 0); 1184 if (ret < 0) { 1185 btrfs_abort_transaction(trans, root, ret); 1186 goto error; 1187 } 1188 if (ret > 0 && path->slots[0] > 0 && check_prev) { 1189 leaf = path->nodes[0]; 1190 btrfs_item_key_to_cpu(leaf, &found_key, 1191 path->slots[0] - 1); 1192 if (found_key.objectid == ino && 1193 found_key.type == BTRFS_EXTENT_DATA_KEY) 1194 path->slots[0]--; 1195 } 1196 check_prev = 0; 1197 next_slot: 1198 leaf = path->nodes[0]; 1199 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1200 ret = btrfs_next_leaf(root, path); 1201 if (ret < 0) { 1202 btrfs_abort_transaction(trans, root, ret); 1203 goto error; 1204 } 1205 if (ret > 0) 1206 break; 1207 leaf = path->nodes[0]; 1208 } 1209 1210 nocow = 0; 1211 disk_bytenr = 0; 1212 num_bytes = 0; 1213 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1214 1215 if (found_key.objectid > ino || 1216 found_key.type > BTRFS_EXTENT_DATA_KEY || 1217 found_key.offset > end) 1218 break; 1219 1220 if (found_key.offset > cur_offset) { 1221 extent_end = found_key.offset; 1222 extent_type = 0; 1223 goto out_check; 1224 } 1225 1226 fi = btrfs_item_ptr(leaf, path->slots[0], 1227 struct btrfs_file_extent_item); 1228 extent_type = btrfs_file_extent_type(leaf, fi); 1229 1230 if (extent_type == BTRFS_FILE_EXTENT_REG || 1231 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1232 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1233 extent_offset = btrfs_file_extent_offset(leaf, fi); 1234 extent_end = found_key.offset + 1235 btrfs_file_extent_num_bytes(leaf, fi); 1236 if (extent_end <= start) { 1237 path->slots[0]++; 1238 goto next_slot; 1239 } 1240 if (disk_bytenr == 0) 1241 goto out_check; 1242 if (btrfs_file_extent_compression(leaf, fi) || 1243 btrfs_file_extent_encryption(leaf, fi) || 1244 btrfs_file_extent_other_encoding(leaf, fi)) 1245 goto out_check; 1246 if (extent_type == BTRFS_FILE_EXTENT_REG && !force) 1247 goto out_check; 1248 if (btrfs_extent_readonly(root, disk_bytenr)) 1249 goto out_check; 1250 if (btrfs_cross_ref_exist(trans, root, ino, 1251 found_key.offset - 1252 extent_offset, disk_bytenr)) 1253 goto out_check; 1254 disk_bytenr += extent_offset; 1255 disk_bytenr += cur_offset - found_key.offset; 1256 num_bytes = min(end + 1, extent_end) - cur_offset; 1257 /* 1258 * force cow if csum exists in the range. 1259 * this ensure that csum for a given extent are 1260 * either valid or do not exist. 1261 */ 1262 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 1263 goto out_check; 1264 nocow = 1; 1265 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 1266 extent_end = found_key.offset + 1267 btrfs_file_extent_inline_len(leaf, fi); 1268 extent_end = ALIGN(extent_end, root->sectorsize); 1269 } else { 1270 BUG_ON(1); 1271 } 1272 out_check: 1273 if (extent_end <= start) { 1274 path->slots[0]++; 1275 goto next_slot; 1276 } 1277 if (!nocow) { 1278 if (cow_start == (u64)-1) 1279 cow_start = cur_offset; 1280 cur_offset = extent_end; 1281 if (cur_offset > end) 1282 break; 1283 path->slots[0]++; 1284 goto next_slot; 1285 } 1286 1287 btrfs_release_path(path); 1288 if (cow_start != (u64)-1) { 1289 ret = cow_file_range(inode, locked_page, cow_start, 1290 found_key.offset - 1, page_started, 1291 nr_written, 1); 1292 if (ret) { 1293 btrfs_abort_transaction(trans, root, ret); 1294 goto error; 1295 } 1296 cow_start = (u64)-1; 1297 } 1298 1299 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1300 struct extent_map *em; 1301 struct extent_map_tree *em_tree; 1302 em_tree = &BTRFS_I(inode)->extent_tree; 1303 em = alloc_extent_map(); 1304 BUG_ON(!em); /* -ENOMEM */ 1305 em->start = cur_offset; 1306 em->orig_start = em->start; 1307 em->len = num_bytes; 1308 em->block_len = num_bytes; 1309 em->block_start = disk_bytenr; 1310 em->bdev = root->fs_info->fs_devices->latest_bdev; 1311 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1312 while (1) { 1313 write_lock(&em_tree->lock); 1314 ret = add_extent_mapping(em_tree, em); 1315 write_unlock(&em_tree->lock); 1316 if (ret != -EEXIST) { 1317 free_extent_map(em); 1318 break; 1319 } 1320 btrfs_drop_extent_cache(inode, em->start, 1321 em->start + em->len - 1, 0); 1322 } 1323 type = BTRFS_ORDERED_PREALLOC; 1324 } else { 1325 type = BTRFS_ORDERED_NOCOW; 1326 } 1327 1328 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, 1329 num_bytes, num_bytes, type); 1330 BUG_ON(ret); /* -ENOMEM */ 1331 1332 if (root->root_key.objectid == 1333 BTRFS_DATA_RELOC_TREE_OBJECTID) { 1334 ret = btrfs_reloc_clone_csums(inode, cur_offset, 1335 num_bytes); 1336 if (ret) { 1337 btrfs_abort_transaction(trans, root, ret); 1338 goto error; 1339 } 1340 } 1341 1342 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1343 cur_offset, cur_offset + num_bytes - 1, 1344 locked_page, EXTENT_CLEAR_UNLOCK_PAGE | 1345 EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | 1346 EXTENT_SET_PRIVATE2); 1347 cur_offset = extent_end; 1348 if (cur_offset > end) 1349 break; 1350 } 1351 btrfs_release_path(path); 1352 1353 if (cur_offset <= end && cow_start == (u64)-1) { 1354 cow_start = cur_offset; 1355 cur_offset = end; 1356 } 1357 1358 if (cow_start != (u64)-1) { 1359 ret = cow_file_range(inode, locked_page, cow_start, end, 1360 page_started, nr_written, 1); 1361 if (ret) { 1362 btrfs_abort_transaction(trans, root, ret); 1363 goto error; 1364 } 1365 } 1366 1367 error: 1368 if (nolock) { 1369 err = btrfs_end_transaction_nolock(trans, root); 1370 } else { 1371 err = btrfs_end_transaction(trans, root); 1372 } 1373 if (!ret) 1374 ret = err; 1375 1376 if (ret && cur_offset < end) 1377 extent_clear_unlock_delalloc(inode, 1378 &BTRFS_I(inode)->io_tree, 1379 cur_offset, end, locked_page, 1380 EXTENT_CLEAR_UNLOCK_PAGE | 1381 EXTENT_CLEAR_UNLOCK | 1382 EXTENT_CLEAR_DELALLOC | 1383 EXTENT_CLEAR_DIRTY | 1384 EXTENT_SET_WRITEBACK | 1385 EXTENT_END_WRITEBACK); 1386 1387 btrfs_free_path(path); 1388 return ret; 1389 } 1390 1391 /* 1392 * extent_io.c call back to do delayed allocation processing 1393 */ 1394 static int run_delalloc_range(struct inode *inode, struct page *locked_page, 1395 u64 start, u64 end, int *page_started, 1396 unsigned long *nr_written) 1397 { 1398 int ret; 1399 struct btrfs_root *root = BTRFS_I(inode)->root; 1400 1401 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) { 1402 ret = run_delalloc_nocow(inode, locked_page, start, end, 1403 page_started, 1, nr_written); 1404 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) { 1405 ret = run_delalloc_nocow(inode, locked_page, start, end, 1406 page_started, 0, nr_written); 1407 } else if (!btrfs_test_opt(root, COMPRESS) && 1408 !(BTRFS_I(inode)->force_compress) && 1409 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) { 1410 ret = cow_file_range(inode, locked_page, start, end, 1411 page_started, nr_written, 1); 1412 } else { 1413 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 1414 &BTRFS_I(inode)->runtime_flags); 1415 ret = cow_file_range_async(inode, locked_page, start, end, 1416 page_started, nr_written); 1417 } 1418 return ret; 1419 } 1420 1421 static void btrfs_split_extent_hook(struct inode *inode, 1422 struct extent_state *orig, u64 split) 1423 { 1424 /* not delalloc, ignore it */ 1425 if (!(orig->state & EXTENT_DELALLOC)) 1426 return; 1427 1428 spin_lock(&BTRFS_I(inode)->lock); 1429 BTRFS_I(inode)->outstanding_extents++; 1430 spin_unlock(&BTRFS_I(inode)->lock); 1431 } 1432 1433 /* 1434 * extent_io.c merge_extent_hook, used to track merged delayed allocation 1435 * extents so we can keep track of new extents that are just merged onto old 1436 * extents, such as when we are doing sequential writes, so we can properly 1437 * account for the metadata space we'll need. 1438 */ 1439 static void btrfs_merge_extent_hook(struct inode *inode, 1440 struct extent_state *new, 1441 struct extent_state *other) 1442 { 1443 /* not delalloc, ignore it */ 1444 if (!(other->state & EXTENT_DELALLOC)) 1445 return; 1446 1447 spin_lock(&BTRFS_I(inode)->lock); 1448 BTRFS_I(inode)->outstanding_extents--; 1449 spin_unlock(&BTRFS_I(inode)->lock); 1450 } 1451 1452 /* 1453 * extent_io.c set_bit_hook, used to track delayed allocation 1454 * bytes in this file, and to maintain the list of inodes that 1455 * have pending delalloc work to be done. 1456 */ 1457 static void btrfs_set_bit_hook(struct inode *inode, 1458 struct extent_state *state, int *bits) 1459 { 1460 1461 /* 1462 * set_bit and clear bit hooks normally require _irqsave/restore 1463 * but in this case, we are only testing for the DELALLOC 1464 * bit, which is only set or cleared with irqs on 1465 */ 1466 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1467 struct btrfs_root *root = BTRFS_I(inode)->root; 1468 u64 len = state->end + 1 - state->start; 1469 bool do_list = !btrfs_is_free_space_inode(root, inode); 1470 1471 if (*bits & EXTENT_FIRST_DELALLOC) { 1472 *bits &= ~EXTENT_FIRST_DELALLOC; 1473 } else { 1474 spin_lock(&BTRFS_I(inode)->lock); 1475 BTRFS_I(inode)->outstanding_extents++; 1476 spin_unlock(&BTRFS_I(inode)->lock); 1477 } 1478 1479 spin_lock(&root->fs_info->delalloc_lock); 1480 BTRFS_I(inode)->delalloc_bytes += len; 1481 root->fs_info->delalloc_bytes += len; 1482 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1483 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1484 &root->fs_info->delalloc_inodes); 1485 } 1486 spin_unlock(&root->fs_info->delalloc_lock); 1487 } 1488 } 1489 1490 /* 1491 * extent_io.c clear_bit_hook, see set_bit_hook for why 1492 */ 1493 static void btrfs_clear_bit_hook(struct inode *inode, 1494 struct extent_state *state, int *bits) 1495 { 1496 /* 1497 * set_bit and clear bit hooks normally require _irqsave/restore 1498 * but in this case, we are only testing for the DELALLOC 1499 * bit, which is only set or cleared with irqs on 1500 */ 1501 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1502 struct btrfs_root *root = BTRFS_I(inode)->root; 1503 u64 len = state->end + 1 - state->start; 1504 bool do_list = !btrfs_is_free_space_inode(root, inode); 1505 1506 if (*bits & EXTENT_FIRST_DELALLOC) { 1507 *bits &= ~EXTENT_FIRST_DELALLOC; 1508 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { 1509 spin_lock(&BTRFS_I(inode)->lock); 1510 BTRFS_I(inode)->outstanding_extents--; 1511 spin_unlock(&BTRFS_I(inode)->lock); 1512 } 1513 1514 if (*bits & EXTENT_DO_ACCOUNTING) 1515 btrfs_delalloc_release_metadata(inode, len); 1516 1517 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 1518 && do_list) 1519 btrfs_free_reserved_data_space(inode, len); 1520 1521 spin_lock(&root->fs_info->delalloc_lock); 1522 root->fs_info->delalloc_bytes -= len; 1523 BTRFS_I(inode)->delalloc_bytes -= len; 1524 1525 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && 1526 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1527 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1528 } 1529 spin_unlock(&root->fs_info->delalloc_lock); 1530 } 1531 } 1532 1533 /* 1534 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure 1535 * we don't create bios that span stripes or chunks 1536 */ 1537 int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 1538 size_t size, struct bio *bio, 1539 unsigned long bio_flags) 1540 { 1541 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 1542 struct btrfs_mapping_tree *map_tree; 1543 u64 logical = (u64)bio->bi_sector << 9; 1544 u64 length = 0; 1545 u64 map_length; 1546 int ret; 1547 1548 if (bio_flags & EXTENT_BIO_COMPRESSED) 1549 return 0; 1550 1551 length = bio->bi_size; 1552 map_tree = &root->fs_info->mapping_tree; 1553 map_length = length; 1554 ret = btrfs_map_block(map_tree, READ, logical, 1555 &map_length, NULL, 0); 1556 /* Will always return 0 or 1 with map_multi == NULL */ 1557 BUG_ON(ret < 0); 1558 if (map_length < length + size) 1559 return 1; 1560 return 0; 1561 } 1562 1563 /* 1564 * in order to insert checksums into the metadata in large chunks, 1565 * we wait until bio submission time. All the pages in the bio are 1566 * checksummed and sums are attached onto the ordered extent record. 1567 * 1568 * At IO completion time the cums attached on the ordered extent record 1569 * are inserted into the btree 1570 */ 1571 static int __btrfs_submit_bio_start(struct inode *inode, int rw, 1572 struct bio *bio, int mirror_num, 1573 unsigned long bio_flags, 1574 u64 bio_offset) 1575 { 1576 struct btrfs_root *root = BTRFS_I(inode)->root; 1577 int ret = 0; 1578 1579 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); 1580 BUG_ON(ret); /* -ENOMEM */ 1581 return 0; 1582 } 1583 1584 /* 1585 * in order to insert checksums into the metadata in large chunks, 1586 * we wait until bio submission time. All the pages in the bio are 1587 * checksummed and sums are attached onto the ordered extent record. 1588 * 1589 * At IO completion time the cums attached on the ordered extent record 1590 * are inserted into the btree 1591 */ 1592 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 1593 int mirror_num, unsigned long bio_flags, 1594 u64 bio_offset) 1595 { 1596 struct btrfs_root *root = BTRFS_I(inode)->root; 1597 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1598 } 1599 1600 /* 1601 * extent_io.c submission hook. This does the right thing for csum calculation 1602 * on write, or reading the csums from the tree before a read 1603 */ 1604 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 1605 int mirror_num, unsigned long bio_flags, 1606 u64 bio_offset) 1607 { 1608 struct btrfs_root *root = BTRFS_I(inode)->root; 1609 int ret = 0; 1610 int skip_sum; 1611 int metadata = 0; 1612 1613 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1614 1615 if (btrfs_is_free_space_inode(root, inode)) 1616 metadata = 2; 1617 1618 if (!(rw & REQ_WRITE)) { 1619 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); 1620 if (ret) 1621 return ret; 1622 1623 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1624 return btrfs_submit_compressed_read(inode, bio, 1625 mirror_num, bio_flags); 1626 } else if (!skip_sum) { 1627 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); 1628 if (ret) 1629 return ret; 1630 } 1631 goto mapit; 1632 } else if (!skip_sum) { 1633 /* csum items have already been cloned */ 1634 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 1635 goto mapit; 1636 /* we're doing a write, do the async checksumming */ 1637 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1638 inode, rw, bio, mirror_num, 1639 bio_flags, bio_offset, 1640 __btrfs_submit_bio_start, 1641 __btrfs_submit_bio_done); 1642 } 1643 1644 mapit: 1645 return btrfs_map_bio(root, rw, bio, mirror_num, 0); 1646 } 1647 1648 /* 1649 * given a list of ordered sums record them in the inode. This happens 1650 * at IO completion time based on sums calculated at bio submission time. 1651 */ 1652 static noinline int add_pending_csums(struct btrfs_trans_handle *trans, 1653 struct inode *inode, u64 file_offset, 1654 struct list_head *list) 1655 { 1656 struct btrfs_ordered_sum *sum; 1657 1658 list_for_each_entry(sum, list, list) { 1659 btrfs_csum_file_blocks(trans, 1660 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1661 } 1662 return 0; 1663 } 1664 1665 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 1666 struct extent_state **cached_state) 1667 { 1668 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) 1669 WARN_ON(1); 1670 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1671 cached_state, GFP_NOFS); 1672 } 1673 1674 /* see btrfs_writepage_start_hook for details on why this is required */ 1675 struct btrfs_writepage_fixup { 1676 struct page *page; 1677 struct btrfs_work work; 1678 }; 1679 1680 static void btrfs_writepage_fixup_worker(struct btrfs_work *work) 1681 { 1682 struct btrfs_writepage_fixup *fixup; 1683 struct btrfs_ordered_extent *ordered; 1684 struct extent_state *cached_state = NULL; 1685 struct page *page; 1686 struct inode *inode; 1687 u64 page_start; 1688 u64 page_end; 1689 int ret; 1690 1691 fixup = container_of(work, struct btrfs_writepage_fixup, work); 1692 page = fixup->page; 1693 again: 1694 lock_page(page); 1695 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { 1696 ClearPageChecked(page); 1697 goto out_page; 1698 } 1699 1700 inode = page->mapping->host; 1701 page_start = page_offset(page); 1702 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; 1703 1704 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, 1705 &cached_state); 1706 1707 /* already ordered? We're done */ 1708 if (PagePrivate2(page)) 1709 goto out; 1710 1711 ordered = btrfs_lookup_ordered_extent(inode, page_start); 1712 if (ordered) { 1713 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, 1714 page_end, &cached_state, GFP_NOFS); 1715 unlock_page(page); 1716 btrfs_start_ordered_extent(inode, ordered, 1); 1717 btrfs_put_ordered_extent(ordered); 1718 goto again; 1719 } 1720 1721 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 1722 if (ret) { 1723 mapping_set_error(page->mapping, ret); 1724 end_extent_writepage(page, ret, page_start, page_end); 1725 ClearPageChecked(page); 1726 goto out; 1727 } 1728 1729 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); 1730 ClearPageChecked(page); 1731 set_page_dirty(page); 1732 out: 1733 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, 1734 &cached_state, GFP_NOFS); 1735 out_page: 1736 unlock_page(page); 1737 page_cache_release(page); 1738 kfree(fixup); 1739 } 1740 1741 /* 1742 * There are a few paths in the higher layers of the kernel that directly 1743 * set the page dirty bit without asking the filesystem if it is a 1744 * good idea. This causes problems because we want to make sure COW 1745 * properly happens and the data=ordered rules are followed. 1746 * 1747 * In our case any range that doesn't have the ORDERED bit set 1748 * hasn't been properly setup for IO. We kick off an async process 1749 * to fix it up. The async helper will wait for ordered extents, set 1750 * the delalloc bit and make it safe to write the page. 1751 */ 1752 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) 1753 { 1754 struct inode *inode = page->mapping->host; 1755 struct btrfs_writepage_fixup *fixup; 1756 struct btrfs_root *root = BTRFS_I(inode)->root; 1757 1758 /* this page is properly in the ordered list */ 1759 if (TestClearPagePrivate2(page)) 1760 return 0; 1761 1762 if (PageChecked(page)) 1763 return -EAGAIN; 1764 1765 fixup = kzalloc(sizeof(*fixup), GFP_NOFS); 1766 if (!fixup) 1767 return -EAGAIN; 1768 1769 SetPageChecked(page); 1770 page_cache_get(page); 1771 fixup->work.func = btrfs_writepage_fixup_worker; 1772 fixup->page = page; 1773 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); 1774 return -EBUSY; 1775 } 1776 1777 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, 1778 struct inode *inode, u64 file_pos, 1779 u64 disk_bytenr, u64 disk_num_bytes, 1780 u64 num_bytes, u64 ram_bytes, 1781 u8 compression, u8 encryption, 1782 u16 other_encoding, int extent_type) 1783 { 1784 struct btrfs_root *root = BTRFS_I(inode)->root; 1785 struct btrfs_file_extent_item *fi; 1786 struct btrfs_path *path; 1787 struct extent_buffer *leaf; 1788 struct btrfs_key ins; 1789 u64 hint; 1790 int ret; 1791 1792 path = btrfs_alloc_path(); 1793 if (!path) 1794 return -ENOMEM; 1795 1796 path->leave_spinning = 1; 1797 1798 /* 1799 * we may be replacing one extent in the tree with another. 1800 * The new extent is pinned in the extent map, and we don't want 1801 * to drop it from the cache until it is completely in the btree. 1802 * 1803 * So, tell btrfs_drop_extents to leave this extent in the cache. 1804 * the caller is expected to unpin it and allow it to be merged 1805 * with the others. 1806 */ 1807 ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, 1808 &hint, 0); 1809 if (ret) 1810 goto out; 1811 1812 ins.objectid = btrfs_ino(inode); 1813 ins.offset = file_pos; 1814 ins.type = BTRFS_EXTENT_DATA_KEY; 1815 ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); 1816 if (ret) 1817 goto out; 1818 leaf = path->nodes[0]; 1819 fi = btrfs_item_ptr(leaf, path->slots[0], 1820 struct btrfs_file_extent_item); 1821 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1822 btrfs_set_file_extent_type(leaf, fi, extent_type); 1823 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); 1824 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); 1825 btrfs_set_file_extent_offset(leaf, fi, 0); 1826 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 1827 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); 1828 btrfs_set_file_extent_compression(leaf, fi, compression); 1829 btrfs_set_file_extent_encryption(leaf, fi, encryption); 1830 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 1831 1832 btrfs_unlock_up_safe(path, 1); 1833 btrfs_set_lock_blocking(leaf); 1834 1835 btrfs_mark_buffer_dirty(leaf); 1836 1837 inode_add_bytes(inode, num_bytes); 1838 1839 ins.objectid = disk_bytenr; 1840 ins.offset = disk_num_bytes; 1841 ins.type = BTRFS_EXTENT_ITEM_KEY; 1842 ret = btrfs_alloc_reserved_file_extent(trans, root, 1843 root->root_key.objectid, 1844 btrfs_ino(inode), file_pos, &ins); 1845 out: 1846 btrfs_free_path(path); 1847 1848 return ret; 1849 } 1850 1851 /* 1852 * helper function for btrfs_finish_ordered_io, this 1853 * just reads in some of the csum leaves to prime them into ram 1854 * before we start the transaction. It limits the amount of btree 1855 * reads required while inside the transaction. 1856 */ 1857 /* as ordered data IO finishes, this gets called so we can finish 1858 * an ordered extent if the range of bytes in the file it covers are 1859 * fully written. 1860 */ 1861 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 1862 { 1863 struct inode *inode = ordered_extent->inode; 1864 struct btrfs_root *root = BTRFS_I(inode)->root; 1865 struct btrfs_trans_handle *trans = NULL; 1866 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1867 struct extent_state *cached_state = NULL; 1868 int compress_type = 0; 1869 int ret; 1870 bool nolock; 1871 1872 nolock = btrfs_is_free_space_inode(root, inode); 1873 1874 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 1875 ret = -EIO; 1876 goto out; 1877 } 1878 1879 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1880 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 1881 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1882 if (!ret) { 1883 if (nolock) 1884 trans = btrfs_join_transaction_nolock(root); 1885 else 1886 trans = btrfs_join_transaction(root); 1887 if (IS_ERR(trans)) 1888 return PTR_ERR(trans); 1889 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1890 ret = btrfs_update_inode_fallback(trans, root, inode); 1891 if (ret) /* -ENOMEM or corruption */ 1892 btrfs_abort_transaction(trans, root, ret); 1893 } 1894 goto out; 1895 } 1896 1897 lock_extent_bits(io_tree, ordered_extent->file_offset, 1898 ordered_extent->file_offset + ordered_extent->len - 1, 1899 0, &cached_state); 1900 1901 if (nolock) 1902 trans = btrfs_join_transaction_nolock(root); 1903 else 1904 trans = btrfs_join_transaction(root); 1905 if (IS_ERR(trans)) { 1906 ret = PTR_ERR(trans); 1907 trans = NULL; 1908 goto out_unlock; 1909 } 1910 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1911 1912 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1913 compress_type = ordered_extent->compress_type; 1914 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1915 BUG_ON(compress_type); 1916 ret = btrfs_mark_extent_written(trans, inode, 1917 ordered_extent->file_offset, 1918 ordered_extent->file_offset + 1919 ordered_extent->len); 1920 } else { 1921 BUG_ON(root == root->fs_info->tree_root); 1922 ret = insert_reserved_file_extent(trans, inode, 1923 ordered_extent->file_offset, 1924 ordered_extent->start, 1925 ordered_extent->disk_len, 1926 ordered_extent->len, 1927 ordered_extent->len, 1928 compress_type, 0, 0, 1929 BTRFS_FILE_EXTENT_REG); 1930 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 1931 ordered_extent->file_offset, 1932 ordered_extent->len); 1933 } 1934 1935 if (ret < 0) { 1936 btrfs_abort_transaction(trans, root, ret); 1937 goto out_unlock; 1938 } 1939 1940 add_pending_csums(trans, inode, ordered_extent->file_offset, 1941 &ordered_extent->list); 1942 1943 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1944 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1945 ret = btrfs_update_inode_fallback(trans, root, inode); 1946 if (ret) { /* -ENOMEM or corruption */ 1947 btrfs_abort_transaction(trans, root, ret); 1948 goto out_unlock; 1949 } 1950 } 1951 ret = 0; 1952 out_unlock: 1953 unlock_extent_cached(io_tree, ordered_extent->file_offset, 1954 ordered_extent->file_offset + 1955 ordered_extent->len - 1, &cached_state, GFP_NOFS); 1956 out: 1957 if (root != root->fs_info->tree_root) 1958 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1959 if (trans) { 1960 if (nolock) 1961 btrfs_end_transaction_nolock(trans, root); 1962 else 1963 btrfs_end_transaction(trans, root); 1964 } 1965 1966 if (ret) 1967 clear_extent_uptodate(io_tree, ordered_extent->file_offset, 1968 ordered_extent->file_offset + 1969 ordered_extent->len - 1, NULL, GFP_NOFS); 1970 1971 /* 1972 * This needs to be dont to make sure anybody waiting knows we are done 1973 * upating everything for this ordered extent. 1974 */ 1975 btrfs_remove_ordered_extent(inode, ordered_extent); 1976 1977 /* once for us */ 1978 btrfs_put_ordered_extent(ordered_extent); 1979 /* once for the tree */ 1980 btrfs_put_ordered_extent(ordered_extent); 1981 1982 return ret; 1983 } 1984 1985 static void finish_ordered_fn(struct btrfs_work *work) 1986 { 1987 struct btrfs_ordered_extent *ordered_extent; 1988 ordered_extent = container_of(work, struct btrfs_ordered_extent, work); 1989 btrfs_finish_ordered_io(ordered_extent); 1990 } 1991 1992 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1993 struct extent_state *state, int uptodate) 1994 { 1995 struct inode *inode = page->mapping->host; 1996 struct btrfs_root *root = BTRFS_I(inode)->root; 1997 struct btrfs_ordered_extent *ordered_extent = NULL; 1998 struct btrfs_workers *workers; 1999 2000 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2001 2002 ClearPagePrivate2(page); 2003 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 2004 end - start + 1, uptodate)) 2005 return 0; 2006 2007 ordered_extent->work.func = finish_ordered_fn; 2008 ordered_extent->work.flags = 0; 2009 2010 if (btrfs_is_free_space_inode(root, inode)) 2011 workers = &root->fs_info->endio_freespace_worker; 2012 else 2013 workers = &root->fs_info->endio_write_workers; 2014 btrfs_queue_worker(workers, &ordered_extent->work); 2015 2016 return 0; 2017 } 2018 2019 /* 2020 * when reads are done, we need to check csums to verify the data is correct 2021 * if there's a match, we allow the bio to finish. If not, the code in 2022 * extent_io.c will try to find good copies for us. 2023 */ 2024 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 2025 struct extent_state *state, int mirror) 2026 { 2027 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); 2028 struct inode *inode = page->mapping->host; 2029 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2030 char *kaddr; 2031 u64 private = ~(u32)0; 2032 int ret; 2033 struct btrfs_root *root = BTRFS_I(inode)->root; 2034 u32 csum = ~(u32)0; 2035 2036 if (PageChecked(page)) { 2037 ClearPageChecked(page); 2038 goto good; 2039 } 2040 2041 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 2042 goto good; 2043 2044 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 2045 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 2046 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, 2047 GFP_NOFS); 2048 return 0; 2049 } 2050 2051 if (state && state->start == start) { 2052 private = state->private; 2053 ret = 0; 2054 } else { 2055 ret = get_state_private(io_tree, start, &private); 2056 } 2057 kaddr = kmap_atomic(page); 2058 if (ret) 2059 goto zeroit; 2060 2061 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); 2062 btrfs_csum_final(csum, (char *)&csum); 2063 if (csum != private) 2064 goto zeroit; 2065 2066 kunmap_atomic(kaddr); 2067 good: 2068 return 0; 2069 2070 zeroit: 2071 printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u " 2072 "private %llu\n", 2073 (unsigned long long)btrfs_ino(page->mapping->host), 2074 (unsigned long long)start, csum, 2075 (unsigned long long)private); 2076 memset(kaddr + offset, 1, end - start + 1); 2077 flush_dcache_page(page); 2078 kunmap_atomic(kaddr); 2079 if (private == 0) 2080 return 0; 2081 return -EIO; 2082 } 2083 2084 struct delayed_iput { 2085 struct list_head list; 2086 struct inode *inode; 2087 }; 2088 2089 /* JDM: If this is fs-wide, why can't we add a pointer to 2090 * btrfs_inode instead and avoid the allocation? */ 2091 void btrfs_add_delayed_iput(struct inode *inode) 2092 { 2093 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2094 struct delayed_iput *delayed; 2095 2096 if (atomic_add_unless(&inode->i_count, -1, 1)) 2097 return; 2098 2099 delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); 2100 delayed->inode = inode; 2101 2102 spin_lock(&fs_info->delayed_iput_lock); 2103 list_add_tail(&delayed->list, &fs_info->delayed_iputs); 2104 spin_unlock(&fs_info->delayed_iput_lock); 2105 } 2106 2107 void btrfs_run_delayed_iputs(struct btrfs_root *root) 2108 { 2109 LIST_HEAD(list); 2110 struct btrfs_fs_info *fs_info = root->fs_info; 2111 struct delayed_iput *delayed; 2112 int empty; 2113 2114 spin_lock(&fs_info->delayed_iput_lock); 2115 empty = list_empty(&fs_info->delayed_iputs); 2116 spin_unlock(&fs_info->delayed_iput_lock); 2117 if (empty) 2118 return; 2119 2120 down_read(&root->fs_info->cleanup_work_sem); 2121 spin_lock(&fs_info->delayed_iput_lock); 2122 list_splice_init(&fs_info->delayed_iputs, &list); 2123 spin_unlock(&fs_info->delayed_iput_lock); 2124 2125 while (!list_empty(&list)) { 2126 delayed = list_entry(list.next, struct delayed_iput, list); 2127 list_del(&delayed->list); 2128 iput(delayed->inode); 2129 kfree(delayed); 2130 } 2131 up_read(&root->fs_info->cleanup_work_sem); 2132 } 2133 2134 enum btrfs_orphan_cleanup_state { 2135 ORPHAN_CLEANUP_STARTED = 1, 2136 ORPHAN_CLEANUP_DONE = 2, 2137 }; 2138 2139 /* 2140 * This is called in transaction commit time. If there are no orphan 2141 * files in the subvolume, it removes orphan item and frees block_rsv 2142 * structure. 2143 */ 2144 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2145 struct btrfs_root *root) 2146 { 2147 struct btrfs_block_rsv *block_rsv; 2148 int ret; 2149 2150 if (atomic_read(&root->orphan_inodes) || 2151 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 2152 return; 2153 2154 spin_lock(&root->orphan_lock); 2155 if (atomic_read(&root->orphan_inodes)) { 2156 spin_unlock(&root->orphan_lock); 2157 return; 2158 } 2159 2160 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { 2161 spin_unlock(&root->orphan_lock); 2162 return; 2163 } 2164 2165 block_rsv = root->orphan_block_rsv; 2166 root->orphan_block_rsv = NULL; 2167 spin_unlock(&root->orphan_lock); 2168 2169 if (root->orphan_item_inserted && 2170 btrfs_root_refs(&root->root_item) > 0) { 2171 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 2172 root->root_key.objectid); 2173 BUG_ON(ret); 2174 root->orphan_item_inserted = 0; 2175 } 2176 2177 if (block_rsv) { 2178 WARN_ON(block_rsv->size > 0); 2179 btrfs_free_block_rsv(root, block_rsv); 2180 } 2181 } 2182 2183 /* 2184 * This creates an orphan entry for the given inode in case something goes 2185 * wrong in the middle of an unlink/truncate. 2186 * 2187 * NOTE: caller of this function should reserve 5 units of metadata for 2188 * this function. 2189 */ 2190 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2191 { 2192 struct btrfs_root *root = BTRFS_I(inode)->root; 2193 struct btrfs_block_rsv *block_rsv = NULL; 2194 int reserve = 0; 2195 int insert = 0; 2196 int ret; 2197 2198 if (!root->orphan_block_rsv) { 2199 block_rsv = btrfs_alloc_block_rsv(root); 2200 if (!block_rsv) 2201 return -ENOMEM; 2202 } 2203 2204 spin_lock(&root->orphan_lock); 2205 if (!root->orphan_block_rsv) { 2206 root->orphan_block_rsv = block_rsv; 2207 } else if (block_rsv) { 2208 btrfs_free_block_rsv(root, block_rsv); 2209 block_rsv = NULL; 2210 } 2211 2212 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2213 &BTRFS_I(inode)->runtime_flags)) { 2214 #if 0 2215 /* 2216 * For proper ENOSPC handling, we should do orphan 2217 * cleanup when mounting. But this introduces backward 2218 * compatibility issue. 2219 */ 2220 if (!xchg(&root->orphan_item_inserted, 1)) 2221 insert = 2; 2222 else 2223 insert = 1; 2224 #endif 2225 insert = 1; 2226 atomic_dec(&root->orphan_inodes); 2227 } 2228 2229 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2230 &BTRFS_I(inode)->runtime_flags)) 2231 reserve = 1; 2232 spin_unlock(&root->orphan_lock); 2233 2234 /* grab metadata reservation from transaction handle */ 2235 if (reserve) { 2236 ret = btrfs_orphan_reserve_metadata(trans, inode); 2237 BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */ 2238 } 2239 2240 /* insert an orphan item to track this unlinked/truncated file */ 2241 if (insert >= 1) { 2242 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2243 if (ret && ret != -EEXIST) { 2244 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2245 &BTRFS_I(inode)->runtime_flags); 2246 btrfs_abort_transaction(trans, root, ret); 2247 return ret; 2248 } 2249 ret = 0; 2250 } 2251 2252 /* insert an orphan item to track subvolume contains orphan files */ 2253 if (insert >= 2) { 2254 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, 2255 root->root_key.objectid); 2256 if (ret && ret != -EEXIST) { 2257 btrfs_abort_transaction(trans, root, ret); 2258 return ret; 2259 } 2260 } 2261 return 0; 2262 } 2263 2264 /* 2265 * We have done the truncate/delete so we can go ahead and remove the orphan 2266 * item for this particular inode. 2267 */ 2268 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2269 { 2270 struct btrfs_root *root = BTRFS_I(inode)->root; 2271 int delete_item = 0; 2272 int release_rsv = 0; 2273 int ret = 0; 2274 2275 spin_lock(&root->orphan_lock); 2276 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2277 &BTRFS_I(inode)->runtime_flags)) 2278 delete_item = 1; 2279 2280 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2281 &BTRFS_I(inode)->runtime_flags)) 2282 release_rsv = 1; 2283 spin_unlock(&root->orphan_lock); 2284 2285 if (trans && delete_item) { 2286 ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); 2287 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2288 } 2289 2290 if (release_rsv) { 2291 btrfs_orphan_release_metadata(inode); 2292 atomic_dec(&root->orphan_inodes); 2293 } 2294 2295 return 0; 2296 } 2297 2298 /* 2299 * this cleans up any orphans that may be left on the list from the last use 2300 * of this root. 2301 */ 2302 int btrfs_orphan_cleanup(struct btrfs_root *root) 2303 { 2304 struct btrfs_path *path; 2305 struct extent_buffer *leaf; 2306 struct btrfs_key key, found_key; 2307 struct btrfs_trans_handle *trans; 2308 struct inode *inode; 2309 u64 last_objectid = 0; 2310 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2311 2312 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2313 return 0; 2314 2315 path = btrfs_alloc_path(); 2316 if (!path) { 2317 ret = -ENOMEM; 2318 goto out; 2319 } 2320 path->reada = -1; 2321 2322 key.objectid = BTRFS_ORPHAN_OBJECTID; 2323 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 2324 key.offset = (u64)-1; 2325 2326 while (1) { 2327 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2328 if (ret < 0) 2329 goto out; 2330 2331 /* 2332 * if ret == 0 means we found what we were searching for, which 2333 * is weird, but possible, so only screw with path if we didn't 2334 * find the key and see if we have stuff that matches 2335 */ 2336 if (ret > 0) { 2337 ret = 0; 2338 if (path->slots[0] == 0) 2339 break; 2340 path->slots[0]--; 2341 } 2342 2343 /* pull out the item */ 2344 leaf = path->nodes[0]; 2345 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2346 2347 /* make sure the item matches what we want */ 2348 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 2349 break; 2350 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) 2351 break; 2352 2353 /* release the path since we're done with it */ 2354 btrfs_release_path(path); 2355 2356 /* 2357 * this is where we are basically btrfs_lookup, without the 2358 * crossing root thing. we store the inode number in the 2359 * offset of the orphan item. 2360 */ 2361 2362 if (found_key.offset == last_objectid) { 2363 printk(KERN_ERR "btrfs: Error removing orphan entry, " 2364 "stopping orphan cleanup\n"); 2365 ret = -EINVAL; 2366 goto out; 2367 } 2368 2369 last_objectid = found_key.offset; 2370 2371 found_key.objectid = found_key.offset; 2372 found_key.type = BTRFS_INODE_ITEM_KEY; 2373 found_key.offset = 0; 2374 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2375 ret = PTR_RET(inode); 2376 if (ret && ret != -ESTALE) 2377 goto out; 2378 2379 if (ret == -ESTALE && root == root->fs_info->tree_root) { 2380 struct btrfs_root *dead_root; 2381 struct btrfs_fs_info *fs_info = root->fs_info; 2382 int is_dead_root = 0; 2383 2384 /* 2385 * this is an orphan in the tree root. Currently these 2386 * could come from 2 sources: 2387 * a) a snapshot deletion in progress 2388 * b) a free space cache inode 2389 * We need to distinguish those two, as the snapshot 2390 * orphan must not get deleted. 2391 * find_dead_roots already ran before us, so if this 2392 * is a snapshot deletion, we should find the root 2393 * in the dead_roots list 2394 */ 2395 spin_lock(&fs_info->trans_lock); 2396 list_for_each_entry(dead_root, &fs_info->dead_roots, 2397 root_list) { 2398 if (dead_root->root_key.objectid == 2399 found_key.objectid) { 2400 is_dead_root = 1; 2401 break; 2402 } 2403 } 2404 spin_unlock(&fs_info->trans_lock); 2405 if (is_dead_root) { 2406 /* prevent this orphan from being found again */ 2407 key.offset = found_key.objectid - 1; 2408 continue; 2409 } 2410 } 2411 /* 2412 * Inode is already gone but the orphan item is still there, 2413 * kill the orphan item. 2414 */ 2415 if (ret == -ESTALE) { 2416 trans = btrfs_start_transaction(root, 1); 2417 if (IS_ERR(trans)) { 2418 ret = PTR_ERR(trans); 2419 goto out; 2420 } 2421 printk(KERN_ERR "auto deleting %Lu\n", 2422 found_key.objectid); 2423 ret = btrfs_del_orphan_item(trans, root, 2424 found_key.objectid); 2425 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2426 btrfs_end_transaction(trans, root); 2427 continue; 2428 } 2429 2430 /* 2431 * add this inode to the orphan list so btrfs_orphan_del does 2432 * the proper thing when we hit it 2433 */ 2434 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2435 &BTRFS_I(inode)->runtime_flags); 2436 2437 /* if we have links, this was a truncate, lets do that */ 2438 if (inode->i_nlink) { 2439 if (!S_ISREG(inode->i_mode)) { 2440 WARN_ON(1); 2441 iput(inode); 2442 continue; 2443 } 2444 nr_truncate++; 2445 ret = btrfs_truncate(inode); 2446 } else { 2447 nr_unlink++; 2448 } 2449 2450 /* this will do delete_inode and everything for us */ 2451 iput(inode); 2452 if (ret) 2453 goto out; 2454 } 2455 /* release the path since we're done with it */ 2456 btrfs_release_path(path); 2457 2458 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2459 2460 if (root->orphan_block_rsv) 2461 btrfs_block_rsv_release(root, root->orphan_block_rsv, 2462 (u64)-1); 2463 2464 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2465 trans = btrfs_join_transaction(root); 2466 if (!IS_ERR(trans)) 2467 btrfs_end_transaction(trans, root); 2468 } 2469 2470 if (nr_unlink) 2471 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2472 if (nr_truncate) 2473 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2474 2475 out: 2476 if (ret) 2477 printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret); 2478 btrfs_free_path(path); 2479 return ret; 2480 } 2481 2482 /* 2483 * very simple check to peek ahead in the leaf looking for xattrs. If we 2484 * don't find any xattrs, we know there can't be any acls. 2485 * 2486 * slot is the slot the inode is in, objectid is the objectid of the inode 2487 */ 2488 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 2489 int slot, u64 objectid) 2490 { 2491 u32 nritems = btrfs_header_nritems(leaf); 2492 struct btrfs_key found_key; 2493 int scanned = 0; 2494 2495 slot++; 2496 while (slot < nritems) { 2497 btrfs_item_key_to_cpu(leaf, &found_key, slot); 2498 2499 /* we found a different objectid, there must not be acls */ 2500 if (found_key.objectid != objectid) 2501 return 0; 2502 2503 /* we found an xattr, assume we've got an acl */ 2504 if (found_key.type == BTRFS_XATTR_ITEM_KEY) 2505 return 1; 2506 2507 /* 2508 * we found a key greater than an xattr key, there can't 2509 * be any acls later on 2510 */ 2511 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 2512 return 0; 2513 2514 slot++; 2515 scanned++; 2516 2517 /* 2518 * it goes inode, inode backrefs, xattrs, extents, 2519 * so if there are a ton of hard links to an inode there can 2520 * be a lot of backrefs. Don't waste time searching too hard, 2521 * this is just an optimization 2522 */ 2523 if (scanned >= 8) 2524 break; 2525 } 2526 /* we hit the end of the leaf before we found an xattr or 2527 * something larger than an xattr. We have to assume the inode 2528 * has acls 2529 */ 2530 return 1; 2531 } 2532 2533 /* 2534 * read an inode from the btree into the in-memory inode 2535 */ 2536 static void btrfs_read_locked_inode(struct inode *inode) 2537 { 2538 struct btrfs_path *path; 2539 struct extent_buffer *leaf; 2540 struct btrfs_inode_item *inode_item; 2541 struct btrfs_timespec *tspec; 2542 struct btrfs_root *root = BTRFS_I(inode)->root; 2543 struct btrfs_key location; 2544 int maybe_acls; 2545 u32 rdev; 2546 int ret; 2547 bool filled = false; 2548 2549 ret = btrfs_fill_inode(inode, &rdev); 2550 if (!ret) 2551 filled = true; 2552 2553 path = btrfs_alloc_path(); 2554 if (!path) 2555 goto make_bad; 2556 2557 path->leave_spinning = 1; 2558 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2559 2560 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 2561 if (ret) 2562 goto make_bad; 2563 2564 leaf = path->nodes[0]; 2565 2566 if (filled) 2567 goto cache_acl; 2568 2569 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2570 struct btrfs_inode_item); 2571 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2572 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 2573 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2574 inode->i_gid = btrfs_inode_gid(leaf, inode_item); 2575 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 2576 2577 tspec = btrfs_inode_atime(inode_item); 2578 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2579 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2580 2581 tspec = btrfs_inode_mtime(inode_item); 2582 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2583 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2584 2585 tspec = btrfs_inode_ctime(inode_item); 2586 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2587 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2588 2589 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2590 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2591 inode->i_version = btrfs_inode_sequence(leaf, inode_item); 2592 inode->i_generation = BTRFS_I(inode)->generation; 2593 inode->i_rdev = 0; 2594 rdev = btrfs_inode_rdev(leaf, inode_item); 2595 2596 BTRFS_I(inode)->index_cnt = (u64)-1; 2597 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2598 cache_acl: 2599 /* 2600 * try to precache a NULL acl entry for files that don't have 2601 * any xattrs or acls 2602 */ 2603 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 2604 btrfs_ino(inode)); 2605 if (!maybe_acls) 2606 cache_no_acl(inode); 2607 2608 btrfs_free_path(path); 2609 2610 switch (inode->i_mode & S_IFMT) { 2611 case S_IFREG: 2612 inode->i_mapping->a_ops = &btrfs_aops; 2613 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2614 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 2615 inode->i_fop = &btrfs_file_operations; 2616 inode->i_op = &btrfs_file_inode_operations; 2617 break; 2618 case S_IFDIR: 2619 inode->i_fop = &btrfs_dir_file_operations; 2620 if (root == root->fs_info->tree_root) 2621 inode->i_op = &btrfs_dir_ro_inode_operations; 2622 else 2623 inode->i_op = &btrfs_dir_inode_operations; 2624 break; 2625 case S_IFLNK: 2626 inode->i_op = &btrfs_symlink_inode_operations; 2627 inode->i_mapping->a_ops = &btrfs_symlink_aops; 2628 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2629 break; 2630 default: 2631 inode->i_op = &btrfs_special_inode_operations; 2632 init_special_inode(inode, inode->i_mode, rdev); 2633 break; 2634 } 2635 2636 btrfs_update_iflags(inode); 2637 return; 2638 2639 make_bad: 2640 btrfs_free_path(path); 2641 make_bad_inode(inode); 2642 } 2643 2644 /* 2645 * given a leaf and an inode, copy the inode fields into the leaf 2646 */ 2647 static void fill_inode_item(struct btrfs_trans_handle *trans, 2648 struct extent_buffer *leaf, 2649 struct btrfs_inode_item *item, 2650 struct inode *inode) 2651 { 2652 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2653 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2654 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2655 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2656 btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 2657 2658 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), 2659 inode->i_atime.tv_sec); 2660 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), 2661 inode->i_atime.tv_nsec); 2662 2663 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), 2664 inode->i_mtime.tv_sec); 2665 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), 2666 inode->i_mtime.tv_nsec); 2667 2668 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), 2669 inode->i_ctime.tv_sec); 2670 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), 2671 inode->i_ctime.tv_nsec); 2672 2673 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2674 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 2675 btrfs_set_inode_sequence(leaf, item, inode->i_version); 2676 btrfs_set_inode_transid(leaf, item, trans->transid); 2677 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2678 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2679 btrfs_set_inode_block_group(leaf, item, 0); 2680 } 2681 2682 /* 2683 * copy everything in the in-memory inode into the btree. 2684 */ 2685 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, 2686 struct btrfs_root *root, struct inode *inode) 2687 { 2688 struct btrfs_inode_item *inode_item; 2689 struct btrfs_path *path; 2690 struct extent_buffer *leaf; 2691 int ret; 2692 2693 path = btrfs_alloc_path(); 2694 if (!path) 2695 return -ENOMEM; 2696 2697 path->leave_spinning = 1; 2698 ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 2699 1); 2700 if (ret) { 2701 if (ret > 0) 2702 ret = -ENOENT; 2703 goto failed; 2704 } 2705 2706 btrfs_unlock_up_safe(path, 1); 2707 leaf = path->nodes[0]; 2708 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2709 struct btrfs_inode_item); 2710 2711 fill_inode_item(trans, leaf, inode_item, inode); 2712 btrfs_mark_buffer_dirty(leaf); 2713 btrfs_set_inode_last_trans(trans, inode); 2714 ret = 0; 2715 failed: 2716 btrfs_free_path(path); 2717 return ret; 2718 } 2719 2720 /* 2721 * copy everything in the in-memory inode into the btree. 2722 */ 2723 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2724 struct btrfs_root *root, struct inode *inode) 2725 { 2726 int ret; 2727 2728 /* 2729 * If the inode is a free space inode, we can deadlock during commit 2730 * if we put it into the delayed code. 2731 * 2732 * The data relocation inode should also be directly updated 2733 * without delay 2734 */ 2735 if (!btrfs_is_free_space_inode(root, inode) 2736 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 2737 ret = btrfs_delayed_update_inode(trans, root, inode); 2738 if (!ret) 2739 btrfs_set_inode_last_trans(trans, inode); 2740 return ret; 2741 } 2742 2743 return btrfs_update_inode_item(trans, root, inode); 2744 } 2745 2746 static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 2747 struct btrfs_root *root, struct inode *inode) 2748 { 2749 int ret; 2750 2751 ret = btrfs_update_inode(trans, root, inode); 2752 if (ret == -ENOSPC) 2753 return btrfs_update_inode_item(trans, root, inode); 2754 return ret; 2755 } 2756 2757 /* 2758 * unlink helper that gets used here in inode.c and in the tree logging 2759 * recovery code. It remove a link in a directory with a given name, and 2760 * also drops the back refs in the inode to the directory 2761 */ 2762 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2763 struct btrfs_root *root, 2764 struct inode *dir, struct inode *inode, 2765 const char *name, int name_len) 2766 { 2767 struct btrfs_path *path; 2768 int ret = 0; 2769 struct extent_buffer *leaf; 2770 struct btrfs_dir_item *di; 2771 struct btrfs_key key; 2772 u64 index; 2773 u64 ino = btrfs_ino(inode); 2774 u64 dir_ino = btrfs_ino(dir); 2775 2776 path = btrfs_alloc_path(); 2777 if (!path) { 2778 ret = -ENOMEM; 2779 goto out; 2780 } 2781 2782 path->leave_spinning = 1; 2783 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 2784 name, name_len, -1); 2785 if (IS_ERR(di)) { 2786 ret = PTR_ERR(di); 2787 goto err; 2788 } 2789 if (!di) { 2790 ret = -ENOENT; 2791 goto err; 2792 } 2793 leaf = path->nodes[0]; 2794 btrfs_dir_item_key_to_cpu(leaf, di, &key); 2795 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2796 if (ret) 2797 goto err; 2798 btrfs_release_path(path); 2799 2800 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, 2801 dir_ino, &index); 2802 if (ret) { 2803 printk(KERN_INFO "btrfs failed to delete reference to %.*s, " 2804 "inode %llu parent %llu\n", name_len, name, 2805 (unsigned long long)ino, (unsigned long long)dir_ino); 2806 btrfs_abort_transaction(trans, root, ret); 2807 goto err; 2808 } 2809 2810 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 2811 if (ret) { 2812 btrfs_abort_transaction(trans, root, ret); 2813 goto err; 2814 } 2815 2816 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2817 inode, dir_ino); 2818 if (ret != 0 && ret != -ENOENT) { 2819 btrfs_abort_transaction(trans, root, ret); 2820 goto err; 2821 } 2822 2823 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2824 dir, index); 2825 if (ret == -ENOENT) 2826 ret = 0; 2827 err: 2828 btrfs_free_path(path); 2829 if (ret) 2830 goto out; 2831 2832 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2833 inode_inc_iversion(inode); 2834 inode_inc_iversion(dir); 2835 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2836 btrfs_update_inode(trans, root, dir); 2837 out: 2838 return ret; 2839 } 2840 2841 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2842 struct btrfs_root *root, 2843 struct inode *dir, struct inode *inode, 2844 const char *name, int name_len) 2845 { 2846 int ret; 2847 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 2848 if (!ret) { 2849 btrfs_drop_nlink(inode); 2850 ret = btrfs_update_inode(trans, root, inode); 2851 } 2852 return ret; 2853 } 2854 2855 2856 /* helper to check if there is any shared block in the path */ 2857 static int check_path_shared(struct btrfs_root *root, 2858 struct btrfs_path *path) 2859 { 2860 struct extent_buffer *eb; 2861 int level; 2862 u64 refs = 1; 2863 2864 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2865 int ret; 2866 2867 if (!path->nodes[level]) 2868 break; 2869 eb = path->nodes[level]; 2870 if (!btrfs_block_can_be_shared(root, eb)) 2871 continue; 2872 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, 2873 &refs, NULL); 2874 if (refs > 1) 2875 return 1; 2876 } 2877 return 0; 2878 } 2879 2880 /* 2881 * helper to start transaction for unlink and rmdir. 2882 * 2883 * unlink and rmdir are special in btrfs, they do not always free space. 2884 * so in enospc case, we should make sure they will free space before 2885 * allowing them to use the global metadata reservation. 2886 */ 2887 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, 2888 struct dentry *dentry) 2889 { 2890 struct btrfs_trans_handle *trans; 2891 struct btrfs_root *root = BTRFS_I(dir)->root; 2892 struct btrfs_path *path; 2893 struct btrfs_inode_ref *ref; 2894 struct btrfs_dir_item *di; 2895 struct inode *inode = dentry->d_inode; 2896 u64 index; 2897 int check_link = 1; 2898 int err = -ENOSPC; 2899 int ret; 2900 u64 ino = btrfs_ino(inode); 2901 u64 dir_ino = btrfs_ino(dir); 2902 2903 /* 2904 * 1 for the possible orphan item 2905 * 1 for the dir item 2906 * 1 for the dir index 2907 * 1 for the inode ref 2908 * 1 for the inode ref in the tree log 2909 * 2 for the dir entries in the log 2910 * 1 for the inode 2911 */ 2912 trans = btrfs_start_transaction(root, 8); 2913 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2914 return trans; 2915 2916 if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 2917 return ERR_PTR(-ENOSPC); 2918 2919 /* check if there is someone else holds reference */ 2920 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) 2921 return ERR_PTR(-ENOSPC); 2922 2923 if (atomic_read(&inode->i_count) > 2) 2924 return ERR_PTR(-ENOSPC); 2925 2926 if (xchg(&root->fs_info->enospc_unlink, 1)) 2927 return ERR_PTR(-ENOSPC); 2928 2929 path = btrfs_alloc_path(); 2930 if (!path) { 2931 root->fs_info->enospc_unlink = 0; 2932 return ERR_PTR(-ENOMEM); 2933 } 2934 2935 /* 1 for the orphan item */ 2936 trans = btrfs_start_transaction(root, 1); 2937 if (IS_ERR(trans)) { 2938 btrfs_free_path(path); 2939 root->fs_info->enospc_unlink = 0; 2940 return trans; 2941 } 2942 2943 path->skip_locking = 1; 2944 path->search_commit_root = 1; 2945 2946 ret = btrfs_lookup_inode(trans, root, path, 2947 &BTRFS_I(dir)->location, 0); 2948 if (ret < 0) { 2949 err = ret; 2950 goto out; 2951 } 2952 if (ret == 0) { 2953 if (check_path_shared(root, path)) 2954 goto out; 2955 } else { 2956 check_link = 0; 2957 } 2958 btrfs_release_path(path); 2959 2960 ret = btrfs_lookup_inode(trans, root, path, 2961 &BTRFS_I(inode)->location, 0); 2962 if (ret < 0) { 2963 err = ret; 2964 goto out; 2965 } 2966 if (ret == 0) { 2967 if (check_path_shared(root, path)) 2968 goto out; 2969 } else { 2970 check_link = 0; 2971 } 2972 btrfs_release_path(path); 2973 2974 if (ret == 0 && S_ISREG(inode->i_mode)) { 2975 ret = btrfs_lookup_file_extent(trans, root, path, 2976 ino, (u64)-1, 0); 2977 if (ret < 0) { 2978 err = ret; 2979 goto out; 2980 } 2981 BUG_ON(ret == 0); /* Corruption */ 2982 if (check_path_shared(root, path)) 2983 goto out; 2984 btrfs_release_path(path); 2985 } 2986 2987 if (!check_link) { 2988 err = 0; 2989 goto out; 2990 } 2991 2992 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 2993 dentry->d_name.name, dentry->d_name.len, 0); 2994 if (IS_ERR(di)) { 2995 err = PTR_ERR(di); 2996 goto out; 2997 } 2998 if (di) { 2999 if (check_path_shared(root, path)) 3000 goto out; 3001 } else { 3002 err = 0; 3003 goto out; 3004 } 3005 btrfs_release_path(path); 3006 3007 ref = btrfs_lookup_inode_ref(trans, root, path, 3008 dentry->d_name.name, dentry->d_name.len, 3009 ino, dir_ino, 0); 3010 if (IS_ERR(ref)) { 3011 err = PTR_ERR(ref); 3012 goto out; 3013 } 3014 BUG_ON(!ref); /* Logic error */ 3015 if (check_path_shared(root, path)) 3016 goto out; 3017 index = btrfs_inode_ref_index(path->nodes[0], ref); 3018 btrfs_release_path(path); 3019 3020 /* 3021 * This is a commit root search, if we can lookup inode item and other 3022 * relative items in the commit root, it means the transaction of 3023 * dir/file creation has been committed, and the dir index item that we 3024 * delay to insert has also been inserted into the commit root. So 3025 * we needn't worry about the delayed insertion of the dir index item 3026 * here. 3027 */ 3028 di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index, 3029 dentry->d_name.name, dentry->d_name.len, 0); 3030 if (IS_ERR(di)) { 3031 err = PTR_ERR(di); 3032 goto out; 3033 } 3034 BUG_ON(ret == -ENOENT); 3035 if (check_path_shared(root, path)) 3036 goto out; 3037 3038 err = 0; 3039 out: 3040 btrfs_free_path(path); 3041 /* Migrate the orphan reservation over */ 3042 if (!err) 3043 err = btrfs_block_rsv_migrate(trans->block_rsv, 3044 &root->fs_info->global_block_rsv, 3045 trans->bytes_reserved); 3046 3047 if (err) { 3048 btrfs_end_transaction(trans, root); 3049 root->fs_info->enospc_unlink = 0; 3050 return ERR_PTR(err); 3051 } 3052 3053 trans->block_rsv = &root->fs_info->global_block_rsv; 3054 return trans; 3055 } 3056 3057 static void __unlink_end_trans(struct btrfs_trans_handle *trans, 3058 struct btrfs_root *root) 3059 { 3060 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 3061 btrfs_block_rsv_release(root, trans->block_rsv, 3062 trans->bytes_reserved); 3063 trans->block_rsv = &root->fs_info->trans_block_rsv; 3064 BUG_ON(!root->fs_info->enospc_unlink); 3065 root->fs_info->enospc_unlink = 0; 3066 } 3067 btrfs_end_transaction(trans, root); 3068 } 3069 3070 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 3071 { 3072 struct btrfs_root *root = BTRFS_I(dir)->root; 3073 struct btrfs_trans_handle *trans; 3074 struct inode *inode = dentry->d_inode; 3075 int ret; 3076 unsigned long nr = 0; 3077 3078 trans = __unlink_start_trans(dir, dentry); 3079 if (IS_ERR(trans)) 3080 return PTR_ERR(trans); 3081 3082 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 3083 3084 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3085 dentry->d_name.name, dentry->d_name.len); 3086 if (ret) 3087 goto out; 3088 3089 if (inode->i_nlink == 0) { 3090 ret = btrfs_orphan_add(trans, inode); 3091 if (ret) 3092 goto out; 3093 } 3094 3095 out: 3096 nr = trans->blocks_used; 3097 __unlink_end_trans(trans, root); 3098 btrfs_btree_balance_dirty(root, nr); 3099 return ret; 3100 } 3101 3102 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 3103 struct btrfs_root *root, 3104 struct inode *dir, u64 objectid, 3105 const char *name, int name_len) 3106 { 3107 struct btrfs_path *path; 3108 struct extent_buffer *leaf; 3109 struct btrfs_dir_item *di; 3110 struct btrfs_key key; 3111 u64 index; 3112 int ret; 3113 u64 dir_ino = btrfs_ino(dir); 3114 3115 path = btrfs_alloc_path(); 3116 if (!path) 3117 return -ENOMEM; 3118 3119 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 3120 name, name_len, -1); 3121 if (IS_ERR_OR_NULL(di)) { 3122 if (!di) 3123 ret = -ENOENT; 3124 else 3125 ret = PTR_ERR(di); 3126 goto out; 3127 } 3128 3129 leaf = path->nodes[0]; 3130 btrfs_dir_item_key_to_cpu(leaf, di, &key); 3131 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 3132 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3133 if (ret) { 3134 btrfs_abort_transaction(trans, root, ret); 3135 goto out; 3136 } 3137 btrfs_release_path(path); 3138 3139 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, 3140 objectid, root->root_key.objectid, 3141 dir_ino, &index, name, name_len); 3142 if (ret < 0) { 3143 if (ret != -ENOENT) { 3144 btrfs_abort_transaction(trans, root, ret); 3145 goto out; 3146 } 3147 di = btrfs_search_dir_index_item(root, path, dir_ino, 3148 name, name_len); 3149 if (IS_ERR_OR_NULL(di)) { 3150 if (!di) 3151 ret = -ENOENT; 3152 else 3153 ret = PTR_ERR(di); 3154 btrfs_abort_transaction(trans, root, ret); 3155 goto out; 3156 } 3157 3158 leaf = path->nodes[0]; 3159 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3160 btrfs_release_path(path); 3161 index = key.offset; 3162 } 3163 btrfs_release_path(path); 3164 3165 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 3166 if (ret) { 3167 btrfs_abort_transaction(trans, root, ret); 3168 goto out; 3169 } 3170 3171 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3172 inode_inc_iversion(dir); 3173 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3174 ret = btrfs_update_inode(trans, root, dir); 3175 if (ret) 3176 btrfs_abort_transaction(trans, root, ret); 3177 out: 3178 btrfs_free_path(path); 3179 return ret; 3180 } 3181 3182 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 3183 { 3184 struct inode *inode = dentry->d_inode; 3185 int err = 0; 3186 struct btrfs_root *root = BTRFS_I(dir)->root; 3187 struct btrfs_trans_handle *trans; 3188 unsigned long nr = 0; 3189 3190 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 3191 btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) 3192 return -ENOTEMPTY; 3193 3194 trans = __unlink_start_trans(dir, dentry); 3195 if (IS_ERR(trans)) 3196 return PTR_ERR(trans); 3197 3198 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 3199 err = btrfs_unlink_subvol(trans, root, dir, 3200 BTRFS_I(inode)->location.objectid, 3201 dentry->d_name.name, 3202 dentry->d_name.len); 3203 goto out; 3204 } 3205 3206 err = btrfs_orphan_add(trans, inode); 3207 if (err) 3208 goto out; 3209 3210 /* now the directory is empty */ 3211 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3212 dentry->d_name.name, dentry->d_name.len); 3213 if (!err) 3214 btrfs_i_size_write(inode, 0); 3215 out: 3216 nr = trans->blocks_used; 3217 __unlink_end_trans(trans, root); 3218 btrfs_btree_balance_dirty(root, nr); 3219 3220 return err; 3221 } 3222 3223 /* 3224 * this can truncate away extent items, csum items and directory items. 3225 * It starts at a high offset and removes keys until it can't find 3226 * any higher than new_size 3227 * 3228 * csum items that cross the new i_size are truncated to the new size 3229 * as well. 3230 * 3231 * min_type is the minimum key type to truncate down to. If set to 0, this 3232 * will kill all the items on this inode, including the INODE_ITEM_KEY. 3233 */ 3234 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 3235 struct btrfs_root *root, 3236 struct inode *inode, 3237 u64 new_size, u32 min_type) 3238 { 3239 struct btrfs_path *path; 3240 struct extent_buffer *leaf; 3241 struct btrfs_file_extent_item *fi; 3242 struct btrfs_key key; 3243 struct btrfs_key found_key; 3244 u64 extent_start = 0; 3245 u64 extent_num_bytes = 0; 3246 u64 extent_offset = 0; 3247 u64 item_end = 0; 3248 u64 mask = root->sectorsize - 1; 3249 u32 found_type = (u8)-1; 3250 int found_extent; 3251 int del_item; 3252 int pending_del_nr = 0; 3253 int pending_del_slot = 0; 3254 int extent_type = -1; 3255 int ret; 3256 int err = 0; 3257 u64 ino = btrfs_ino(inode); 3258 3259 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3260 3261 path = btrfs_alloc_path(); 3262 if (!path) 3263 return -ENOMEM; 3264 path->reada = -1; 3265 3266 if (root->ref_cows || root == root->fs_info->tree_root) 3267 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3268 3269 /* 3270 * This function is also used to drop the items in the log tree before 3271 * we relog the inode, so if root != BTRFS_I(inode)->root, it means 3272 * it is used to drop the loged items. So we shouldn't kill the delayed 3273 * items. 3274 */ 3275 if (min_type == 0 && root == BTRFS_I(inode)->root) 3276 btrfs_kill_delayed_inode_items(inode); 3277 3278 key.objectid = ino; 3279 key.offset = (u64)-1; 3280 key.type = (u8)-1; 3281 3282 search_again: 3283 path->leave_spinning = 1; 3284 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3285 if (ret < 0) { 3286 err = ret; 3287 goto out; 3288 } 3289 3290 if (ret > 0) { 3291 /* there are no items in the tree for us to truncate, we're 3292 * done 3293 */ 3294 if (path->slots[0] == 0) 3295 goto out; 3296 path->slots[0]--; 3297 } 3298 3299 while (1) { 3300 fi = NULL; 3301 leaf = path->nodes[0]; 3302 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3303 found_type = btrfs_key_type(&found_key); 3304 3305 if (found_key.objectid != ino) 3306 break; 3307 3308 if (found_type < min_type) 3309 break; 3310 3311 item_end = found_key.offset; 3312 if (found_type == BTRFS_EXTENT_DATA_KEY) { 3313 fi = btrfs_item_ptr(leaf, path->slots[0], 3314 struct btrfs_file_extent_item); 3315 extent_type = btrfs_file_extent_type(leaf, fi); 3316 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3317 item_end += 3318 btrfs_file_extent_num_bytes(leaf, fi); 3319 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3320 item_end += btrfs_file_extent_inline_len(leaf, 3321 fi); 3322 } 3323 item_end--; 3324 } 3325 if (found_type > min_type) { 3326 del_item = 1; 3327 } else { 3328 if (item_end < new_size) 3329 break; 3330 if (found_key.offset >= new_size) 3331 del_item = 1; 3332 else 3333 del_item = 0; 3334 } 3335 found_extent = 0; 3336 /* FIXME, shrink the extent if the ref count is only 1 */ 3337 if (found_type != BTRFS_EXTENT_DATA_KEY) 3338 goto delete; 3339 3340 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3341 u64 num_dec; 3342 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 3343 if (!del_item) { 3344 u64 orig_num_bytes = 3345 btrfs_file_extent_num_bytes(leaf, fi); 3346 extent_num_bytes = new_size - 3347 found_key.offset + root->sectorsize - 1; 3348 extent_num_bytes = extent_num_bytes & 3349 ~((u64)root->sectorsize - 1); 3350 btrfs_set_file_extent_num_bytes(leaf, fi, 3351 extent_num_bytes); 3352 num_dec = (orig_num_bytes - 3353 extent_num_bytes); 3354 if (root->ref_cows && extent_start != 0) 3355 inode_sub_bytes(inode, num_dec); 3356 btrfs_mark_buffer_dirty(leaf); 3357 } else { 3358 extent_num_bytes = 3359 btrfs_file_extent_disk_num_bytes(leaf, 3360 fi); 3361 extent_offset = found_key.offset - 3362 btrfs_file_extent_offset(leaf, fi); 3363 3364 /* FIXME blocksize != 4096 */ 3365 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 3366 if (extent_start != 0) { 3367 found_extent = 1; 3368 if (root->ref_cows) 3369 inode_sub_bytes(inode, num_dec); 3370 } 3371 } 3372 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3373 /* 3374 * we can't truncate inline items that have had 3375 * special encodings 3376 */ 3377 if (!del_item && 3378 btrfs_file_extent_compression(leaf, fi) == 0 && 3379 btrfs_file_extent_encryption(leaf, fi) == 0 && 3380 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 3381 u32 size = new_size - found_key.offset; 3382 3383 if (root->ref_cows) { 3384 inode_sub_bytes(inode, item_end + 1 - 3385 new_size); 3386 } 3387 size = 3388 btrfs_file_extent_calc_inline_size(size); 3389 btrfs_truncate_item(trans, root, path, 3390 size, 1); 3391 } else if (root->ref_cows) { 3392 inode_sub_bytes(inode, item_end + 1 - 3393 found_key.offset); 3394 } 3395 } 3396 delete: 3397 if (del_item) { 3398 if (!pending_del_nr) { 3399 /* no pending yet, add ourselves */ 3400 pending_del_slot = path->slots[0]; 3401 pending_del_nr = 1; 3402 } else if (pending_del_nr && 3403 path->slots[0] + 1 == pending_del_slot) { 3404 /* hop on the pending chunk */ 3405 pending_del_nr++; 3406 pending_del_slot = path->slots[0]; 3407 } else { 3408 BUG(); 3409 } 3410 } else { 3411 break; 3412 } 3413 if (found_extent && (root->ref_cows || 3414 root == root->fs_info->tree_root)) { 3415 btrfs_set_path_blocking(path); 3416 ret = btrfs_free_extent(trans, root, extent_start, 3417 extent_num_bytes, 0, 3418 btrfs_header_owner(leaf), 3419 ino, extent_offset, 0); 3420 BUG_ON(ret); 3421 } 3422 3423 if (found_type == BTRFS_INODE_ITEM_KEY) 3424 break; 3425 3426 if (path->slots[0] == 0 || 3427 path->slots[0] != pending_del_slot) { 3428 if (root->ref_cows && 3429 BTRFS_I(inode)->location.objectid != 3430 BTRFS_FREE_INO_OBJECTID) { 3431 err = -EAGAIN; 3432 goto out; 3433 } 3434 if (pending_del_nr) { 3435 ret = btrfs_del_items(trans, root, path, 3436 pending_del_slot, 3437 pending_del_nr); 3438 if (ret) { 3439 btrfs_abort_transaction(trans, 3440 root, ret); 3441 goto error; 3442 } 3443 pending_del_nr = 0; 3444 } 3445 btrfs_release_path(path); 3446 goto search_again; 3447 } else { 3448 path->slots[0]--; 3449 } 3450 } 3451 out: 3452 if (pending_del_nr) { 3453 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3454 pending_del_nr); 3455 if (ret) 3456 btrfs_abort_transaction(trans, root, ret); 3457 } 3458 error: 3459 btrfs_free_path(path); 3460 return err; 3461 } 3462 3463 /* 3464 * taken from block_truncate_page, but does cow as it zeros out 3465 * any bytes left in the last page in the file. 3466 */ 3467 static int btrfs_truncate_page(struct address_space *mapping, loff_t from) 3468 { 3469 struct inode *inode = mapping->host; 3470 struct btrfs_root *root = BTRFS_I(inode)->root; 3471 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3472 struct btrfs_ordered_extent *ordered; 3473 struct extent_state *cached_state = NULL; 3474 char *kaddr; 3475 u32 blocksize = root->sectorsize; 3476 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3477 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3478 struct page *page; 3479 gfp_t mask = btrfs_alloc_write_mask(mapping); 3480 int ret = 0; 3481 u64 page_start; 3482 u64 page_end; 3483 3484 if ((offset & (blocksize - 1)) == 0) 3485 goto out; 3486 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 3487 if (ret) 3488 goto out; 3489 3490 ret = -ENOMEM; 3491 again: 3492 page = find_or_create_page(mapping, index, mask); 3493 if (!page) { 3494 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3495 goto out; 3496 } 3497 3498 page_start = page_offset(page); 3499 page_end = page_start + PAGE_CACHE_SIZE - 1; 3500 3501 if (!PageUptodate(page)) { 3502 ret = btrfs_readpage(NULL, page); 3503 lock_page(page); 3504 if (page->mapping != mapping) { 3505 unlock_page(page); 3506 page_cache_release(page); 3507 goto again; 3508 } 3509 if (!PageUptodate(page)) { 3510 ret = -EIO; 3511 goto out_unlock; 3512 } 3513 } 3514 wait_on_page_writeback(page); 3515 3516 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); 3517 set_page_extent_mapped(page); 3518 3519 ordered = btrfs_lookup_ordered_extent(inode, page_start); 3520 if (ordered) { 3521 unlock_extent_cached(io_tree, page_start, page_end, 3522 &cached_state, GFP_NOFS); 3523 unlock_page(page); 3524 page_cache_release(page); 3525 btrfs_start_ordered_extent(inode, ordered, 1); 3526 btrfs_put_ordered_extent(ordered); 3527 goto again; 3528 } 3529 3530 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 3531 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3532 0, 0, &cached_state, GFP_NOFS); 3533 3534 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 3535 &cached_state); 3536 if (ret) { 3537 unlock_extent_cached(io_tree, page_start, page_end, 3538 &cached_state, GFP_NOFS); 3539 goto out_unlock; 3540 } 3541 3542 ret = 0; 3543 if (offset != PAGE_CACHE_SIZE) { 3544 kaddr = kmap(page); 3545 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 3546 flush_dcache_page(page); 3547 kunmap(page); 3548 } 3549 ClearPageChecked(page); 3550 set_page_dirty(page); 3551 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, 3552 GFP_NOFS); 3553 3554 out_unlock: 3555 if (ret) 3556 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3557 unlock_page(page); 3558 page_cache_release(page); 3559 out: 3560 return ret; 3561 } 3562 3563 /* 3564 * This function puts in dummy file extents for the area we're creating a hole 3565 * for. So if we are truncating this file to a larger size we need to insert 3566 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for 3567 * the range between oldsize and size 3568 */ 3569 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 3570 { 3571 struct btrfs_trans_handle *trans; 3572 struct btrfs_root *root = BTRFS_I(inode)->root; 3573 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3574 struct extent_map *em = NULL; 3575 struct extent_state *cached_state = NULL; 3576 u64 mask = root->sectorsize - 1; 3577 u64 hole_start = (oldsize + mask) & ~mask; 3578 u64 block_end = (size + mask) & ~mask; 3579 u64 last_byte; 3580 u64 cur_offset; 3581 u64 hole_size; 3582 int err = 0; 3583 3584 if (size <= hole_start) 3585 return 0; 3586 3587 while (1) { 3588 struct btrfs_ordered_extent *ordered; 3589 btrfs_wait_ordered_range(inode, hole_start, 3590 block_end - hole_start); 3591 lock_extent_bits(io_tree, hole_start, block_end - 1, 0, 3592 &cached_state); 3593 ordered = btrfs_lookup_ordered_extent(inode, hole_start); 3594 if (!ordered) 3595 break; 3596 unlock_extent_cached(io_tree, hole_start, block_end - 1, 3597 &cached_state, GFP_NOFS); 3598 btrfs_put_ordered_extent(ordered); 3599 } 3600 3601 cur_offset = hole_start; 3602 while (1) { 3603 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 3604 block_end - cur_offset, 0); 3605 if (IS_ERR(em)) { 3606 err = PTR_ERR(em); 3607 break; 3608 } 3609 last_byte = min(extent_map_end(em), block_end); 3610 last_byte = (last_byte + mask) & ~mask; 3611 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3612 u64 hint_byte = 0; 3613 hole_size = last_byte - cur_offset; 3614 3615 trans = btrfs_start_transaction(root, 3); 3616 if (IS_ERR(trans)) { 3617 err = PTR_ERR(trans); 3618 break; 3619 } 3620 3621 err = btrfs_drop_extents(trans, inode, cur_offset, 3622 cur_offset + hole_size, 3623 &hint_byte, 1); 3624 if (err) { 3625 btrfs_abort_transaction(trans, root, err); 3626 btrfs_end_transaction(trans, root); 3627 break; 3628 } 3629 3630 err = btrfs_insert_file_extent(trans, root, 3631 btrfs_ino(inode), cur_offset, 0, 3632 0, hole_size, 0, hole_size, 3633 0, 0, 0); 3634 if (err) { 3635 btrfs_abort_transaction(trans, root, err); 3636 btrfs_end_transaction(trans, root); 3637 break; 3638 } 3639 3640 btrfs_drop_extent_cache(inode, hole_start, 3641 last_byte - 1, 0); 3642 3643 btrfs_update_inode(trans, root, inode); 3644 btrfs_end_transaction(trans, root); 3645 } 3646 free_extent_map(em); 3647 em = NULL; 3648 cur_offset = last_byte; 3649 if (cur_offset >= block_end) 3650 break; 3651 } 3652 3653 free_extent_map(em); 3654 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3655 GFP_NOFS); 3656 return err; 3657 } 3658 3659 static int btrfs_setsize(struct inode *inode, loff_t newsize) 3660 { 3661 struct btrfs_root *root = BTRFS_I(inode)->root; 3662 struct btrfs_trans_handle *trans; 3663 loff_t oldsize = i_size_read(inode); 3664 int ret; 3665 3666 if (newsize == oldsize) 3667 return 0; 3668 3669 if (newsize > oldsize) { 3670 truncate_pagecache(inode, oldsize, newsize); 3671 ret = btrfs_cont_expand(inode, oldsize, newsize); 3672 if (ret) 3673 return ret; 3674 3675 trans = btrfs_start_transaction(root, 1); 3676 if (IS_ERR(trans)) 3677 return PTR_ERR(trans); 3678 3679 i_size_write(inode, newsize); 3680 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 3681 ret = btrfs_update_inode(trans, root, inode); 3682 btrfs_end_transaction(trans, root); 3683 } else { 3684 3685 /* 3686 * We're truncating a file that used to have good data down to 3687 * zero. Make sure it gets into the ordered flush list so that 3688 * any new writes get down to disk quickly. 3689 */ 3690 if (newsize == 0) 3691 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 3692 &BTRFS_I(inode)->runtime_flags); 3693 3694 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 3695 truncate_setsize(inode, newsize); 3696 ret = btrfs_truncate(inode); 3697 } 3698 3699 return ret; 3700 } 3701 3702 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3703 { 3704 struct inode *inode = dentry->d_inode; 3705 struct btrfs_root *root = BTRFS_I(inode)->root; 3706 int err; 3707 3708 if (btrfs_root_readonly(root)) 3709 return -EROFS; 3710 3711 err = inode_change_ok(inode, attr); 3712 if (err) 3713 return err; 3714 3715 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3716 err = btrfs_setsize(inode, attr->ia_size); 3717 if (err) 3718 return err; 3719 } 3720 3721 if (attr->ia_valid) { 3722 setattr_copy(inode, attr); 3723 inode_inc_iversion(inode); 3724 err = btrfs_dirty_inode(inode); 3725 3726 if (!err && attr->ia_valid & ATTR_MODE) 3727 err = btrfs_acl_chmod(inode); 3728 } 3729 3730 return err; 3731 } 3732 3733 void btrfs_evict_inode(struct inode *inode) 3734 { 3735 struct btrfs_trans_handle *trans; 3736 struct btrfs_root *root = BTRFS_I(inode)->root; 3737 struct btrfs_block_rsv *rsv, *global_rsv; 3738 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 3739 unsigned long nr; 3740 int ret; 3741 3742 trace_btrfs_inode_evict(inode); 3743 3744 truncate_inode_pages(&inode->i_data, 0); 3745 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3746 btrfs_is_free_space_inode(root, inode))) 3747 goto no_delete; 3748 3749 if (is_bad_inode(inode)) { 3750 btrfs_orphan_del(NULL, inode); 3751 goto no_delete; 3752 } 3753 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ 3754 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3755 3756 if (root->fs_info->log_root_recovering) { 3757 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3758 &BTRFS_I(inode)->runtime_flags)); 3759 goto no_delete; 3760 } 3761 3762 if (inode->i_nlink > 0) { 3763 BUG_ON(btrfs_root_refs(&root->root_item) != 0); 3764 goto no_delete; 3765 } 3766 3767 rsv = btrfs_alloc_block_rsv(root); 3768 if (!rsv) { 3769 btrfs_orphan_del(NULL, inode); 3770 goto no_delete; 3771 } 3772 rsv->size = min_size; 3773 global_rsv = &root->fs_info->global_block_rsv; 3774 3775 btrfs_i_size_write(inode, 0); 3776 3777 /* 3778 * This is a bit simpler than btrfs_truncate since 3779 * 3780 * 1) We've already reserved our space for our orphan item in the 3781 * unlink. 3782 * 2) We're going to delete the inode item, so we don't need to update 3783 * it at all. 3784 * 3785 * So we just need to reserve some slack space in case we add bytes when 3786 * doing the truncate. 3787 */ 3788 while (1) { 3789 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3790 3791 /* 3792 * Try and steal from the global reserve since we will 3793 * likely not use this space anyway, we want to try as 3794 * hard as possible to get this to work. 3795 */ 3796 if (ret) 3797 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); 3798 3799 if (ret) { 3800 printk(KERN_WARNING "Could not get space for a " 3801 "delete, will truncate on mount %d\n", ret); 3802 btrfs_orphan_del(NULL, inode); 3803 btrfs_free_block_rsv(root, rsv); 3804 goto no_delete; 3805 } 3806 3807 trans = btrfs_start_transaction(root, 0); 3808 if (IS_ERR(trans)) { 3809 btrfs_orphan_del(NULL, inode); 3810 btrfs_free_block_rsv(root, rsv); 3811 goto no_delete; 3812 } 3813 3814 trans->block_rsv = rsv; 3815 3816 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3817 if (ret != -EAGAIN) 3818 break; 3819 3820 nr = trans->blocks_used; 3821 btrfs_end_transaction(trans, root); 3822 trans = NULL; 3823 btrfs_btree_balance_dirty(root, nr); 3824 } 3825 3826 btrfs_free_block_rsv(root, rsv); 3827 3828 if (ret == 0) { 3829 trans->block_rsv = root->orphan_block_rsv; 3830 ret = btrfs_orphan_del(trans, inode); 3831 BUG_ON(ret); 3832 } 3833 3834 trans->block_rsv = &root->fs_info->trans_block_rsv; 3835 if (!(root == root->fs_info->tree_root || 3836 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3837 btrfs_return_ino(root, btrfs_ino(inode)); 3838 3839 nr = trans->blocks_used; 3840 btrfs_end_transaction(trans, root); 3841 btrfs_btree_balance_dirty(root, nr); 3842 no_delete: 3843 clear_inode(inode); 3844 return; 3845 } 3846 3847 /* 3848 * this returns the key found in the dir entry in the location pointer. 3849 * If no dir entries were found, location->objectid is 0. 3850 */ 3851 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 3852 struct btrfs_key *location) 3853 { 3854 const char *name = dentry->d_name.name; 3855 int namelen = dentry->d_name.len; 3856 struct btrfs_dir_item *di; 3857 struct btrfs_path *path; 3858 struct btrfs_root *root = BTRFS_I(dir)->root; 3859 int ret = 0; 3860 3861 path = btrfs_alloc_path(); 3862 if (!path) 3863 return -ENOMEM; 3864 3865 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, 3866 namelen, 0); 3867 if (IS_ERR(di)) 3868 ret = PTR_ERR(di); 3869 3870 if (IS_ERR_OR_NULL(di)) 3871 goto out_err; 3872 3873 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 3874 out: 3875 btrfs_free_path(path); 3876 return ret; 3877 out_err: 3878 location->objectid = 0; 3879 goto out; 3880 } 3881 3882 /* 3883 * when we hit a tree root in a directory, the btrfs part of the inode 3884 * needs to be changed to reflect the root directory of the tree root. This 3885 * is kind of like crossing a mount point. 3886 */ 3887 static int fixup_tree_root_location(struct btrfs_root *root, 3888 struct inode *dir, 3889 struct dentry *dentry, 3890 struct btrfs_key *location, 3891 struct btrfs_root **sub_root) 3892 { 3893 struct btrfs_path *path; 3894 struct btrfs_root *new_root; 3895 struct btrfs_root_ref *ref; 3896 struct extent_buffer *leaf; 3897 int ret; 3898 int err = 0; 3899 3900 path = btrfs_alloc_path(); 3901 if (!path) { 3902 err = -ENOMEM; 3903 goto out; 3904 } 3905 3906 err = -ENOENT; 3907 ret = btrfs_find_root_ref(root->fs_info->tree_root, path, 3908 BTRFS_I(dir)->root->root_key.objectid, 3909 location->objectid); 3910 if (ret) { 3911 if (ret < 0) 3912 err = ret; 3913 goto out; 3914 } 3915 3916 leaf = path->nodes[0]; 3917 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 3918 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || 3919 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 3920 goto out; 3921 3922 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 3923 (unsigned long)(ref + 1), 3924 dentry->d_name.len); 3925 if (ret) 3926 goto out; 3927 3928 btrfs_release_path(path); 3929 3930 new_root = btrfs_read_fs_root_no_name(root->fs_info, location); 3931 if (IS_ERR(new_root)) { 3932 err = PTR_ERR(new_root); 3933 goto out; 3934 } 3935 3936 if (btrfs_root_refs(&new_root->root_item) == 0) { 3937 err = -ENOENT; 3938 goto out; 3939 } 3940 3941 *sub_root = new_root; 3942 location->objectid = btrfs_root_dirid(&new_root->root_item); 3943 location->type = BTRFS_INODE_ITEM_KEY; 3944 location->offset = 0; 3945 err = 0; 3946 out: 3947 btrfs_free_path(path); 3948 return err; 3949 } 3950 3951 static void inode_tree_add(struct inode *inode) 3952 { 3953 struct btrfs_root *root = BTRFS_I(inode)->root; 3954 struct btrfs_inode *entry; 3955 struct rb_node **p; 3956 struct rb_node *parent; 3957 u64 ino = btrfs_ino(inode); 3958 again: 3959 p = &root->inode_tree.rb_node; 3960 parent = NULL; 3961 3962 if (inode_unhashed(inode)) 3963 return; 3964 3965 spin_lock(&root->inode_lock); 3966 while (*p) { 3967 parent = *p; 3968 entry = rb_entry(parent, struct btrfs_inode, rb_node); 3969 3970 if (ino < btrfs_ino(&entry->vfs_inode)) 3971 p = &parent->rb_left; 3972 else if (ino > btrfs_ino(&entry->vfs_inode)) 3973 p = &parent->rb_right; 3974 else { 3975 WARN_ON(!(entry->vfs_inode.i_state & 3976 (I_WILL_FREE | I_FREEING))); 3977 rb_erase(parent, &root->inode_tree); 3978 RB_CLEAR_NODE(parent); 3979 spin_unlock(&root->inode_lock); 3980 goto again; 3981 } 3982 } 3983 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); 3984 rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3985 spin_unlock(&root->inode_lock); 3986 } 3987 3988 static void inode_tree_del(struct inode *inode) 3989 { 3990 struct btrfs_root *root = BTRFS_I(inode)->root; 3991 int empty = 0; 3992 3993 spin_lock(&root->inode_lock); 3994 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 3995 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3996 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 3997 empty = RB_EMPTY_ROOT(&root->inode_tree); 3998 } 3999 spin_unlock(&root->inode_lock); 4000 4001 /* 4002 * Free space cache has inodes in the tree root, but the tree root has a 4003 * root_refs of 0, so this could end up dropping the tree root as a 4004 * snapshot, so we need the extra !root->fs_info->tree_root check to 4005 * make sure we don't drop it. 4006 */ 4007 if (empty && btrfs_root_refs(&root->root_item) == 0 && 4008 root != root->fs_info->tree_root) { 4009 synchronize_srcu(&root->fs_info->subvol_srcu); 4010 spin_lock(&root->inode_lock); 4011 empty = RB_EMPTY_ROOT(&root->inode_tree); 4012 spin_unlock(&root->inode_lock); 4013 if (empty) 4014 btrfs_add_dead_root(root); 4015 } 4016 } 4017 4018 void btrfs_invalidate_inodes(struct btrfs_root *root) 4019 { 4020 struct rb_node *node; 4021 struct rb_node *prev; 4022 struct btrfs_inode *entry; 4023 struct inode *inode; 4024 u64 objectid = 0; 4025 4026 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4027 4028 spin_lock(&root->inode_lock); 4029 again: 4030 node = root->inode_tree.rb_node; 4031 prev = NULL; 4032 while (node) { 4033 prev = node; 4034 entry = rb_entry(node, struct btrfs_inode, rb_node); 4035 4036 if (objectid < btrfs_ino(&entry->vfs_inode)) 4037 node = node->rb_left; 4038 else if (objectid > btrfs_ino(&entry->vfs_inode)) 4039 node = node->rb_right; 4040 else 4041 break; 4042 } 4043 if (!node) { 4044 while (prev) { 4045 entry = rb_entry(prev, struct btrfs_inode, rb_node); 4046 if (objectid <= btrfs_ino(&entry->vfs_inode)) { 4047 node = prev; 4048 break; 4049 } 4050 prev = rb_next(prev); 4051 } 4052 } 4053 while (node) { 4054 entry = rb_entry(node, struct btrfs_inode, rb_node); 4055 objectid = btrfs_ino(&entry->vfs_inode) + 1; 4056 inode = igrab(&entry->vfs_inode); 4057 if (inode) { 4058 spin_unlock(&root->inode_lock); 4059 if (atomic_read(&inode->i_count) > 1) 4060 d_prune_aliases(inode); 4061 /* 4062 * btrfs_drop_inode will have it removed from 4063 * the inode cache when its usage count 4064 * hits zero. 4065 */ 4066 iput(inode); 4067 cond_resched(); 4068 spin_lock(&root->inode_lock); 4069 goto again; 4070 } 4071 4072 if (cond_resched_lock(&root->inode_lock)) 4073 goto again; 4074 4075 node = rb_next(node); 4076 } 4077 spin_unlock(&root->inode_lock); 4078 } 4079 4080 static int btrfs_init_locked_inode(struct inode *inode, void *p) 4081 { 4082 struct btrfs_iget_args *args = p; 4083 inode->i_ino = args->ino; 4084 BTRFS_I(inode)->root = args->root; 4085 btrfs_set_inode_space_info(args->root, inode); 4086 return 0; 4087 } 4088 4089 static int btrfs_find_actor(struct inode *inode, void *opaque) 4090 { 4091 struct btrfs_iget_args *args = opaque; 4092 return args->ino == btrfs_ino(inode) && 4093 args->root == BTRFS_I(inode)->root; 4094 } 4095 4096 static struct inode *btrfs_iget_locked(struct super_block *s, 4097 u64 objectid, 4098 struct btrfs_root *root) 4099 { 4100 struct inode *inode; 4101 struct btrfs_iget_args args; 4102 args.ino = objectid; 4103 args.root = root; 4104 4105 inode = iget5_locked(s, objectid, btrfs_find_actor, 4106 btrfs_init_locked_inode, 4107 (void *)&args); 4108 return inode; 4109 } 4110 4111 /* Get an inode object given its location and corresponding root. 4112 * Returns in *is_new if the inode was read from disk 4113 */ 4114 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 4115 struct btrfs_root *root, int *new) 4116 { 4117 struct inode *inode; 4118 4119 inode = btrfs_iget_locked(s, location->objectid, root); 4120 if (!inode) 4121 return ERR_PTR(-ENOMEM); 4122 4123 if (inode->i_state & I_NEW) { 4124 BTRFS_I(inode)->root = root; 4125 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 4126 btrfs_read_locked_inode(inode); 4127 if (!is_bad_inode(inode)) { 4128 inode_tree_add(inode); 4129 unlock_new_inode(inode); 4130 if (new) 4131 *new = 1; 4132 } else { 4133 unlock_new_inode(inode); 4134 iput(inode); 4135 inode = ERR_PTR(-ESTALE); 4136 } 4137 } 4138 4139 return inode; 4140 } 4141 4142 static struct inode *new_simple_dir(struct super_block *s, 4143 struct btrfs_key *key, 4144 struct btrfs_root *root) 4145 { 4146 struct inode *inode = new_inode(s); 4147 4148 if (!inode) 4149 return ERR_PTR(-ENOMEM); 4150 4151 BTRFS_I(inode)->root = root; 4152 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4153 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); 4154 4155 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 4156 inode->i_op = &btrfs_dir_ro_inode_operations; 4157 inode->i_fop = &simple_dir_operations; 4158 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 4159 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4160 4161 return inode; 4162 } 4163 4164 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 4165 { 4166 struct inode *inode; 4167 struct btrfs_root *root = BTRFS_I(dir)->root; 4168 struct btrfs_root *sub_root = root; 4169 struct btrfs_key location; 4170 int index; 4171 int ret = 0; 4172 4173 if (dentry->d_name.len > BTRFS_NAME_LEN) 4174 return ERR_PTR(-ENAMETOOLONG); 4175 4176 if (unlikely(d_need_lookup(dentry))) { 4177 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); 4178 kfree(dentry->d_fsdata); 4179 dentry->d_fsdata = NULL; 4180 /* This thing is hashed, drop it for now */ 4181 d_drop(dentry); 4182 } else { 4183 ret = btrfs_inode_by_name(dir, dentry, &location); 4184 } 4185 4186 if (ret < 0) 4187 return ERR_PTR(ret); 4188 4189 if (location.objectid == 0) 4190 return NULL; 4191 4192 if (location.type == BTRFS_INODE_ITEM_KEY) { 4193 inode = btrfs_iget(dir->i_sb, &location, root, NULL); 4194 return inode; 4195 } 4196 4197 BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); 4198 4199 index = srcu_read_lock(&root->fs_info->subvol_srcu); 4200 ret = fixup_tree_root_location(root, dir, dentry, 4201 &location, &sub_root); 4202 if (ret < 0) { 4203 if (ret != -ENOENT) 4204 inode = ERR_PTR(ret); 4205 else 4206 inode = new_simple_dir(dir->i_sb, &location, sub_root); 4207 } else { 4208 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); 4209 } 4210 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4211 4212 if (!IS_ERR(inode) && root != sub_root) { 4213 down_read(&root->fs_info->cleanup_work_sem); 4214 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4215 ret = btrfs_orphan_cleanup(sub_root); 4216 up_read(&root->fs_info->cleanup_work_sem); 4217 if (ret) 4218 inode = ERR_PTR(ret); 4219 } 4220 4221 return inode; 4222 } 4223 4224 static int btrfs_dentry_delete(const struct dentry *dentry) 4225 { 4226 struct btrfs_root *root; 4227 struct inode *inode = dentry->d_inode; 4228 4229 if (!inode && !IS_ROOT(dentry)) 4230 inode = dentry->d_parent->d_inode; 4231 4232 if (inode) { 4233 root = BTRFS_I(inode)->root; 4234 if (btrfs_root_refs(&root->root_item) == 0) 4235 return 1; 4236 4237 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 4238 return 1; 4239 } 4240 return 0; 4241 } 4242 4243 static void btrfs_dentry_release(struct dentry *dentry) 4244 { 4245 if (dentry->d_fsdata) 4246 kfree(dentry->d_fsdata); 4247 } 4248 4249 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 4250 struct nameidata *nd) 4251 { 4252 struct dentry *ret; 4253 4254 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); 4255 if (unlikely(d_need_lookup(dentry))) { 4256 spin_lock(&dentry->d_lock); 4257 dentry->d_flags &= ~DCACHE_NEED_LOOKUP; 4258 spin_unlock(&dentry->d_lock); 4259 } 4260 return ret; 4261 } 4262 4263 unsigned char btrfs_filetype_table[] = { 4264 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 4265 }; 4266 4267 static int btrfs_real_readdir(struct file *filp, void *dirent, 4268 filldir_t filldir) 4269 { 4270 struct inode *inode = filp->f_dentry->d_inode; 4271 struct btrfs_root *root = BTRFS_I(inode)->root; 4272 struct btrfs_item *item; 4273 struct btrfs_dir_item *di; 4274 struct btrfs_key key; 4275 struct btrfs_key found_key; 4276 struct btrfs_path *path; 4277 struct list_head ins_list; 4278 struct list_head del_list; 4279 int ret; 4280 struct extent_buffer *leaf; 4281 int slot; 4282 unsigned char d_type; 4283 int over = 0; 4284 u32 di_cur; 4285 u32 di_total; 4286 u32 di_len; 4287 int key_type = BTRFS_DIR_INDEX_KEY; 4288 char tmp_name[32]; 4289 char *name_ptr; 4290 int name_len; 4291 int is_curr = 0; /* filp->f_pos points to the current index? */ 4292 4293 /* FIXME, use a real flag for deciding about the key type */ 4294 if (root->fs_info->tree_root == root) 4295 key_type = BTRFS_DIR_ITEM_KEY; 4296 4297 /* special case for "." */ 4298 if (filp->f_pos == 0) { 4299 over = filldir(dirent, ".", 1, 4300 filp->f_pos, btrfs_ino(inode), DT_DIR); 4301 if (over) 4302 return 0; 4303 filp->f_pos = 1; 4304 } 4305 /* special case for .., just use the back ref */ 4306 if (filp->f_pos == 1) { 4307 u64 pino = parent_ino(filp->f_path.dentry); 4308 over = filldir(dirent, "..", 2, 4309 filp->f_pos, pino, DT_DIR); 4310 if (over) 4311 return 0; 4312 filp->f_pos = 2; 4313 } 4314 path = btrfs_alloc_path(); 4315 if (!path) 4316 return -ENOMEM; 4317 4318 path->reada = 1; 4319 4320 if (key_type == BTRFS_DIR_INDEX_KEY) { 4321 INIT_LIST_HEAD(&ins_list); 4322 INIT_LIST_HEAD(&del_list); 4323 btrfs_get_delayed_items(inode, &ins_list, &del_list); 4324 } 4325 4326 btrfs_set_key_type(&key, key_type); 4327 key.offset = filp->f_pos; 4328 key.objectid = btrfs_ino(inode); 4329 4330 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4331 if (ret < 0) 4332 goto err; 4333 4334 while (1) { 4335 leaf = path->nodes[0]; 4336 slot = path->slots[0]; 4337 if (slot >= btrfs_header_nritems(leaf)) { 4338 ret = btrfs_next_leaf(root, path); 4339 if (ret < 0) 4340 goto err; 4341 else if (ret > 0) 4342 break; 4343 continue; 4344 } 4345 4346 item = btrfs_item_nr(leaf, slot); 4347 btrfs_item_key_to_cpu(leaf, &found_key, slot); 4348 4349 if (found_key.objectid != key.objectid) 4350 break; 4351 if (btrfs_key_type(&found_key) != key_type) 4352 break; 4353 if (found_key.offset < filp->f_pos) 4354 goto next; 4355 if (key_type == BTRFS_DIR_INDEX_KEY && 4356 btrfs_should_delete_dir_index(&del_list, 4357 found_key.offset)) 4358 goto next; 4359 4360 filp->f_pos = found_key.offset; 4361 is_curr = 1; 4362 4363 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 4364 di_cur = 0; 4365 di_total = btrfs_item_size(leaf, item); 4366 4367 while (di_cur < di_total) { 4368 struct btrfs_key location; 4369 4370 if (verify_dir_item(root, leaf, di)) 4371 break; 4372 4373 name_len = btrfs_dir_name_len(leaf, di); 4374 if (name_len <= sizeof(tmp_name)) { 4375 name_ptr = tmp_name; 4376 } else { 4377 name_ptr = kmalloc(name_len, GFP_NOFS); 4378 if (!name_ptr) { 4379 ret = -ENOMEM; 4380 goto err; 4381 } 4382 } 4383 read_extent_buffer(leaf, name_ptr, 4384 (unsigned long)(di + 1), name_len); 4385 4386 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; 4387 btrfs_dir_item_key_to_cpu(leaf, di, &location); 4388 4389 4390 /* is this a reference to our own snapshot? If so 4391 * skip it. 4392 * 4393 * In contrast to old kernels, we insert the snapshot's 4394 * dir item and dir index after it has been created, so 4395 * we won't find a reference to our own snapshot. We 4396 * still keep the following code for backward 4397 * compatibility. 4398 */ 4399 if (location.type == BTRFS_ROOT_ITEM_KEY && 4400 location.objectid == root->root_key.objectid) { 4401 over = 0; 4402 goto skip; 4403 } 4404 over = filldir(dirent, name_ptr, name_len, 4405 found_key.offset, location.objectid, 4406 d_type); 4407 4408 skip: 4409 if (name_ptr != tmp_name) 4410 kfree(name_ptr); 4411 4412 if (over) 4413 goto nopos; 4414 di_len = btrfs_dir_name_len(leaf, di) + 4415 btrfs_dir_data_len(leaf, di) + sizeof(*di); 4416 di_cur += di_len; 4417 di = (struct btrfs_dir_item *)((char *)di + di_len); 4418 } 4419 next: 4420 path->slots[0]++; 4421 } 4422 4423 if (key_type == BTRFS_DIR_INDEX_KEY) { 4424 if (is_curr) 4425 filp->f_pos++; 4426 ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir, 4427 &ins_list); 4428 if (ret) 4429 goto nopos; 4430 } 4431 4432 /* Reached end of directory/root. Bump pos past the last item. */ 4433 if (key_type == BTRFS_DIR_INDEX_KEY) 4434 /* 4435 * 32-bit glibc will use getdents64, but then strtol - 4436 * so the last number we can serve is this. 4437 */ 4438 filp->f_pos = 0x7fffffff; 4439 else 4440 filp->f_pos++; 4441 nopos: 4442 ret = 0; 4443 err: 4444 if (key_type == BTRFS_DIR_INDEX_KEY) 4445 btrfs_put_delayed_items(&ins_list, &del_list); 4446 btrfs_free_path(path); 4447 return ret; 4448 } 4449 4450 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) 4451 { 4452 struct btrfs_root *root = BTRFS_I(inode)->root; 4453 struct btrfs_trans_handle *trans; 4454 int ret = 0; 4455 bool nolock = false; 4456 4457 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 4458 return 0; 4459 4460 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) 4461 nolock = true; 4462 4463 if (wbc->sync_mode == WB_SYNC_ALL) { 4464 if (nolock) 4465 trans = btrfs_join_transaction_nolock(root); 4466 else 4467 trans = btrfs_join_transaction(root); 4468 if (IS_ERR(trans)) 4469 return PTR_ERR(trans); 4470 if (nolock) 4471 ret = btrfs_end_transaction_nolock(trans, root); 4472 else 4473 ret = btrfs_commit_transaction(trans, root); 4474 } 4475 return ret; 4476 } 4477 4478 /* 4479 * This is somewhat expensive, updating the tree every time the 4480 * inode changes. But, it is most likely to find the inode in cache. 4481 * FIXME, needs more benchmarking...there are no reasons other than performance 4482 * to keep or drop this code. 4483 */ 4484 int btrfs_dirty_inode(struct inode *inode) 4485 { 4486 struct btrfs_root *root = BTRFS_I(inode)->root; 4487 struct btrfs_trans_handle *trans; 4488 int ret; 4489 4490 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 4491 return 0; 4492 4493 trans = btrfs_join_transaction(root); 4494 if (IS_ERR(trans)) 4495 return PTR_ERR(trans); 4496 4497 ret = btrfs_update_inode(trans, root, inode); 4498 if (ret && ret == -ENOSPC) { 4499 /* whoops, lets try again with the full transaction */ 4500 btrfs_end_transaction(trans, root); 4501 trans = btrfs_start_transaction(root, 1); 4502 if (IS_ERR(trans)) 4503 return PTR_ERR(trans); 4504 4505 ret = btrfs_update_inode(trans, root, inode); 4506 } 4507 btrfs_end_transaction(trans, root); 4508 if (BTRFS_I(inode)->delayed_node) 4509 btrfs_balance_delayed_items(root); 4510 4511 return ret; 4512 } 4513 4514 /* 4515 * This is a copy of file_update_time. We need this so we can return error on 4516 * ENOSPC for updating the inode in the case of file write and mmap writes. 4517 */ 4518 static int btrfs_update_time(struct inode *inode, struct timespec *now, 4519 int flags) 4520 { 4521 if (flags & S_VERSION) 4522 inode_inc_iversion(inode); 4523 if (flags & S_CTIME) 4524 inode->i_ctime = *now; 4525 if (flags & S_MTIME) 4526 inode->i_mtime = *now; 4527 if (flags & S_ATIME) 4528 inode->i_atime = *now; 4529 return btrfs_dirty_inode(inode); 4530 } 4531 4532 /* 4533 * find the highest existing sequence number in a directory 4534 * and then set the in-memory index_cnt variable to reflect 4535 * free sequence numbers 4536 */ 4537 static int btrfs_set_inode_index_count(struct inode *inode) 4538 { 4539 struct btrfs_root *root = BTRFS_I(inode)->root; 4540 struct btrfs_key key, found_key; 4541 struct btrfs_path *path; 4542 struct extent_buffer *leaf; 4543 int ret; 4544 4545 key.objectid = btrfs_ino(inode); 4546 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 4547 key.offset = (u64)-1; 4548 4549 path = btrfs_alloc_path(); 4550 if (!path) 4551 return -ENOMEM; 4552 4553 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4554 if (ret < 0) 4555 goto out; 4556 /* FIXME: we should be able to handle this */ 4557 if (ret == 0) 4558 goto out; 4559 ret = 0; 4560 4561 /* 4562 * MAGIC NUMBER EXPLANATION: 4563 * since we search a directory based on f_pos we have to start at 2 4564 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody 4565 * else has to start at 2 4566 */ 4567 if (path->slots[0] == 0) { 4568 BTRFS_I(inode)->index_cnt = 2; 4569 goto out; 4570 } 4571 4572 path->slots[0]--; 4573 4574 leaf = path->nodes[0]; 4575 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4576 4577 if (found_key.objectid != btrfs_ino(inode) || 4578 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { 4579 BTRFS_I(inode)->index_cnt = 2; 4580 goto out; 4581 } 4582 4583 BTRFS_I(inode)->index_cnt = found_key.offset + 1; 4584 out: 4585 btrfs_free_path(path); 4586 return ret; 4587 } 4588 4589 /* 4590 * helper to find a free sequence number in a given directory. This current 4591 * code is very simple, later versions will do smarter things in the btree 4592 */ 4593 int btrfs_set_inode_index(struct inode *dir, u64 *index) 4594 { 4595 int ret = 0; 4596 4597 if (BTRFS_I(dir)->index_cnt == (u64)-1) { 4598 ret = btrfs_inode_delayed_dir_index_count(dir); 4599 if (ret) { 4600 ret = btrfs_set_inode_index_count(dir); 4601 if (ret) 4602 return ret; 4603 } 4604 } 4605 4606 *index = BTRFS_I(dir)->index_cnt; 4607 BTRFS_I(dir)->index_cnt++; 4608 4609 return ret; 4610 } 4611 4612 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 4613 struct btrfs_root *root, 4614 struct inode *dir, 4615 const char *name, int name_len, 4616 u64 ref_objectid, u64 objectid, 4617 umode_t mode, u64 *index) 4618 { 4619 struct inode *inode; 4620 struct btrfs_inode_item *inode_item; 4621 struct btrfs_key *location; 4622 struct btrfs_path *path; 4623 struct btrfs_inode_ref *ref; 4624 struct btrfs_key key[2]; 4625 u32 sizes[2]; 4626 unsigned long ptr; 4627 int ret; 4628 int owner; 4629 4630 path = btrfs_alloc_path(); 4631 if (!path) 4632 return ERR_PTR(-ENOMEM); 4633 4634 inode = new_inode(root->fs_info->sb); 4635 if (!inode) { 4636 btrfs_free_path(path); 4637 return ERR_PTR(-ENOMEM); 4638 } 4639 4640 /* 4641 * we have to initialize this early, so we can reclaim the inode 4642 * number if we fail afterwards in this function. 4643 */ 4644 inode->i_ino = objectid; 4645 4646 if (dir) { 4647 trace_btrfs_inode_request(dir); 4648 4649 ret = btrfs_set_inode_index(dir, index); 4650 if (ret) { 4651 btrfs_free_path(path); 4652 iput(inode); 4653 return ERR_PTR(ret); 4654 } 4655 } 4656 /* 4657 * index_cnt is ignored for everything but a dir, 4658 * btrfs_get_inode_index_count has an explanation for the magic 4659 * number 4660 */ 4661 BTRFS_I(inode)->index_cnt = 2; 4662 BTRFS_I(inode)->root = root; 4663 BTRFS_I(inode)->generation = trans->transid; 4664 inode->i_generation = BTRFS_I(inode)->generation; 4665 btrfs_set_inode_space_info(root, inode); 4666 4667 if (S_ISDIR(mode)) 4668 owner = 0; 4669 else 4670 owner = 1; 4671 4672 key[0].objectid = objectid; 4673 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 4674 key[0].offset = 0; 4675 4676 key[1].objectid = objectid; 4677 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 4678 key[1].offset = ref_objectid; 4679 4680 sizes[0] = sizeof(struct btrfs_inode_item); 4681 sizes[1] = name_len + sizeof(*ref); 4682 4683 path->leave_spinning = 1; 4684 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 4685 if (ret != 0) 4686 goto fail; 4687 4688 inode_init_owner(inode, dir, mode); 4689 inode_set_bytes(inode, 0); 4690 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4691 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4692 struct btrfs_inode_item); 4693 fill_inode_item(trans, path->nodes[0], inode_item, inode); 4694 4695 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 4696 struct btrfs_inode_ref); 4697 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 4698 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 4699 ptr = (unsigned long)(ref + 1); 4700 write_extent_buffer(path->nodes[0], name, ptr, name_len); 4701 4702 btrfs_mark_buffer_dirty(path->nodes[0]); 4703 btrfs_free_path(path); 4704 4705 location = &BTRFS_I(inode)->location; 4706 location->objectid = objectid; 4707 location->offset = 0; 4708 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 4709 4710 btrfs_inherit_iflags(inode, dir); 4711 4712 if (S_ISREG(mode)) { 4713 if (btrfs_test_opt(root, NODATASUM)) 4714 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4715 if (btrfs_test_opt(root, NODATACOW) || 4716 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) 4717 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4718 } 4719 4720 insert_inode_hash(inode); 4721 inode_tree_add(inode); 4722 4723 trace_btrfs_inode_new(inode); 4724 btrfs_set_inode_last_trans(trans, inode); 4725 4726 return inode; 4727 fail: 4728 if (dir) 4729 BTRFS_I(dir)->index_cnt--; 4730 btrfs_free_path(path); 4731 iput(inode); 4732 return ERR_PTR(ret); 4733 } 4734 4735 static inline u8 btrfs_inode_type(struct inode *inode) 4736 { 4737 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; 4738 } 4739 4740 /* 4741 * utility function to add 'inode' into 'parent_inode' with 4742 * a give name and a given sequence number. 4743 * if 'add_backref' is true, also insert a backref from the 4744 * inode to the parent directory. 4745 */ 4746 int btrfs_add_link(struct btrfs_trans_handle *trans, 4747 struct inode *parent_inode, struct inode *inode, 4748 const char *name, int name_len, int add_backref, u64 index) 4749 { 4750 int ret = 0; 4751 struct btrfs_key key; 4752 struct btrfs_root *root = BTRFS_I(parent_inode)->root; 4753 u64 ino = btrfs_ino(inode); 4754 u64 parent_ino = btrfs_ino(parent_inode); 4755 4756 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 4757 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); 4758 } else { 4759 key.objectid = ino; 4760 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 4761 key.offset = 0; 4762 } 4763 4764 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 4765 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 4766 key.objectid, root->root_key.objectid, 4767 parent_ino, index, name, name_len); 4768 } else if (add_backref) { 4769 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, 4770 parent_ino, index); 4771 } 4772 4773 /* Nothing to clean up yet */ 4774 if (ret) 4775 return ret; 4776 4777 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4778 parent_inode, &key, 4779 btrfs_inode_type(inode), index); 4780 if (ret == -EEXIST) 4781 goto fail_dir_item; 4782 else if (ret) { 4783 btrfs_abort_transaction(trans, root, ret); 4784 return ret; 4785 } 4786 4787 btrfs_i_size_write(parent_inode, parent_inode->i_size + 4788 name_len * 2); 4789 inode_inc_iversion(parent_inode); 4790 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 4791 ret = btrfs_update_inode(trans, root, parent_inode); 4792 if (ret) 4793 btrfs_abort_transaction(trans, root, ret); 4794 return ret; 4795 4796 fail_dir_item: 4797 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 4798 u64 local_index; 4799 int err; 4800 err = btrfs_del_root_ref(trans, root->fs_info->tree_root, 4801 key.objectid, root->root_key.objectid, 4802 parent_ino, &local_index, name, name_len); 4803 4804 } else if (add_backref) { 4805 u64 local_index; 4806 int err; 4807 4808 err = btrfs_del_inode_ref(trans, root, name, name_len, 4809 ino, parent_ino, &local_index); 4810 } 4811 return ret; 4812 } 4813 4814 static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 4815 struct inode *dir, struct dentry *dentry, 4816 struct inode *inode, int backref, u64 index) 4817 { 4818 int err = btrfs_add_link(trans, dir, inode, 4819 dentry->d_name.name, dentry->d_name.len, 4820 backref, index); 4821 if (err > 0) 4822 err = -EEXIST; 4823 return err; 4824 } 4825 4826 static int btrfs_mknod(struct inode *dir, struct dentry *dentry, 4827 umode_t mode, dev_t rdev) 4828 { 4829 struct btrfs_trans_handle *trans; 4830 struct btrfs_root *root = BTRFS_I(dir)->root; 4831 struct inode *inode = NULL; 4832 int err; 4833 int drop_inode = 0; 4834 u64 objectid; 4835 unsigned long nr = 0; 4836 u64 index = 0; 4837 4838 if (!new_valid_dev(rdev)) 4839 return -EINVAL; 4840 4841 /* 4842 * 2 for inode item and ref 4843 * 2 for dir items 4844 * 1 for xattr if selinux is on 4845 */ 4846 trans = btrfs_start_transaction(root, 5); 4847 if (IS_ERR(trans)) 4848 return PTR_ERR(trans); 4849 4850 err = btrfs_find_free_ino(root, &objectid); 4851 if (err) 4852 goto out_unlock; 4853 4854 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4855 dentry->d_name.len, btrfs_ino(dir), objectid, 4856 mode, &index); 4857 if (IS_ERR(inode)) { 4858 err = PTR_ERR(inode); 4859 goto out_unlock; 4860 } 4861 4862 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 4863 if (err) { 4864 drop_inode = 1; 4865 goto out_unlock; 4866 } 4867 4868 /* 4869 * If the active LSM wants to access the inode during 4870 * d_instantiate it needs these. Smack checks to see 4871 * if the filesystem supports xattrs by looking at the 4872 * ops vector. 4873 */ 4874 4875 inode->i_op = &btrfs_special_inode_operations; 4876 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4877 if (err) 4878 drop_inode = 1; 4879 else { 4880 init_special_inode(inode, inode->i_mode, rdev); 4881 btrfs_update_inode(trans, root, inode); 4882 d_instantiate(dentry, inode); 4883 } 4884 out_unlock: 4885 nr = trans->blocks_used; 4886 btrfs_end_transaction(trans, root); 4887 btrfs_btree_balance_dirty(root, nr); 4888 if (drop_inode) { 4889 inode_dec_link_count(inode); 4890 iput(inode); 4891 } 4892 return err; 4893 } 4894 4895 static int btrfs_create(struct inode *dir, struct dentry *dentry, 4896 umode_t mode, struct nameidata *nd) 4897 { 4898 struct btrfs_trans_handle *trans; 4899 struct btrfs_root *root = BTRFS_I(dir)->root; 4900 struct inode *inode = NULL; 4901 int drop_inode = 0; 4902 int err; 4903 unsigned long nr = 0; 4904 u64 objectid; 4905 u64 index = 0; 4906 4907 /* 4908 * 2 for inode item and ref 4909 * 2 for dir items 4910 * 1 for xattr if selinux is on 4911 */ 4912 trans = btrfs_start_transaction(root, 5); 4913 if (IS_ERR(trans)) 4914 return PTR_ERR(trans); 4915 4916 err = btrfs_find_free_ino(root, &objectid); 4917 if (err) 4918 goto out_unlock; 4919 4920 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4921 dentry->d_name.len, btrfs_ino(dir), objectid, 4922 mode, &index); 4923 if (IS_ERR(inode)) { 4924 err = PTR_ERR(inode); 4925 goto out_unlock; 4926 } 4927 4928 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 4929 if (err) { 4930 drop_inode = 1; 4931 goto out_unlock; 4932 } 4933 4934 /* 4935 * If the active LSM wants to access the inode during 4936 * d_instantiate it needs these. Smack checks to see 4937 * if the filesystem supports xattrs by looking at the 4938 * ops vector. 4939 */ 4940 inode->i_fop = &btrfs_file_operations; 4941 inode->i_op = &btrfs_file_inode_operations; 4942 4943 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4944 if (err) 4945 drop_inode = 1; 4946 else { 4947 inode->i_mapping->a_ops = &btrfs_aops; 4948 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 4949 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4950 d_instantiate(dentry, inode); 4951 } 4952 out_unlock: 4953 nr = trans->blocks_used; 4954 btrfs_end_transaction(trans, root); 4955 if (drop_inode) { 4956 inode_dec_link_count(inode); 4957 iput(inode); 4958 } 4959 btrfs_btree_balance_dirty(root, nr); 4960 return err; 4961 } 4962 4963 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 4964 struct dentry *dentry) 4965 { 4966 struct btrfs_trans_handle *trans; 4967 struct btrfs_root *root = BTRFS_I(dir)->root; 4968 struct inode *inode = old_dentry->d_inode; 4969 u64 index; 4970 unsigned long nr = 0; 4971 int err; 4972 int drop_inode = 0; 4973 4974 /* do not allow sys_link's with other subvols of the same device */ 4975 if (root->objectid != BTRFS_I(inode)->root->objectid) 4976 return -EXDEV; 4977 4978 if (inode->i_nlink == ~0U) 4979 return -EMLINK; 4980 4981 err = btrfs_set_inode_index(dir, &index); 4982 if (err) 4983 goto fail; 4984 4985 /* 4986 * 2 items for inode and inode ref 4987 * 2 items for dir items 4988 * 1 item for parent inode 4989 */ 4990 trans = btrfs_start_transaction(root, 5); 4991 if (IS_ERR(trans)) { 4992 err = PTR_ERR(trans); 4993 goto fail; 4994 } 4995 4996 btrfs_inc_nlink(inode); 4997 inode_inc_iversion(inode); 4998 inode->i_ctime = CURRENT_TIME; 4999 ihold(inode); 5000 5001 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 5002 5003 if (err) { 5004 drop_inode = 1; 5005 } else { 5006 struct dentry *parent = dentry->d_parent; 5007 err = btrfs_update_inode(trans, root, inode); 5008 if (err) 5009 goto fail; 5010 d_instantiate(dentry, inode); 5011 btrfs_log_new_name(trans, inode, NULL, parent); 5012 } 5013 5014 nr = trans->blocks_used; 5015 btrfs_end_transaction(trans, root); 5016 fail: 5017 if (drop_inode) { 5018 inode_dec_link_count(inode); 5019 iput(inode); 5020 } 5021 btrfs_btree_balance_dirty(root, nr); 5022 return err; 5023 } 5024 5025 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 5026 { 5027 struct inode *inode = NULL; 5028 struct btrfs_trans_handle *trans; 5029 struct btrfs_root *root = BTRFS_I(dir)->root; 5030 int err = 0; 5031 int drop_on_err = 0; 5032 u64 objectid = 0; 5033 u64 index = 0; 5034 unsigned long nr = 1; 5035 5036 /* 5037 * 2 items for inode and ref 5038 * 2 items for dir items 5039 * 1 for xattr if selinux is on 5040 */ 5041 trans = btrfs_start_transaction(root, 5); 5042 if (IS_ERR(trans)) 5043 return PTR_ERR(trans); 5044 5045 err = btrfs_find_free_ino(root, &objectid); 5046 if (err) 5047 goto out_fail; 5048 5049 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 5050 dentry->d_name.len, btrfs_ino(dir), objectid, 5051 S_IFDIR | mode, &index); 5052 if (IS_ERR(inode)) { 5053 err = PTR_ERR(inode); 5054 goto out_fail; 5055 } 5056 5057 drop_on_err = 1; 5058 5059 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5060 if (err) 5061 goto out_fail; 5062 5063 inode->i_op = &btrfs_dir_inode_operations; 5064 inode->i_fop = &btrfs_dir_file_operations; 5065 5066 btrfs_i_size_write(inode, 0); 5067 err = btrfs_update_inode(trans, root, inode); 5068 if (err) 5069 goto out_fail; 5070 5071 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, 5072 dentry->d_name.len, 0, index); 5073 if (err) 5074 goto out_fail; 5075 5076 d_instantiate(dentry, inode); 5077 drop_on_err = 0; 5078 5079 out_fail: 5080 nr = trans->blocks_used; 5081 btrfs_end_transaction(trans, root); 5082 if (drop_on_err) 5083 iput(inode); 5084 btrfs_btree_balance_dirty(root, nr); 5085 return err; 5086 } 5087 5088 /* helper for btfs_get_extent. Given an existing extent in the tree, 5089 * and an extent that you want to insert, deal with overlap and insert 5090 * the new extent into the tree. 5091 */ 5092 static int merge_extent_mapping(struct extent_map_tree *em_tree, 5093 struct extent_map *existing, 5094 struct extent_map *em, 5095 u64 map_start, u64 map_len) 5096 { 5097 u64 start_diff; 5098 5099 BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); 5100 start_diff = map_start - em->start; 5101 em->start = map_start; 5102 em->len = map_len; 5103 if (em->block_start < EXTENT_MAP_LAST_BYTE && 5104 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 5105 em->block_start += start_diff; 5106 em->block_len -= start_diff; 5107 } 5108 return add_extent_mapping(em_tree, em); 5109 } 5110 5111 static noinline int uncompress_inline(struct btrfs_path *path, 5112 struct inode *inode, struct page *page, 5113 size_t pg_offset, u64 extent_offset, 5114 struct btrfs_file_extent_item *item) 5115 { 5116 int ret; 5117 struct extent_buffer *leaf = path->nodes[0]; 5118 char *tmp; 5119 size_t max_size; 5120 unsigned long inline_size; 5121 unsigned long ptr; 5122 int compress_type; 5123 5124 WARN_ON(pg_offset != 0); 5125 compress_type = btrfs_file_extent_compression(leaf, item); 5126 max_size = btrfs_file_extent_ram_bytes(leaf, item); 5127 inline_size = btrfs_file_extent_inline_item_len(leaf, 5128 btrfs_item_nr(leaf, path->slots[0])); 5129 tmp = kmalloc(inline_size, GFP_NOFS); 5130 if (!tmp) 5131 return -ENOMEM; 5132 ptr = btrfs_file_extent_inline_start(item); 5133 5134 read_extent_buffer(leaf, tmp, ptr, inline_size); 5135 5136 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 5137 ret = btrfs_decompress(compress_type, tmp, page, 5138 extent_offset, inline_size, max_size); 5139 if (ret) { 5140 char *kaddr = kmap_atomic(page); 5141 unsigned long copy_size = min_t(u64, 5142 PAGE_CACHE_SIZE - pg_offset, 5143 max_size - extent_offset); 5144 memset(kaddr + pg_offset, 0, copy_size); 5145 kunmap_atomic(kaddr); 5146 } 5147 kfree(tmp); 5148 return 0; 5149 } 5150 5151 /* 5152 * a bit scary, this does extent mapping from logical file offset to the disk. 5153 * the ugly parts come from merging extents from the disk with the in-ram 5154 * representation. This gets more complex because of the data=ordered code, 5155 * where the in-ram extents might be locked pending data=ordered completion. 5156 * 5157 * This also copies inline extents directly into the page. 5158 */ 5159 5160 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 5161 size_t pg_offset, u64 start, u64 len, 5162 int create) 5163 { 5164 int ret; 5165 int err = 0; 5166 u64 bytenr; 5167 u64 extent_start = 0; 5168 u64 extent_end = 0; 5169 u64 objectid = btrfs_ino(inode); 5170 u32 found_type; 5171 struct btrfs_path *path = NULL; 5172 struct btrfs_root *root = BTRFS_I(inode)->root; 5173 struct btrfs_file_extent_item *item; 5174 struct extent_buffer *leaf; 5175 struct btrfs_key found_key; 5176 struct extent_map *em = NULL; 5177 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5178 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5179 struct btrfs_trans_handle *trans = NULL; 5180 int compress_type; 5181 5182 again: 5183 read_lock(&em_tree->lock); 5184 em = lookup_extent_mapping(em_tree, start, len); 5185 if (em) 5186 em->bdev = root->fs_info->fs_devices->latest_bdev; 5187 read_unlock(&em_tree->lock); 5188 5189 if (em) { 5190 if (em->start > start || em->start + em->len <= start) 5191 free_extent_map(em); 5192 else if (em->block_start == EXTENT_MAP_INLINE && page) 5193 free_extent_map(em); 5194 else 5195 goto out; 5196 } 5197 em = alloc_extent_map(); 5198 if (!em) { 5199 err = -ENOMEM; 5200 goto out; 5201 } 5202 em->bdev = root->fs_info->fs_devices->latest_bdev; 5203 em->start = EXTENT_MAP_HOLE; 5204 em->orig_start = EXTENT_MAP_HOLE; 5205 em->len = (u64)-1; 5206 em->block_len = (u64)-1; 5207 5208 if (!path) { 5209 path = btrfs_alloc_path(); 5210 if (!path) { 5211 err = -ENOMEM; 5212 goto out; 5213 } 5214 /* 5215 * Chances are we'll be called again, so go ahead and do 5216 * readahead 5217 */ 5218 path->reada = 1; 5219 } 5220 5221 ret = btrfs_lookup_file_extent(trans, root, path, 5222 objectid, start, trans != NULL); 5223 if (ret < 0) { 5224 err = ret; 5225 goto out; 5226 } 5227 5228 if (ret != 0) { 5229 if (path->slots[0] == 0) 5230 goto not_found; 5231 path->slots[0]--; 5232 } 5233 5234 leaf = path->nodes[0]; 5235 item = btrfs_item_ptr(leaf, path->slots[0], 5236 struct btrfs_file_extent_item); 5237 /* are we inside the extent that was found? */ 5238 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5239 found_type = btrfs_key_type(&found_key); 5240 if (found_key.objectid != objectid || 5241 found_type != BTRFS_EXTENT_DATA_KEY) { 5242 goto not_found; 5243 } 5244 5245 found_type = btrfs_file_extent_type(leaf, item); 5246 extent_start = found_key.offset; 5247 compress_type = btrfs_file_extent_compression(leaf, item); 5248 if (found_type == BTRFS_FILE_EXTENT_REG || 5249 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5250 extent_end = extent_start + 5251 btrfs_file_extent_num_bytes(leaf, item); 5252 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 5253 size_t size; 5254 size = btrfs_file_extent_inline_len(leaf, item); 5255 extent_end = (extent_start + size + root->sectorsize - 1) & 5256 ~((u64)root->sectorsize - 1); 5257 } 5258 5259 if (start >= extent_end) { 5260 path->slots[0]++; 5261 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 5262 ret = btrfs_next_leaf(root, path); 5263 if (ret < 0) { 5264 err = ret; 5265 goto out; 5266 } 5267 if (ret > 0) 5268 goto not_found; 5269 leaf = path->nodes[0]; 5270 } 5271 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5272 if (found_key.objectid != objectid || 5273 found_key.type != BTRFS_EXTENT_DATA_KEY) 5274 goto not_found; 5275 if (start + len <= found_key.offset) 5276 goto not_found; 5277 em->start = start; 5278 em->len = found_key.offset - start; 5279 goto not_found_em; 5280 } 5281 5282 if (found_type == BTRFS_FILE_EXTENT_REG || 5283 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5284 em->start = extent_start; 5285 em->len = extent_end - extent_start; 5286 em->orig_start = extent_start - 5287 btrfs_file_extent_offset(leaf, item); 5288 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 5289 if (bytenr == 0) { 5290 em->block_start = EXTENT_MAP_HOLE; 5291 goto insert; 5292 } 5293 if (compress_type != BTRFS_COMPRESS_NONE) { 5294 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5295 em->compress_type = compress_type; 5296 em->block_start = bytenr; 5297 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5298 item); 5299 } else { 5300 bytenr += btrfs_file_extent_offset(leaf, item); 5301 em->block_start = bytenr; 5302 em->block_len = em->len; 5303 if (found_type == BTRFS_FILE_EXTENT_PREALLOC) 5304 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 5305 } 5306 goto insert; 5307 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 5308 unsigned long ptr; 5309 char *map; 5310 size_t size; 5311 size_t extent_offset; 5312 size_t copy_size; 5313 5314 em->block_start = EXTENT_MAP_INLINE; 5315 if (!page || create) { 5316 em->start = extent_start; 5317 em->len = extent_end - extent_start; 5318 goto out; 5319 } 5320 5321 size = btrfs_file_extent_inline_len(leaf, item); 5322 extent_offset = page_offset(page) + pg_offset - extent_start; 5323 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 5324 size - extent_offset); 5325 em->start = extent_start + extent_offset; 5326 em->len = (copy_size + root->sectorsize - 1) & 5327 ~((u64)root->sectorsize - 1); 5328 em->orig_start = EXTENT_MAP_INLINE; 5329 if (compress_type) { 5330 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5331 em->compress_type = compress_type; 5332 } 5333 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 5334 if (create == 0 && !PageUptodate(page)) { 5335 if (btrfs_file_extent_compression(leaf, item) != 5336 BTRFS_COMPRESS_NONE) { 5337 ret = uncompress_inline(path, inode, page, 5338 pg_offset, 5339 extent_offset, item); 5340 BUG_ON(ret); /* -ENOMEM */ 5341 } else { 5342 map = kmap(page); 5343 read_extent_buffer(leaf, map + pg_offset, ptr, 5344 copy_size); 5345 if (pg_offset + copy_size < PAGE_CACHE_SIZE) { 5346 memset(map + pg_offset + copy_size, 0, 5347 PAGE_CACHE_SIZE - pg_offset - 5348 copy_size); 5349 } 5350 kunmap(page); 5351 } 5352 flush_dcache_page(page); 5353 } else if (create && PageUptodate(page)) { 5354 BUG(); 5355 if (!trans) { 5356 kunmap(page); 5357 free_extent_map(em); 5358 em = NULL; 5359 5360 btrfs_release_path(path); 5361 trans = btrfs_join_transaction(root); 5362 5363 if (IS_ERR(trans)) 5364 return ERR_CAST(trans); 5365 goto again; 5366 } 5367 map = kmap(page); 5368 write_extent_buffer(leaf, map + pg_offset, ptr, 5369 copy_size); 5370 kunmap(page); 5371 btrfs_mark_buffer_dirty(leaf); 5372 } 5373 set_extent_uptodate(io_tree, em->start, 5374 extent_map_end(em) - 1, NULL, GFP_NOFS); 5375 goto insert; 5376 } else { 5377 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5378 WARN_ON(1); 5379 } 5380 not_found: 5381 em->start = start; 5382 em->len = len; 5383 not_found_em: 5384 em->block_start = EXTENT_MAP_HOLE; 5385 set_bit(EXTENT_FLAG_VACANCY, &em->flags); 5386 insert: 5387 btrfs_release_path(path); 5388 if (em->start > start || extent_map_end(em) <= start) { 5389 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " 5390 "[%llu %llu]\n", (unsigned long long)em->start, 5391 (unsigned long long)em->len, 5392 (unsigned long long)start, 5393 (unsigned long long)len); 5394 err = -EIO; 5395 goto out; 5396 } 5397 5398 err = 0; 5399 write_lock(&em_tree->lock); 5400 ret = add_extent_mapping(em_tree, em); 5401 /* it is possible that someone inserted the extent into the tree 5402 * while we had the lock dropped. It is also possible that 5403 * an overlapping map exists in the tree 5404 */ 5405 if (ret == -EEXIST) { 5406 struct extent_map *existing; 5407 5408 ret = 0; 5409 5410 existing = lookup_extent_mapping(em_tree, start, len); 5411 if (existing && (existing->start > start || 5412 existing->start + existing->len <= start)) { 5413 free_extent_map(existing); 5414 existing = NULL; 5415 } 5416 if (!existing) { 5417 existing = lookup_extent_mapping(em_tree, em->start, 5418 em->len); 5419 if (existing) { 5420 err = merge_extent_mapping(em_tree, existing, 5421 em, start, 5422 root->sectorsize); 5423 free_extent_map(existing); 5424 if (err) { 5425 free_extent_map(em); 5426 em = NULL; 5427 } 5428 } else { 5429 err = -EIO; 5430 free_extent_map(em); 5431 em = NULL; 5432 } 5433 } else { 5434 free_extent_map(em); 5435 em = existing; 5436 err = 0; 5437 } 5438 } 5439 write_unlock(&em_tree->lock); 5440 out: 5441 5442 trace_btrfs_get_extent(root, em); 5443 5444 if (path) 5445 btrfs_free_path(path); 5446 if (trans) { 5447 ret = btrfs_end_transaction(trans, root); 5448 if (!err) 5449 err = ret; 5450 } 5451 if (err) { 5452 free_extent_map(em); 5453 return ERR_PTR(err); 5454 } 5455 BUG_ON(!em); /* Error is always set */ 5456 return em; 5457 } 5458 5459 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 5460 size_t pg_offset, u64 start, u64 len, 5461 int create) 5462 { 5463 struct extent_map *em; 5464 struct extent_map *hole_em = NULL; 5465 u64 range_start = start; 5466 u64 end; 5467 u64 found; 5468 u64 found_end; 5469 int err = 0; 5470 5471 em = btrfs_get_extent(inode, page, pg_offset, start, len, create); 5472 if (IS_ERR(em)) 5473 return em; 5474 if (em) { 5475 /* 5476 * if our em maps to a hole, there might 5477 * actually be delalloc bytes behind it 5478 */ 5479 if (em->block_start != EXTENT_MAP_HOLE) 5480 return em; 5481 else 5482 hole_em = em; 5483 } 5484 5485 /* check to see if we've wrapped (len == -1 or similar) */ 5486 end = start + len; 5487 if (end < start) 5488 end = (u64)-1; 5489 else 5490 end -= 1; 5491 5492 em = NULL; 5493 5494 /* ok, we didn't find anything, lets look for delalloc */ 5495 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, 5496 end, len, EXTENT_DELALLOC, 1); 5497 found_end = range_start + found; 5498 if (found_end < range_start) 5499 found_end = (u64)-1; 5500 5501 /* 5502 * we didn't find anything useful, return 5503 * the original results from get_extent() 5504 */ 5505 if (range_start > end || found_end <= start) { 5506 em = hole_em; 5507 hole_em = NULL; 5508 goto out; 5509 } 5510 5511 /* adjust the range_start to make sure it doesn't 5512 * go backwards from the start they passed in 5513 */ 5514 range_start = max(start,range_start); 5515 found = found_end - range_start; 5516 5517 if (found > 0) { 5518 u64 hole_start = start; 5519 u64 hole_len = len; 5520 5521 em = alloc_extent_map(); 5522 if (!em) { 5523 err = -ENOMEM; 5524 goto out; 5525 } 5526 /* 5527 * when btrfs_get_extent can't find anything it 5528 * returns one huge hole 5529 * 5530 * make sure what it found really fits our range, and 5531 * adjust to make sure it is based on the start from 5532 * the caller 5533 */ 5534 if (hole_em) { 5535 u64 calc_end = extent_map_end(hole_em); 5536 5537 if (calc_end <= start || (hole_em->start > end)) { 5538 free_extent_map(hole_em); 5539 hole_em = NULL; 5540 } else { 5541 hole_start = max(hole_em->start, start); 5542 hole_len = calc_end - hole_start; 5543 } 5544 } 5545 em->bdev = NULL; 5546 if (hole_em && range_start > hole_start) { 5547 /* our hole starts before our delalloc, so we 5548 * have to return just the parts of the hole 5549 * that go until the delalloc starts 5550 */ 5551 em->len = min(hole_len, 5552 range_start - hole_start); 5553 em->start = hole_start; 5554 em->orig_start = hole_start; 5555 /* 5556 * don't adjust block start at all, 5557 * it is fixed at EXTENT_MAP_HOLE 5558 */ 5559 em->block_start = hole_em->block_start; 5560 em->block_len = hole_len; 5561 } else { 5562 em->start = range_start; 5563 em->len = found; 5564 em->orig_start = range_start; 5565 em->block_start = EXTENT_MAP_DELALLOC; 5566 em->block_len = found; 5567 } 5568 } else if (hole_em) { 5569 return hole_em; 5570 } 5571 out: 5572 5573 free_extent_map(hole_em); 5574 if (err) { 5575 free_extent_map(em); 5576 return ERR_PTR(err); 5577 } 5578 return em; 5579 } 5580 5581 static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5582 struct extent_map *em, 5583 u64 start, u64 len) 5584 { 5585 struct btrfs_root *root = BTRFS_I(inode)->root; 5586 struct btrfs_trans_handle *trans; 5587 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5588 struct btrfs_key ins; 5589 u64 alloc_hint; 5590 int ret; 5591 bool insert = false; 5592 5593 /* 5594 * Ok if the extent map we looked up is a hole and is for the exact 5595 * range we want, there is no reason to allocate a new one, however if 5596 * it is not right then we need to free this one and drop the cache for 5597 * our range. 5598 */ 5599 if (em->block_start != EXTENT_MAP_HOLE || em->start != start || 5600 em->len != len) { 5601 free_extent_map(em); 5602 em = NULL; 5603 insert = true; 5604 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5605 } 5606 5607 trans = btrfs_join_transaction(root); 5608 if (IS_ERR(trans)) 5609 return ERR_CAST(trans); 5610 5611 if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) 5612 btrfs_add_inode_defrag(trans, inode); 5613 5614 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5615 5616 alloc_hint = get_extent_allocation_hint(inode, start, len); 5617 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, 5618 alloc_hint, &ins, 1); 5619 if (ret) { 5620 em = ERR_PTR(ret); 5621 goto out; 5622 } 5623 5624 if (!em) { 5625 em = alloc_extent_map(); 5626 if (!em) { 5627 em = ERR_PTR(-ENOMEM); 5628 goto out; 5629 } 5630 } 5631 5632 em->start = start; 5633 em->orig_start = em->start; 5634 em->len = ins.offset; 5635 5636 em->block_start = ins.objectid; 5637 em->block_len = ins.offset; 5638 em->bdev = root->fs_info->fs_devices->latest_bdev; 5639 5640 /* 5641 * We need to do this because if we're using the original em we searched 5642 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. 5643 */ 5644 em->flags = 0; 5645 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5646 5647 while (insert) { 5648 write_lock(&em_tree->lock); 5649 ret = add_extent_mapping(em_tree, em); 5650 write_unlock(&em_tree->lock); 5651 if (ret != -EEXIST) 5652 break; 5653 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); 5654 } 5655 5656 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 5657 ins.offset, ins.offset, 0); 5658 if (ret) { 5659 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 5660 em = ERR_PTR(ret); 5661 } 5662 out: 5663 btrfs_end_transaction(trans, root); 5664 return em; 5665 } 5666 5667 /* 5668 * returns 1 when the nocow is safe, < 1 on error, 0 if the 5669 * block must be cow'd 5670 */ 5671 static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, 5672 struct inode *inode, u64 offset, u64 len) 5673 { 5674 struct btrfs_path *path; 5675 int ret; 5676 struct extent_buffer *leaf; 5677 struct btrfs_root *root = BTRFS_I(inode)->root; 5678 struct btrfs_file_extent_item *fi; 5679 struct btrfs_key key; 5680 u64 disk_bytenr; 5681 u64 backref_offset; 5682 u64 extent_end; 5683 u64 num_bytes; 5684 int slot; 5685 int found_type; 5686 5687 path = btrfs_alloc_path(); 5688 if (!path) 5689 return -ENOMEM; 5690 5691 ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), 5692 offset, 0); 5693 if (ret < 0) 5694 goto out; 5695 5696 slot = path->slots[0]; 5697 if (ret == 1) { 5698 if (slot == 0) { 5699 /* can't find the item, must cow */ 5700 ret = 0; 5701 goto out; 5702 } 5703 slot--; 5704 } 5705 ret = 0; 5706 leaf = path->nodes[0]; 5707 btrfs_item_key_to_cpu(leaf, &key, slot); 5708 if (key.objectid != btrfs_ino(inode) || 5709 key.type != BTRFS_EXTENT_DATA_KEY) { 5710 /* not our file or wrong item type, must cow */ 5711 goto out; 5712 } 5713 5714 if (key.offset > offset) { 5715 /* Wrong offset, must cow */ 5716 goto out; 5717 } 5718 5719 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 5720 found_type = btrfs_file_extent_type(leaf, fi); 5721 if (found_type != BTRFS_FILE_EXTENT_REG && 5722 found_type != BTRFS_FILE_EXTENT_PREALLOC) { 5723 /* not a regular extent, must cow */ 5724 goto out; 5725 } 5726 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 5727 backref_offset = btrfs_file_extent_offset(leaf, fi); 5728 5729 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 5730 if (extent_end < offset + len) { 5731 /* extent doesn't include our full range, must cow */ 5732 goto out; 5733 } 5734 5735 if (btrfs_extent_readonly(root, disk_bytenr)) 5736 goto out; 5737 5738 /* 5739 * look for other files referencing this extent, if we 5740 * find any we must cow 5741 */ 5742 if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), 5743 key.offset - backref_offset, disk_bytenr)) 5744 goto out; 5745 5746 /* 5747 * adjust disk_bytenr and num_bytes to cover just the bytes 5748 * in this extent we are about to write. If there 5749 * are any csums in that range we have to cow in order 5750 * to keep the csums correct 5751 */ 5752 disk_bytenr += backref_offset; 5753 disk_bytenr += offset - key.offset; 5754 num_bytes = min(offset + len, extent_end) - offset; 5755 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 5756 goto out; 5757 /* 5758 * all of the above have passed, it is safe to overwrite this extent 5759 * without cow 5760 */ 5761 ret = 1; 5762 out: 5763 btrfs_free_path(path); 5764 return ret; 5765 } 5766 5767 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 5768 struct buffer_head *bh_result, int create) 5769 { 5770 struct extent_map *em; 5771 struct btrfs_root *root = BTRFS_I(inode)->root; 5772 u64 start = iblock << inode->i_blkbits; 5773 u64 len = bh_result->b_size; 5774 struct btrfs_trans_handle *trans; 5775 5776 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 5777 if (IS_ERR(em)) 5778 return PTR_ERR(em); 5779 5780 /* 5781 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered 5782 * io. INLINE is special, and we could probably kludge it in here, but 5783 * it's still buffered so for safety lets just fall back to the generic 5784 * buffered path. 5785 * 5786 * For COMPRESSED we _have_ to read the entire extent in so we can 5787 * decompress it, so there will be buffering required no matter what we 5788 * do, so go ahead and fallback to buffered. 5789 * 5790 * We return -ENOTBLK because thats what makes DIO go ahead and go back 5791 * to buffered IO. Don't blame me, this is the price we pay for using 5792 * the generic code. 5793 */ 5794 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 5795 em->block_start == EXTENT_MAP_INLINE) { 5796 free_extent_map(em); 5797 return -ENOTBLK; 5798 } 5799 5800 /* Just a good old fashioned hole, return */ 5801 if (!create && (em->block_start == EXTENT_MAP_HOLE || 5802 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 5803 free_extent_map(em); 5804 /* DIO will do one hole at a time, so just unlock a sector */ 5805 unlock_extent(&BTRFS_I(inode)->io_tree, start, 5806 start + root->sectorsize - 1); 5807 return 0; 5808 } 5809 5810 /* 5811 * We don't allocate a new extent in the following cases 5812 * 5813 * 1) The inode is marked as NODATACOW. In this case we'll just use the 5814 * existing extent. 5815 * 2) The extent is marked as PREALLOC. We're good to go here and can 5816 * just use the extent. 5817 * 5818 */ 5819 if (!create) { 5820 len = em->len - (start - em->start); 5821 goto map; 5822 } 5823 5824 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 5825 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 5826 em->block_start != EXTENT_MAP_HOLE)) { 5827 int type; 5828 int ret; 5829 u64 block_start; 5830 5831 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 5832 type = BTRFS_ORDERED_PREALLOC; 5833 else 5834 type = BTRFS_ORDERED_NOCOW; 5835 len = min(len, em->len - (start - em->start)); 5836 block_start = em->block_start + (start - em->start); 5837 5838 /* 5839 * we're not going to log anything, but we do need 5840 * to make sure the current transaction stays open 5841 * while we look for nocow cross refs 5842 */ 5843 trans = btrfs_join_transaction(root); 5844 if (IS_ERR(trans)) 5845 goto must_cow; 5846 5847 if (can_nocow_odirect(trans, inode, start, len) == 1) { 5848 ret = btrfs_add_ordered_extent_dio(inode, start, 5849 block_start, len, len, type); 5850 btrfs_end_transaction(trans, root); 5851 if (ret) { 5852 free_extent_map(em); 5853 return ret; 5854 } 5855 goto unlock; 5856 } 5857 btrfs_end_transaction(trans, root); 5858 } 5859 must_cow: 5860 /* 5861 * this will cow the extent, reset the len in case we changed 5862 * it above 5863 */ 5864 len = bh_result->b_size; 5865 em = btrfs_new_extent_direct(inode, em, start, len); 5866 if (IS_ERR(em)) 5867 return PTR_ERR(em); 5868 len = min(len, em->len - (start - em->start)); 5869 unlock: 5870 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, 5871 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, 5872 0, NULL, GFP_NOFS); 5873 map: 5874 bh_result->b_blocknr = (em->block_start + (start - em->start)) >> 5875 inode->i_blkbits; 5876 bh_result->b_size = len; 5877 bh_result->b_bdev = em->bdev; 5878 set_buffer_mapped(bh_result); 5879 if (create) { 5880 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 5881 set_buffer_new(bh_result); 5882 5883 /* 5884 * Need to update the i_size under the extent lock so buffered 5885 * readers will get the updated i_size when we unlock. 5886 */ 5887 if (start + len > i_size_read(inode)) 5888 i_size_write(inode, start + len); 5889 } 5890 5891 free_extent_map(em); 5892 5893 return 0; 5894 } 5895 5896 struct btrfs_dio_private { 5897 struct inode *inode; 5898 u64 logical_offset; 5899 u64 disk_bytenr; 5900 u64 bytes; 5901 u32 *csums; 5902 void *private; 5903 5904 /* number of bios pending for this dio */ 5905 atomic_t pending_bios; 5906 5907 /* IO errors */ 5908 int errors; 5909 5910 struct bio *orig_bio; 5911 }; 5912 5913 static void btrfs_endio_direct_read(struct bio *bio, int err) 5914 { 5915 struct btrfs_dio_private *dip = bio->bi_private; 5916 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 5917 struct bio_vec *bvec = bio->bi_io_vec; 5918 struct inode *inode = dip->inode; 5919 struct btrfs_root *root = BTRFS_I(inode)->root; 5920 u64 start; 5921 u32 *private = dip->csums; 5922 5923 start = dip->logical_offset; 5924 do { 5925 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 5926 struct page *page = bvec->bv_page; 5927 char *kaddr; 5928 u32 csum = ~(u32)0; 5929 unsigned long flags; 5930 5931 local_irq_save(flags); 5932 kaddr = kmap_atomic(page); 5933 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, 5934 csum, bvec->bv_len); 5935 btrfs_csum_final(csum, (char *)&csum); 5936 kunmap_atomic(kaddr); 5937 local_irq_restore(flags); 5938 5939 flush_dcache_page(bvec->bv_page); 5940 if (csum != *private) { 5941 printk(KERN_ERR "btrfs csum failed ino %llu off" 5942 " %llu csum %u private %u\n", 5943 (unsigned long long)btrfs_ino(inode), 5944 (unsigned long long)start, 5945 csum, *private); 5946 err = -EIO; 5947 } 5948 } 5949 5950 start += bvec->bv_len; 5951 private++; 5952 bvec++; 5953 } while (bvec <= bvec_end); 5954 5955 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, 5956 dip->logical_offset + dip->bytes - 1); 5957 bio->bi_private = dip->private; 5958 5959 kfree(dip->csums); 5960 kfree(dip); 5961 5962 /* If we had a csum failure make sure to clear the uptodate flag */ 5963 if (err) 5964 clear_bit(BIO_UPTODATE, &bio->bi_flags); 5965 dio_end_io(bio, err); 5966 } 5967 5968 static void btrfs_endio_direct_write(struct bio *bio, int err) 5969 { 5970 struct btrfs_dio_private *dip = bio->bi_private; 5971 struct inode *inode = dip->inode; 5972 struct btrfs_root *root = BTRFS_I(inode)->root; 5973 struct btrfs_ordered_extent *ordered = NULL; 5974 u64 ordered_offset = dip->logical_offset; 5975 u64 ordered_bytes = dip->bytes; 5976 int ret; 5977 5978 if (err) 5979 goto out_done; 5980 again: 5981 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, 5982 &ordered_offset, 5983 ordered_bytes, !err); 5984 if (!ret) 5985 goto out_test; 5986 5987 ordered->work.func = finish_ordered_fn; 5988 ordered->work.flags = 0; 5989 btrfs_queue_worker(&root->fs_info->endio_write_workers, 5990 &ordered->work); 5991 out_test: 5992 /* 5993 * our bio might span multiple ordered extents. If we haven't 5994 * completed the accounting for the whole dio, go back and try again 5995 */ 5996 if (ordered_offset < dip->logical_offset + dip->bytes) { 5997 ordered_bytes = dip->logical_offset + dip->bytes - 5998 ordered_offset; 5999 ordered = NULL; 6000 goto again; 6001 } 6002 out_done: 6003 bio->bi_private = dip->private; 6004 6005 kfree(dip); 6006 6007 /* If we had an error make sure to clear the uptodate flag */ 6008 if (err) 6009 clear_bit(BIO_UPTODATE, &bio->bi_flags); 6010 dio_end_io(bio, err); 6011 } 6012 6013 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, 6014 struct bio *bio, int mirror_num, 6015 unsigned long bio_flags, u64 offset) 6016 { 6017 int ret; 6018 struct btrfs_root *root = BTRFS_I(inode)->root; 6019 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); 6020 BUG_ON(ret); /* -ENOMEM */ 6021 return 0; 6022 } 6023 6024 static void btrfs_end_dio_bio(struct bio *bio, int err) 6025 { 6026 struct btrfs_dio_private *dip = bio->bi_private; 6027 6028 if (err) { 6029 printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " 6030 "sector %#Lx len %u err no %d\n", 6031 (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw, 6032 (unsigned long long)bio->bi_sector, bio->bi_size, err); 6033 dip->errors = 1; 6034 6035 /* 6036 * before atomic variable goto zero, we must make sure 6037 * dip->errors is perceived to be set. 6038 */ 6039 smp_mb__before_atomic_dec(); 6040 } 6041 6042 /* if there are more bios still pending for this dio, just exit */ 6043 if (!atomic_dec_and_test(&dip->pending_bios)) 6044 goto out; 6045 6046 if (dip->errors) 6047 bio_io_error(dip->orig_bio); 6048 else { 6049 set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); 6050 bio_endio(dip->orig_bio, 0); 6051 } 6052 out: 6053 bio_put(bio); 6054 } 6055 6056 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, 6057 u64 first_sector, gfp_t gfp_flags) 6058 { 6059 int nr_vecs = bio_get_nr_vecs(bdev); 6060 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); 6061 } 6062 6063 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 6064 int rw, u64 file_offset, int skip_sum, 6065 u32 *csums, int async_submit) 6066 { 6067 int write = rw & REQ_WRITE; 6068 struct btrfs_root *root = BTRFS_I(inode)->root; 6069 int ret; 6070 6071 bio_get(bio); 6072 6073 if (!write) { 6074 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 6075 if (ret) 6076 goto err; 6077 } 6078 6079 if (skip_sum) 6080 goto map; 6081 6082 if (write && async_submit) { 6083 ret = btrfs_wq_submit_bio(root->fs_info, 6084 inode, rw, bio, 0, 0, 6085 file_offset, 6086 __btrfs_submit_bio_start_direct_io, 6087 __btrfs_submit_bio_done); 6088 goto err; 6089 } else if (write) { 6090 /* 6091 * If we aren't doing async submit, calculate the csum of the 6092 * bio now. 6093 */ 6094 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); 6095 if (ret) 6096 goto err; 6097 } else if (!skip_sum) { 6098 ret = btrfs_lookup_bio_sums_dio(root, inode, bio, 6099 file_offset, csums); 6100 if (ret) 6101 goto err; 6102 } 6103 6104 map: 6105 ret = btrfs_map_bio(root, rw, bio, 0, async_submit); 6106 err: 6107 bio_put(bio); 6108 return ret; 6109 } 6110 6111 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, 6112 int skip_sum) 6113 { 6114 struct inode *inode = dip->inode; 6115 struct btrfs_root *root = BTRFS_I(inode)->root; 6116 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 6117 struct bio *bio; 6118 struct bio *orig_bio = dip->orig_bio; 6119 struct bio_vec *bvec = orig_bio->bi_io_vec; 6120 u64 start_sector = orig_bio->bi_sector; 6121 u64 file_offset = dip->logical_offset; 6122 u64 submit_len = 0; 6123 u64 map_length; 6124 int nr_pages = 0; 6125 u32 *csums = dip->csums; 6126 int ret = 0; 6127 int async_submit = 0; 6128 int write = rw & REQ_WRITE; 6129 6130 map_length = orig_bio->bi_size; 6131 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6132 &map_length, NULL, 0); 6133 if (ret) { 6134 bio_put(orig_bio); 6135 return -EIO; 6136 } 6137 6138 if (map_length >= orig_bio->bi_size) { 6139 bio = orig_bio; 6140 goto submit; 6141 } 6142 6143 async_submit = 1; 6144 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 6145 if (!bio) 6146 return -ENOMEM; 6147 bio->bi_private = dip; 6148 bio->bi_end_io = btrfs_end_dio_bio; 6149 atomic_inc(&dip->pending_bios); 6150 6151 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 6152 if (unlikely(map_length < submit_len + bvec->bv_len || 6153 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 6154 bvec->bv_offset) < bvec->bv_len)) { 6155 /* 6156 * inc the count before we submit the bio so 6157 * we know the end IO handler won't happen before 6158 * we inc the count. Otherwise, the dip might get freed 6159 * before we're done setting it up 6160 */ 6161 atomic_inc(&dip->pending_bios); 6162 ret = __btrfs_submit_dio_bio(bio, inode, rw, 6163 file_offset, skip_sum, 6164 csums, async_submit); 6165 if (ret) { 6166 bio_put(bio); 6167 atomic_dec(&dip->pending_bios); 6168 goto out_err; 6169 } 6170 6171 /* Write's use the ordered csums */ 6172 if (!write && !skip_sum) 6173 csums = csums + nr_pages; 6174 start_sector += submit_len >> 9; 6175 file_offset += submit_len; 6176 6177 submit_len = 0; 6178 nr_pages = 0; 6179 6180 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, 6181 start_sector, GFP_NOFS); 6182 if (!bio) 6183 goto out_err; 6184 bio->bi_private = dip; 6185 bio->bi_end_io = btrfs_end_dio_bio; 6186 6187 map_length = orig_bio->bi_size; 6188 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6189 &map_length, NULL, 0); 6190 if (ret) { 6191 bio_put(bio); 6192 goto out_err; 6193 } 6194 } else { 6195 submit_len += bvec->bv_len; 6196 nr_pages ++; 6197 bvec++; 6198 } 6199 } 6200 6201 submit: 6202 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, 6203 csums, async_submit); 6204 if (!ret) 6205 return 0; 6206 6207 bio_put(bio); 6208 out_err: 6209 dip->errors = 1; 6210 /* 6211 * before atomic variable goto zero, we must 6212 * make sure dip->errors is perceived to be set. 6213 */ 6214 smp_mb__before_atomic_dec(); 6215 if (atomic_dec_and_test(&dip->pending_bios)) 6216 bio_io_error(dip->orig_bio); 6217 6218 /* bio_end_io() will handle error, so we needn't return it */ 6219 return 0; 6220 } 6221 6222 static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, 6223 loff_t file_offset) 6224 { 6225 struct btrfs_root *root = BTRFS_I(inode)->root; 6226 struct btrfs_dio_private *dip; 6227 struct bio_vec *bvec = bio->bi_io_vec; 6228 int skip_sum; 6229 int write = rw & REQ_WRITE; 6230 int ret = 0; 6231 6232 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 6233 6234 dip = kmalloc(sizeof(*dip), GFP_NOFS); 6235 if (!dip) { 6236 ret = -ENOMEM; 6237 goto free_ordered; 6238 } 6239 dip->csums = NULL; 6240 6241 /* Write's use the ordered csum stuff, so we don't need dip->csums */ 6242 if (!write && !skip_sum) { 6243 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); 6244 if (!dip->csums) { 6245 kfree(dip); 6246 ret = -ENOMEM; 6247 goto free_ordered; 6248 } 6249 } 6250 6251 dip->private = bio->bi_private; 6252 dip->inode = inode; 6253 dip->logical_offset = file_offset; 6254 6255 dip->bytes = 0; 6256 do { 6257 dip->bytes += bvec->bv_len; 6258 bvec++; 6259 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); 6260 6261 dip->disk_bytenr = (u64)bio->bi_sector << 9; 6262 bio->bi_private = dip; 6263 dip->errors = 0; 6264 dip->orig_bio = bio; 6265 atomic_set(&dip->pending_bios, 0); 6266 6267 if (write) 6268 bio->bi_end_io = btrfs_endio_direct_write; 6269 else 6270 bio->bi_end_io = btrfs_endio_direct_read; 6271 6272 ret = btrfs_submit_direct_hook(rw, dip, skip_sum); 6273 if (!ret) 6274 return; 6275 free_ordered: 6276 /* 6277 * If this is a write, we need to clean up the reserved space and kill 6278 * the ordered extent. 6279 */ 6280 if (write) { 6281 struct btrfs_ordered_extent *ordered; 6282 ordered = btrfs_lookup_ordered_extent(inode, file_offset); 6283 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && 6284 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) 6285 btrfs_free_reserved_extent(root, ordered->start, 6286 ordered->disk_len); 6287 btrfs_put_ordered_extent(ordered); 6288 btrfs_put_ordered_extent(ordered); 6289 } 6290 bio_endio(bio, ret); 6291 } 6292 6293 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, 6294 const struct iovec *iov, loff_t offset, 6295 unsigned long nr_segs) 6296 { 6297 int seg; 6298 int i; 6299 size_t size; 6300 unsigned long addr; 6301 unsigned blocksize_mask = root->sectorsize - 1; 6302 ssize_t retval = -EINVAL; 6303 loff_t end = offset; 6304 6305 if (offset & blocksize_mask) 6306 goto out; 6307 6308 /* Check the memory alignment. Blocks cannot straddle pages */ 6309 for (seg = 0; seg < nr_segs; seg++) { 6310 addr = (unsigned long)iov[seg].iov_base; 6311 size = iov[seg].iov_len; 6312 end += size; 6313 if ((addr & blocksize_mask) || (size & blocksize_mask)) 6314 goto out; 6315 6316 /* If this is a write we don't need to check anymore */ 6317 if (rw & WRITE) 6318 continue; 6319 6320 /* 6321 * Check to make sure we don't have duplicate iov_base's in this 6322 * iovec, if so return EINVAL, otherwise we'll get csum errors 6323 * when reading back. 6324 */ 6325 for (i = seg + 1; i < nr_segs; i++) { 6326 if (iov[seg].iov_base == iov[i].iov_base) 6327 goto out; 6328 } 6329 } 6330 retval = 0; 6331 out: 6332 return retval; 6333 } 6334 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 6335 const struct iovec *iov, loff_t offset, 6336 unsigned long nr_segs) 6337 { 6338 struct file *file = iocb->ki_filp; 6339 struct inode *inode = file->f_mapping->host; 6340 struct btrfs_ordered_extent *ordered; 6341 struct extent_state *cached_state = NULL; 6342 u64 lockstart, lockend; 6343 ssize_t ret; 6344 int writing = rw & WRITE; 6345 int write_bits = 0; 6346 size_t count = iov_length(iov, nr_segs); 6347 6348 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, 6349 offset, nr_segs)) { 6350 return 0; 6351 } 6352 6353 lockstart = offset; 6354 lockend = offset + count - 1; 6355 6356 if (writing) { 6357 ret = btrfs_delalloc_reserve_space(inode, count); 6358 if (ret) 6359 goto out; 6360 } 6361 6362 while (1) { 6363 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6364 0, &cached_state); 6365 /* 6366 * We're concerned with the entire range that we're going to be 6367 * doing DIO to, so we need to make sure theres no ordered 6368 * extents in this range. 6369 */ 6370 ordered = btrfs_lookup_ordered_range(inode, lockstart, 6371 lockend - lockstart + 1); 6372 6373 /* 6374 * We need to make sure there are no buffered pages in this 6375 * range either, we could have raced between the invalidate in 6376 * generic_file_direct_write and locking the extent. The 6377 * invalidate needs to happen so that reads after a write do not 6378 * get stale data. 6379 */ 6380 if (!ordered && (!writing || 6381 !test_range_bit(&BTRFS_I(inode)->io_tree, 6382 lockstart, lockend, EXTENT_UPTODATE, 0, 6383 cached_state))) 6384 break; 6385 6386 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6387 &cached_state, GFP_NOFS); 6388 6389 if (ordered) { 6390 btrfs_start_ordered_extent(inode, ordered, 1); 6391 btrfs_put_ordered_extent(ordered); 6392 } else { 6393 /* Screw you mmap */ 6394 ret = filemap_write_and_wait_range(file->f_mapping, 6395 lockstart, 6396 lockend); 6397 if (ret) 6398 goto out; 6399 6400 /* 6401 * If we found a page that couldn't be invalidated just 6402 * fall back to buffered. 6403 */ 6404 ret = invalidate_inode_pages2_range(file->f_mapping, 6405 lockstart >> PAGE_CACHE_SHIFT, 6406 lockend >> PAGE_CACHE_SHIFT); 6407 if (ret) { 6408 if (ret == -EBUSY) 6409 ret = 0; 6410 goto out; 6411 } 6412 } 6413 6414 cond_resched(); 6415 } 6416 6417 /* 6418 * we don't use btrfs_set_extent_delalloc because we don't want 6419 * the dirty or uptodate bits 6420 */ 6421 if (writing) { 6422 write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; 6423 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6424 EXTENT_DELALLOC, NULL, &cached_state, 6425 GFP_NOFS); 6426 if (ret) { 6427 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6428 lockend, EXTENT_LOCKED | write_bits, 6429 1, 0, &cached_state, GFP_NOFS); 6430 goto out; 6431 } 6432 } 6433 6434 free_extent_state(cached_state); 6435 cached_state = NULL; 6436 6437 ret = __blockdev_direct_IO(rw, iocb, inode, 6438 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 6439 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, 6440 btrfs_submit_direct, 0); 6441 6442 if (ret < 0 && ret != -EIOCBQUEUED) { 6443 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, 6444 offset + iov_length(iov, nr_segs) - 1, 6445 EXTENT_LOCKED | write_bits, 1, 0, 6446 &cached_state, GFP_NOFS); 6447 } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { 6448 /* 6449 * We're falling back to buffered, unlock the section we didn't 6450 * do IO on. 6451 */ 6452 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret, 6453 offset + iov_length(iov, nr_segs) - 1, 6454 EXTENT_LOCKED | write_bits, 1, 0, 6455 &cached_state, GFP_NOFS); 6456 } 6457 out: 6458 free_extent_state(cached_state); 6459 return ret; 6460 } 6461 6462 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6463 __u64 start, __u64 len) 6464 { 6465 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 6466 } 6467 6468 int btrfs_readpage(struct file *file, struct page *page) 6469 { 6470 struct extent_io_tree *tree; 6471 tree = &BTRFS_I(page->mapping->host)->io_tree; 6472 return extent_read_full_page(tree, page, btrfs_get_extent, 0); 6473 } 6474 6475 static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 6476 { 6477 struct extent_io_tree *tree; 6478 6479 6480 if (current->flags & PF_MEMALLOC) { 6481 redirty_page_for_writepage(wbc, page); 6482 unlock_page(page); 6483 return 0; 6484 } 6485 tree = &BTRFS_I(page->mapping->host)->io_tree; 6486 return extent_write_full_page(tree, page, btrfs_get_extent, wbc); 6487 } 6488 6489 int btrfs_writepages(struct address_space *mapping, 6490 struct writeback_control *wbc) 6491 { 6492 struct extent_io_tree *tree; 6493 6494 tree = &BTRFS_I(mapping->host)->io_tree; 6495 return extent_writepages(tree, mapping, btrfs_get_extent, wbc); 6496 } 6497 6498 static int 6499 btrfs_readpages(struct file *file, struct address_space *mapping, 6500 struct list_head *pages, unsigned nr_pages) 6501 { 6502 struct extent_io_tree *tree; 6503 tree = &BTRFS_I(mapping->host)->io_tree; 6504 return extent_readpages(tree, mapping, pages, nr_pages, 6505 btrfs_get_extent); 6506 } 6507 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) 6508 { 6509 struct extent_io_tree *tree; 6510 struct extent_map_tree *map; 6511 int ret; 6512 6513 tree = &BTRFS_I(page->mapping->host)->io_tree; 6514 map = &BTRFS_I(page->mapping->host)->extent_tree; 6515 ret = try_release_extent_mapping(map, tree, page, gfp_flags); 6516 if (ret == 1) { 6517 ClearPagePrivate(page); 6518 set_page_private(page, 0); 6519 page_cache_release(page); 6520 } 6521 return ret; 6522 } 6523 6524 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) 6525 { 6526 if (PageWriteback(page) || PageDirty(page)) 6527 return 0; 6528 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); 6529 } 6530 6531 static void btrfs_invalidatepage(struct page *page, unsigned long offset) 6532 { 6533 struct inode *inode = page->mapping->host; 6534 struct extent_io_tree *tree; 6535 struct btrfs_ordered_extent *ordered; 6536 struct extent_state *cached_state = NULL; 6537 u64 page_start = page_offset(page); 6538 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 6539 6540 /* 6541 * we have the page locked, so new writeback can't start, 6542 * and the dirty bit won't be cleared while we are here. 6543 * 6544 * Wait for IO on this page so that we can safely clear 6545 * the PagePrivate2 bit and do ordered accounting 6546 */ 6547 wait_on_page_writeback(page); 6548 6549 tree = &BTRFS_I(inode)->io_tree; 6550 if (offset) { 6551 btrfs_releasepage(page, GFP_NOFS); 6552 return; 6553 } 6554 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 6555 ordered = btrfs_lookup_ordered_extent(inode, 6556 page_offset(page)); 6557 if (ordered) { 6558 /* 6559 * IO on this page will never be started, so we need 6560 * to account for any ordered extents now 6561 */ 6562 clear_extent_bit(tree, page_start, page_end, 6563 EXTENT_DIRTY | EXTENT_DELALLOC | 6564 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 6565 &cached_state, GFP_NOFS); 6566 /* 6567 * whoever cleared the private bit is responsible 6568 * for the finish_ordered_io 6569 */ 6570 if (TestClearPagePrivate2(page) && 6571 btrfs_dec_test_ordered_pending(inode, &ordered, page_start, 6572 PAGE_CACHE_SIZE, 1)) { 6573 btrfs_finish_ordered_io(ordered); 6574 } 6575 btrfs_put_ordered_extent(ordered); 6576 cached_state = NULL; 6577 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 6578 } 6579 clear_extent_bit(tree, page_start, page_end, 6580 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 6581 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); 6582 __btrfs_releasepage(page, GFP_NOFS); 6583 6584 ClearPageChecked(page); 6585 if (PagePrivate(page)) { 6586 ClearPagePrivate(page); 6587 set_page_private(page, 0); 6588 page_cache_release(page); 6589 } 6590 } 6591 6592 /* 6593 * btrfs_page_mkwrite() is not allowed to change the file size as it gets 6594 * called from a page fault handler when a page is first dirtied. Hence we must 6595 * be careful to check for EOF conditions here. We set the page up correctly 6596 * for a written page which means we get ENOSPC checking when writing into 6597 * holes and correct delalloc and unwritten extent mapping on filesystems that 6598 * support these features. 6599 * 6600 * We are not allowed to take the i_mutex here so we have to play games to 6601 * protect against truncate races as the page could now be beyond EOF. Because 6602 * vmtruncate() writes the inode size before removing pages, once we have the 6603 * page lock we can determine safely if the page is beyond EOF. If it is not 6604 * beyond EOF, then the page is guaranteed safe against truncation until we 6605 * unlock the page. 6606 */ 6607 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 6608 { 6609 struct page *page = vmf->page; 6610 struct inode *inode = fdentry(vma->vm_file)->d_inode; 6611 struct btrfs_root *root = BTRFS_I(inode)->root; 6612 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 6613 struct btrfs_ordered_extent *ordered; 6614 struct extent_state *cached_state = NULL; 6615 char *kaddr; 6616 unsigned long zero_start; 6617 loff_t size; 6618 int ret; 6619 int reserved = 0; 6620 u64 page_start; 6621 u64 page_end; 6622 6623 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6624 if (!ret) { 6625 ret = file_update_time(vma->vm_file); 6626 reserved = 1; 6627 } 6628 if (ret) { 6629 if (ret == -ENOMEM) 6630 ret = VM_FAULT_OOM; 6631 else /* -ENOSPC, -EIO, etc */ 6632 ret = VM_FAULT_SIGBUS; 6633 if (reserved) 6634 goto out; 6635 goto out_noreserve; 6636 } 6637 6638 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 6639 again: 6640 lock_page(page); 6641 size = i_size_read(inode); 6642 page_start = page_offset(page); 6643 page_end = page_start + PAGE_CACHE_SIZE - 1; 6644 6645 if ((page->mapping != inode->i_mapping) || 6646 (page_start >= size)) { 6647 /* page got truncated out from underneath us */ 6648 goto out_unlock; 6649 } 6650 wait_on_page_writeback(page); 6651 6652 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); 6653 set_page_extent_mapped(page); 6654 6655 /* 6656 * we can't set the delalloc bits if there are pending ordered 6657 * extents. Drop our locks and wait for them to finish 6658 */ 6659 ordered = btrfs_lookup_ordered_extent(inode, page_start); 6660 if (ordered) { 6661 unlock_extent_cached(io_tree, page_start, page_end, 6662 &cached_state, GFP_NOFS); 6663 unlock_page(page); 6664 btrfs_start_ordered_extent(inode, ordered, 1); 6665 btrfs_put_ordered_extent(ordered); 6666 goto again; 6667 } 6668 6669 /* 6670 * XXX - page_mkwrite gets called every time the page is dirtied, even 6671 * if it was already dirty, so for space accounting reasons we need to 6672 * clear any delalloc bits for the range we are fixing to save. There 6673 * is probably a better way to do this, but for now keep consistent with 6674 * prepare_pages in the normal write path. 6675 */ 6676 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 6677 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 6678 0, 0, &cached_state, GFP_NOFS); 6679 6680 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 6681 &cached_state); 6682 if (ret) { 6683 unlock_extent_cached(io_tree, page_start, page_end, 6684 &cached_state, GFP_NOFS); 6685 ret = VM_FAULT_SIGBUS; 6686 goto out_unlock; 6687 } 6688 ret = 0; 6689 6690 /* page is wholly or partially inside EOF */ 6691 if (page_start + PAGE_CACHE_SIZE > size) 6692 zero_start = size & ~PAGE_CACHE_MASK; 6693 else 6694 zero_start = PAGE_CACHE_SIZE; 6695 6696 if (zero_start != PAGE_CACHE_SIZE) { 6697 kaddr = kmap(page); 6698 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); 6699 flush_dcache_page(page); 6700 kunmap(page); 6701 } 6702 ClearPageChecked(page); 6703 set_page_dirty(page); 6704 SetPageUptodate(page); 6705 6706 BTRFS_I(inode)->last_trans = root->fs_info->generation; 6707 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 6708 6709 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6710 6711 out_unlock: 6712 if (!ret) 6713 return VM_FAULT_LOCKED; 6714 unlock_page(page); 6715 out: 6716 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 6717 out_noreserve: 6718 return ret; 6719 } 6720 6721 static int btrfs_truncate(struct inode *inode) 6722 { 6723 struct btrfs_root *root = BTRFS_I(inode)->root; 6724 struct btrfs_block_rsv *rsv; 6725 int ret; 6726 int err = 0; 6727 struct btrfs_trans_handle *trans; 6728 unsigned long nr; 6729 u64 mask = root->sectorsize - 1; 6730 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6731 6732 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6733 if (ret) 6734 return ret; 6735 6736 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6737 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6738 6739 /* 6740 * Yes ladies and gentelment, this is indeed ugly. The fact is we have 6741 * 3 things going on here 6742 * 6743 * 1) We need to reserve space for our orphan item and the space to 6744 * delete our orphan item. Lord knows we don't want to have a dangling 6745 * orphan item because we didn't reserve space to remove it. 6746 * 6747 * 2) We need to reserve space to update our inode. 6748 * 6749 * 3) We need to have something to cache all the space that is going to 6750 * be free'd up by the truncate operation, but also have some slack 6751 * space reserved in case it uses space during the truncate (thank you 6752 * very much snapshotting). 6753 * 6754 * And we need these to all be seperate. The fact is we can use alot of 6755 * space doing the truncate, and we have no earthly idea how much space 6756 * we will use, so we need the truncate reservation to be seperate so it 6757 * doesn't end up using space reserved for updating the inode or 6758 * removing the orphan item. We also need to be able to stop the 6759 * transaction and start a new one, which means we need to be able to 6760 * update the inode several times, and we have no idea of knowing how 6761 * many times that will be, so we can't just reserve 1 item for the 6762 * entirety of the opration, so that has to be done seperately as well. 6763 * Then there is the orphan item, which does indeed need to be held on 6764 * to for the whole operation, and we need nobody to touch this reserved 6765 * space except the orphan code. 6766 * 6767 * So that leaves us with 6768 * 6769 * 1) root->orphan_block_rsv - for the orphan deletion. 6770 * 2) rsv - for the truncate reservation, which we will steal from the 6771 * transaction reservation. 6772 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for 6773 * updating the inode. 6774 */ 6775 rsv = btrfs_alloc_block_rsv(root); 6776 if (!rsv) 6777 return -ENOMEM; 6778 rsv->size = min_size; 6779 6780 /* 6781 * 1 for the truncate slack space 6782 * 1 for the orphan item we're going to add 6783 * 1 for the orphan item deletion 6784 * 1 for updating the inode. 6785 */ 6786 trans = btrfs_start_transaction(root, 4); 6787 if (IS_ERR(trans)) { 6788 err = PTR_ERR(trans); 6789 goto out; 6790 } 6791 6792 /* Migrate the slack space for the truncate to our reserve */ 6793 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, 6794 min_size); 6795 BUG_ON(ret); 6796 6797 ret = btrfs_orphan_add(trans, inode); 6798 if (ret) { 6799 btrfs_end_transaction(trans, root); 6800 goto out; 6801 } 6802 6803 /* 6804 * setattr is responsible for setting the ordered_data_close flag, 6805 * but that is only tested during the last file release. That 6806 * could happen well after the next commit, leaving a great big 6807 * window where new writes may get lost if someone chooses to write 6808 * to this file after truncating to zero 6809 * 6810 * The inode doesn't have any dirty data here, and so if we commit 6811 * this is a noop. If someone immediately starts writing to the inode 6812 * it is very likely we'll catch some of their writes in this 6813 * transaction, and the commit will find this file on the ordered 6814 * data list with good things to send down. 6815 * 6816 * This is a best effort solution, there is still a window where 6817 * using truncate to replace the contents of the file will 6818 * end up with a zero length file after a crash. 6819 */ 6820 if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 6821 &BTRFS_I(inode)->runtime_flags)) 6822 btrfs_add_ordered_operation(trans, root, inode); 6823 6824 while (1) { 6825 ret = btrfs_block_rsv_refill(root, rsv, min_size); 6826 if (ret) { 6827 /* 6828 * This can only happen with the original transaction we 6829 * started above, every other time we shouldn't have a 6830 * transaction started yet. 6831 */ 6832 if (ret == -EAGAIN) 6833 goto end_trans; 6834 err = ret; 6835 break; 6836 } 6837 6838 if (!trans) { 6839 /* Just need the 1 for updating the inode */ 6840 trans = btrfs_start_transaction(root, 1); 6841 if (IS_ERR(trans)) { 6842 ret = err = PTR_ERR(trans); 6843 trans = NULL; 6844 break; 6845 } 6846 } 6847 6848 trans->block_rsv = rsv; 6849 6850 ret = btrfs_truncate_inode_items(trans, root, inode, 6851 inode->i_size, 6852 BTRFS_EXTENT_DATA_KEY); 6853 if (ret != -EAGAIN) { 6854 err = ret; 6855 break; 6856 } 6857 6858 trans->block_rsv = &root->fs_info->trans_block_rsv; 6859 ret = btrfs_update_inode(trans, root, inode); 6860 if (ret) { 6861 err = ret; 6862 break; 6863 } 6864 end_trans: 6865 nr = trans->blocks_used; 6866 btrfs_end_transaction(trans, root); 6867 trans = NULL; 6868 btrfs_btree_balance_dirty(root, nr); 6869 } 6870 6871 if (ret == 0 && inode->i_nlink > 0) { 6872 trans->block_rsv = root->orphan_block_rsv; 6873 ret = btrfs_orphan_del(trans, inode); 6874 if (ret) 6875 err = ret; 6876 } else if (ret && inode->i_nlink > 0) { 6877 /* 6878 * Failed to do the truncate, remove us from the in memory 6879 * orphan list. 6880 */ 6881 ret = btrfs_orphan_del(NULL, inode); 6882 } 6883 6884 if (trans) { 6885 trans->block_rsv = &root->fs_info->trans_block_rsv; 6886 ret = btrfs_update_inode(trans, root, inode); 6887 if (ret && !err) 6888 err = ret; 6889 6890 nr = trans->blocks_used; 6891 ret = btrfs_end_transaction(trans, root); 6892 btrfs_btree_balance_dirty(root, nr); 6893 } 6894 6895 out: 6896 btrfs_free_block_rsv(root, rsv); 6897 6898 if (ret && !err) 6899 err = ret; 6900 6901 return err; 6902 } 6903 6904 /* 6905 * create a new subvolume directory/inode (helper for the ioctl). 6906 */ 6907 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 6908 struct btrfs_root *new_root, u64 new_dirid) 6909 { 6910 struct inode *inode; 6911 int err; 6912 u64 index = 0; 6913 6914 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, 6915 new_dirid, new_dirid, 6916 S_IFDIR | (~current_umask() & S_IRWXUGO), 6917 &index); 6918 if (IS_ERR(inode)) 6919 return PTR_ERR(inode); 6920 inode->i_op = &btrfs_dir_inode_operations; 6921 inode->i_fop = &btrfs_dir_file_operations; 6922 6923 set_nlink(inode, 1); 6924 btrfs_i_size_write(inode, 0); 6925 6926 err = btrfs_update_inode(trans, new_root, inode); 6927 6928 iput(inode); 6929 return err; 6930 } 6931 6932 struct inode *btrfs_alloc_inode(struct super_block *sb) 6933 { 6934 struct btrfs_inode *ei; 6935 struct inode *inode; 6936 6937 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 6938 if (!ei) 6939 return NULL; 6940 6941 ei->root = NULL; 6942 ei->space_info = NULL; 6943 ei->generation = 0; 6944 ei->last_trans = 0; 6945 ei->last_sub_trans = 0; 6946 ei->logged_trans = 0; 6947 ei->delalloc_bytes = 0; 6948 ei->disk_i_size = 0; 6949 ei->flags = 0; 6950 ei->csum_bytes = 0; 6951 ei->index_cnt = (u64)-1; 6952 ei->last_unlink_trans = 0; 6953 6954 spin_lock_init(&ei->lock); 6955 ei->outstanding_extents = 0; 6956 ei->reserved_extents = 0; 6957 6958 ei->runtime_flags = 0; 6959 ei->force_compress = BTRFS_COMPRESS_NONE; 6960 6961 ei->delayed_node = NULL; 6962 6963 inode = &ei->vfs_inode; 6964 extent_map_tree_init(&ei->extent_tree); 6965 extent_io_tree_init(&ei->io_tree, &inode->i_data); 6966 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 6967 ei->io_tree.track_uptodate = 1; 6968 ei->io_failure_tree.track_uptodate = 1; 6969 mutex_init(&ei->log_mutex); 6970 mutex_init(&ei->delalloc_mutex); 6971 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6972 INIT_LIST_HEAD(&ei->delalloc_inodes); 6973 INIT_LIST_HEAD(&ei->ordered_operations); 6974 RB_CLEAR_NODE(&ei->rb_node); 6975 6976 return inode; 6977 } 6978 6979 static void btrfs_i_callback(struct rcu_head *head) 6980 { 6981 struct inode *inode = container_of(head, struct inode, i_rcu); 6982 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 6983 } 6984 6985 void btrfs_destroy_inode(struct inode *inode) 6986 { 6987 struct btrfs_ordered_extent *ordered; 6988 struct btrfs_root *root = BTRFS_I(inode)->root; 6989 6990 WARN_ON(!list_empty(&inode->i_dentry)); 6991 WARN_ON(inode->i_data.nrpages); 6992 WARN_ON(BTRFS_I(inode)->outstanding_extents); 6993 WARN_ON(BTRFS_I(inode)->reserved_extents); 6994 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 6995 WARN_ON(BTRFS_I(inode)->csum_bytes); 6996 6997 /* 6998 * This can happen where we create an inode, but somebody else also 6999 * created the same inode and we need to destroy the one we already 7000 * created. 7001 */ 7002 if (!root) 7003 goto free; 7004 7005 /* 7006 * Make sure we're properly removed from the ordered operation 7007 * lists. 7008 */ 7009 smp_mb(); 7010 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { 7011 spin_lock(&root->fs_info->ordered_extent_lock); 7012 list_del_init(&BTRFS_I(inode)->ordered_operations); 7013 spin_unlock(&root->fs_info->ordered_extent_lock); 7014 } 7015 7016 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 7017 &BTRFS_I(inode)->runtime_flags)) { 7018 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", 7019 (unsigned long long)btrfs_ino(inode)); 7020 atomic_dec(&root->orphan_inodes); 7021 } 7022 7023 while (1) { 7024 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 7025 if (!ordered) 7026 break; 7027 else { 7028 printk(KERN_ERR "btrfs found ordered " 7029 "extent %llu %llu on inode cleanup\n", 7030 (unsigned long long)ordered->file_offset, 7031 (unsigned long long)ordered->len); 7032 btrfs_remove_ordered_extent(inode, ordered); 7033 btrfs_put_ordered_extent(ordered); 7034 btrfs_put_ordered_extent(ordered); 7035 } 7036 } 7037 inode_tree_del(inode); 7038 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 7039 free: 7040 btrfs_remove_delayed_node(inode); 7041 call_rcu(&inode->i_rcu, btrfs_i_callback); 7042 } 7043 7044 int btrfs_drop_inode(struct inode *inode) 7045 { 7046 struct btrfs_root *root = BTRFS_I(inode)->root; 7047 7048 if (btrfs_root_refs(&root->root_item) == 0 && 7049 !btrfs_is_free_space_inode(root, inode)) 7050 return 1; 7051 else 7052 return generic_drop_inode(inode); 7053 } 7054 7055 static void init_once(void *foo) 7056 { 7057 struct btrfs_inode *ei = (struct btrfs_inode *) foo; 7058 7059 inode_init_once(&ei->vfs_inode); 7060 } 7061 7062 void btrfs_destroy_cachep(void) 7063 { 7064 if (btrfs_inode_cachep) 7065 kmem_cache_destroy(btrfs_inode_cachep); 7066 if (btrfs_trans_handle_cachep) 7067 kmem_cache_destroy(btrfs_trans_handle_cachep); 7068 if (btrfs_transaction_cachep) 7069 kmem_cache_destroy(btrfs_transaction_cachep); 7070 if (btrfs_path_cachep) 7071 kmem_cache_destroy(btrfs_path_cachep); 7072 if (btrfs_free_space_cachep) 7073 kmem_cache_destroy(btrfs_free_space_cachep); 7074 } 7075 7076 int btrfs_init_cachep(void) 7077 { 7078 btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", 7079 sizeof(struct btrfs_inode), 0, 7080 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); 7081 if (!btrfs_inode_cachep) 7082 goto fail; 7083 7084 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", 7085 sizeof(struct btrfs_trans_handle), 0, 7086 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7087 if (!btrfs_trans_handle_cachep) 7088 goto fail; 7089 7090 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", 7091 sizeof(struct btrfs_transaction), 0, 7092 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7093 if (!btrfs_transaction_cachep) 7094 goto fail; 7095 7096 btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", 7097 sizeof(struct btrfs_path), 0, 7098 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7099 if (!btrfs_path_cachep) 7100 goto fail; 7101 7102 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", 7103 sizeof(struct btrfs_free_space), 0, 7104 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7105 if (!btrfs_free_space_cachep) 7106 goto fail; 7107 7108 return 0; 7109 fail: 7110 btrfs_destroy_cachep(); 7111 return -ENOMEM; 7112 } 7113 7114 static int btrfs_getattr(struct vfsmount *mnt, 7115 struct dentry *dentry, struct kstat *stat) 7116 { 7117 struct inode *inode = dentry->d_inode; 7118 u32 blocksize = inode->i_sb->s_blocksize; 7119 7120 generic_fillattr(inode, stat); 7121 stat->dev = BTRFS_I(inode)->root->anon_dev; 7122 stat->blksize = PAGE_CACHE_SIZE; 7123 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + 7124 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; 7125 return 0; 7126 } 7127 7128 /* 7129 * If a file is moved, it will inherit the cow and compression flags of the new 7130 * directory. 7131 */ 7132 static void fixup_inode_flags(struct inode *dir, struct inode *inode) 7133 { 7134 struct btrfs_inode *b_dir = BTRFS_I(dir); 7135 struct btrfs_inode *b_inode = BTRFS_I(inode); 7136 7137 if (b_dir->flags & BTRFS_INODE_NODATACOW) 7138 b_inode->flags |= BTRFS_INODE_NODATACOW; 7139 else 7140 b_inode->flags &= ~BTRFS_INODE_NODATACOW; 7141 7142 if (b_dir->flags & BTRFS_INODE_COMPRESS) { 7143 b_inode->flags |= BTRFS_INODE_COMPRESS; 7144 b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; 7145 } else { 7146 b_inode->flags &= ~(BTRFS_INODE_COMPRESS | 7147 BTRFS_INODE_NOCOMPRESS); 7148 } 7149 } 7150 7151 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 7152 struct inode *new_dir, struct dentry *new_dentry) 7153 { 7154 struct btrfs_trans_handle *trans; 7155 struct btrfs_root *root = BTRFS_I(old_dir)->root; 7156 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 7157 struct inode *new_inode = new_dentry->d_inode; 7158 struct inode *old_inode = old_dentry->d_inode; 7159 struct timespec ctime = CURRENT_TIME; 7160 u64 index = 0; 7161 u64 root_objectid; 7162 int ret; 7163 u64 old_ino = btrfs_ino(old_inode); 7164 7165 if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 7166 return -EPERM; 7167 7168 /* we only allow rename subvolume link between subvolumes */ 7169 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 7170 return -EXDEV; 7171 7172 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || 7173 (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID)) 7174 return -ENOTEMPTY; 7175 7176 if (S_ISDIR(old_inode->i_mode) && new_inode && 7177 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 7178 return -ENOTEMPTY; 7179 /* 7180 * we're using rename to replace one file with another. 7181 * and the replacement file is large. Start IO on it now so 7182 * we don't add too much work to the end of the transaction 7183 */ 7184 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && 7185 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 7186 filemap_flush(old_inode->i_mapping); 7187 7188 /* close the racy window with snapshot create/destroy ioctl */ 7189 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 7190 down_read(&root->fs_info->subvol_sem); 7191 /* 7192 * We want to reserve the absolute worst case amount of items. So if 7193 * both inodes are subvols and we need to unlink them then that would 7194 * require 4 item modifications, but if they are both normal inodes it 7195 * would require 5 item modifications, so we'll assume their normal 7196 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items 7197 * should cover the worst case number of items we'll modify. 7198 */ 7199 trans = btrfs_start_transaction(root, 20); 7200 if (IS_ERR(trans)) { 7201 ret = PTR_ERR(trans); 7202 goto out_notrans; 7203 } 7204 7205 if (dest != root) 7206 btrfs_record_root_in_trans(trans, dest); 7207 7208 ret = btrfs_set_inode_index(new_dir, &index); 7209 if (ret) 7210 goto out_fail; 7211 7212 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 7213 /* force full log commit if subvolume involved. */ 7214 root->fs_info->last_trans_log_full_commit = trans->transid; 7215 } else { 7216 ret = btrfs_insert_inode_ref(trans, dest, 7217 new_dentry->d_name.name, 7218 new_dentry->d_name.len, 7219 old_ino, 7220 btrfs_ino(new_dir), index); 7221 if (ret) 7222 goto out_fail; 7223 /* 7224 * this is an ugly little race, but the rename is required 7225 * to make sure that if we crash, the inode is either at the 7226 * old name or the new one. pinning the log transaction lets 7227 * us make sure we don't allow a log commit to come in after 7228 * we unlink the name but before we add the new name back in. 7229 */ 7230 btrfs_pin_log_trans(root); 7231 } 7232 /* 7233 * make sure the inode gets flushed if it is replacing 7234 * something. 7235 */ 7236 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) 7237 btrfs_add_ordered_operation(trans, root, old_inode); 7238 7239 inode_inc_iversion(old_dir); 7240 inode_inc_iversion(new_dir); 7241 inode_inc_iversion(old_inode); 7242 old_dir->i_ctime = old_dir->i_mtime = ctime; 7243 new_dir->i_ctime = new_dir->i_mtime = ctime; 7244 old_inode->i_ctime = ctime; 7245 7246 if (old_dentry->d_parent != new_dentry->d_parent) 7247 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 7248 7249 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 7250 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; 7251 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, 7252 old_dentry->d_name.name, 7253 old_dentry->d_name.len); 7254 } else { 7255 ret = __btrfs_unlink_inode(trans, root, old_dir, 7256 old_dentry->d_inode, 7257 old_dentry->d_name.name, 7258 old_dentry->d_name.len); 7259 if (!ret) 7260 ret = btrfs_update_inode(trans, root, old_inode); 7261 } 7262 if (ret) { 7263 btrfs_abort_transaction(trans, root, ret); 7264 goto out_fail; 7265 } 7266 7267 if (new_inode) { 7268 inode_inc_iversion(new_inode); 7269 new_inode->i_ctime = CURRENT_TIME; 7270 if (unlikely(btrfs_ino(new_inode) == 7271 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 7272 root_objectid = BTRFS_I(new_inode)->location.objectid; 7273 ret = btrfs_unlink_subvol(trans, dest, new_dir, 7274 root_objectid, 7275 new_dentry->d_name.name, 7276 new_dentry->d_name.len); 7277 BUG_ON(new_inode->i_nlink == 0); 7278 } else { 7279 ret = btrfs_unlink_inode(trans, dest, new_dir, 7280 new_dentry->d_inode, 7281 new_dentry->d_name.name, 7282 new_dentry->d_name.len); 7283 } 7284 if (!ret && new_inode->i_nlink == 0) { 7285 ret = btrfs_orphan_add(trans, new_dentry->d_inode); 7286 BUG_ON(ret); 7287 } 7288 if (ret) { 7289 btrfs_abort_transaction(trans, root, ret); 7290 goto out_fail; 7291 } 7292 } 7293 7294 fixup_inode_flags(new_dir, old_inode); 7295 7296 ret = btrfs_add_link(trans, new_dir, old_inode, 7297 new_dentry->d_name.name, 7298 new_dentry->d_name.len, 0, index); 7299 if (ret) { 7300 btrfs_abort_transaction(trans, root, ret); 7301 goto out_fail; 7302 } 7303 7304 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { 7305 struct dentry *parent = new_dentry->d_parent; 7306 btrfs_log_new_name(trans, old_inode, old_dir, parent); 7307 btrfs_end_log_trans(root); 7308 } 7309 out_fail: 7310 btrfs_end_transaction(trans, root); 7311 out_notrans: 7312 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 7313 up_read(&root->fs_info->subvol_sem); 7314 7315 return ret; 7316 } 7317 7318 /* 7319 * some fairly slow code that needs optimization. This walks the list 7320 * of all the inodes with pending delalloc and forces them to disk. 7321 */ 7322 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 7323 { 7324 struct list_head *head = &root->fs_info->delalloc_inodes; 7325 struct btrfs_inode *binode; 7326 struct inode *inode; 7327 7328 if (root->fs_info->sb->s_flags & MS_RDONLY) 7329 return -EROFS; 7330 7331 spin_lock(&root->fs_info->delalloc_lock); 7332 while (!list_empty(head)) { 7333 binode = list_entry(head->next, struct btrfs_inode, 7334 delalloc_inodes); 7335 inode = igrab(&binode->vfs_inode); 7336 if (!inode) 7337 list_del_init(&binode->delalloc_inodes); 7338 spin_unlock(&root->fs_info->delalloc_lock); 7339 if (inode) { 7340 filemap_flush(inode->i_mapping); 7341 if (delay_iput) 7342 btrfs_add_delayed_iput(inode); 7343 else 7344 iput(inode); 7345 } 7346 cond_resched(); 7347 spin_lock(&root->fs_info->delalloc_lock); 7348 } 7349 spin_unlock(&root->fs_info->delalloc_lock); 7350 7351 /* the filemap_flush will queue IO into the worker threads, but 7352 * we have to make sure the IO is actually started and that 7353 * ordered extents get created before we return 7354 */ 7355 atomic_inc(&root->fs_info->async_submit_draining); 7356 while (atomic_read(&root->fs_info->nr_async_submits) || 7357 atomic_read(&root->fs_info->async_delalloc_pages)) { 7358 wait_event(root->fs_info->async_submit_wait, 7359 (atomic_read(&root->fs_info->nr_async_submits) == 0 && 7360 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 7361 } 7362 atomic_dec(&root->fs_info->async_submit_draining); 7363 return 0; 7364 } 7365 7366 static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 7367 const char *symname) 7368 { 7369 struct btrfs_trans_handle *trans; 7370 struct btrfs_root *root = BTRFS_I(dir)->root; 7371 struct btrfs_path *path; 7372 struct btrfs_key key; 7373 struct inode *inode = NULL; 7374 int err; 7375 int drop_inode = 0; 7376 u64 objectid; 7377 u64 index = 0 ; 7378 int name_len; 7379 int datasize; 7380 unsigned long ptr; 7381 struct btrfs_file_extent_item *ei; 7382 struct extent_buffer *leaf; 7383 unsigned long nr = 0; 7384 7385 name_len = strlen(symname) + 1; 7386 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 7387 return -ENAMETOOLONG; 7388 7389 /* 7390 * 2 items for inode item and ref 7391 * 2 items for dir items 7392 * 1 item for xattr if selinux is on 7393 */ 7394 trans = btrfs_start_transaction(root, 5); 7395 if (IS_ERR(trans)) 7396 return PTR_ERR(trans); 7397 7398 err = btrfs_find_free_ino(root, &objectid); 7399 if (err) 7400 goto out_unlock; 7401 7402 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 7403 dentry->d_name.len, btrfs_ino(dir), objectid, 7404 S_IFLNK|S_IRWXUGO, &index); 7405 if (IS_ERR(inode)) { 7406 err = PTR_ERR(inode); 7407 goto out_unlock; 7408 } 7409 7410 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 7411 if (err) { 7412 drop_inode = 1; 7413 goto out_unlock; 7414 } 7415 7416 /* 7417 * If the active LSM wants to access the inode during 7418 * d_instantiate it needs these. Smack checks to see 7419 * if the filesystem supports xattrs by looking at the 7420 * ops vector. 7421 */ 7422 inode->i_fop = &btrfs_file_operations; 7423 inode->i_op = &btrfs_file_inode_operations; 7424 7425 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 7426 if (err) 7427 drop_inode = 1; 7428 else { 7429 inode->i_mapping->a_ops = &btrfs_aops; 7430 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 7431 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 7432 } 7433 if (drop_inode) 7434 goto out_unlock; 7435 7436 path = btrfs_alloc_path(); 7437 if (!path) { 7438 err = -ENOMEM; 7439 drop_inode = 1; 7440 goto out_unlock; 7441 } 7442 key.objectid = btrfs_ino(inode); 7443 key.offset = 0; 7444 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 7445 datasize = btrfs_file_extent_calc_inline_size(name_len); 7446 err = btrfs_insert_empty_item(trans, root, path, &key, 7447 datasize); 7448 if (err) { 7449 drop_inode = 1; 7450 btrfs_free_path(path); 7451 goto out_unlock; 7452 } 7453 leaf = path->nodes[0]; 7454 ei = btrfs_item_ptr(leaf, path->slots[0], 7455 struct btrfs_file_extent_item); 7456 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 7457 btrfs_set_file_extent_type(leaf, ei, 7458 BTRFS_FILE_EXTENT_INLINE); 7459 btrfs_set_file_extent_encryption(leaf, ei, 0); 7460 btrfs_set_file_extent_compression(leaf, ei, 0); 7461 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 7462 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); 7463 7464 ptr = btrfs_file_extent_inline_start(ei); 7465 write_extent_buffer(leaf, symname, ptr, name_len); 7466 btrfs_mark_buffer_dirty(leaf); 7467 btrfs_free_path(path); 7468 7469 inode->i_op = &btrfs_symlink_inode_operations; 7470 inode->i_mapping->a_ops = &btrfs_symlink_aops; 7471 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 7472 inode_set_bytes(inode, name_len); 7473 btrfs_i_size_write(inode, name_len - 1); 7474 err = btrfs_update_inode(trans, root, inode); 7475 if (err) 7476 drop_inode = 1; 7477 7478 out_unlock: 7479 if (!err) 7480 d_instantiate(dentry, inode); 7481 nr = trans->blocks_used; 7482 btrfs_end_transaction(trans, root); 7483 if (drop_inode) { 7484 inode_dec_link_count(inode); 7485 iput(inode); 7486 } 7487 btrfs_btree_balance_dirty(root, nr); 7488 return err; 7489 } 7490 7491 static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 7492 u64 start, u64 num_bytes, u64 min_size, 7493 loff_t actual_len, u64 *alloc_hint, 7494 struct btrfs_trans_handle *trans) 7495 { 7496 struct btrfs_root *root = BTRFS_I(inode)->root; 7497 struct btrfs_key ins; 7498 u64 cur_offset = start; 7499 u64 i_size; 7500 int ret = 0; 7501 bool own_trans = true; 7502 7503 if (trans) 7504 own_trans = false; 7505 while (num_bytes > 0) { 7506 if (own_trans) { 7507 trans = btrfs_start_transaction(root, 3); 7508 if (IS_ERR(trans)) { 7509 ret = PTR_ERR(trans); 7510 break; 7511 } 7512 } 7513 7514 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, 7515 0, *alloc_hint, &ins, 1); 7516 if (ret) { 7517 if (own_trans) 7518 btrfs_end_transaction(trans, root); 7519 break; 7520 } 7521 7522 ret = insert_reserved_file_extent(trans, inode, 7523 cur_offset, ins.objectid, 7524 ins.offset, ins.offset, 7525 ins.offset, 0, 0, 0, 7526 BTRFS_FILE_EXTENT_PREALLOC); 7527 if (ret) { 7528 btrfs_abort_transaction(trans, root, ret); 7529 if (own_trans) 7530 btrfs_end_transaction(trans, root); 7531 break; 7532 } 7533 btrfs_drop_extent_cache(inode, cur_offset, 7534 cur_offset + ins.offset -1, 0); 7535 7536 num_bytes -= ins.offset; 7537 cur_offset += ins.offset; 7538 *alloc_hint = ins.objectid + ins.offset; 7539 7540 inode_inc_iversion(inode); 7541 inode->i_ctime = CURRENT_TIME; 7542 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 7543 if (!(mode & FALLOC_FL_KEEP_SIZE) && 7544 (actual_len > inode->i_size) && 7545 (cur_offset > inode->i_size)) { 7546 if (cur_offset > actual_len) 7547 i_size = actual_len; 7548 else 7549 i_size = cur_offset; 7550 i_size_write(inode, i_size); 7551 btrfs_ordered_update_i_size(inode, i_size, NULL); 7552 } 7553 7554 ret = btrfs_update_inode(trans, root, inode); 7555 7556 if (ret) { 7557 btrfs_abort_transaction(trans, root, ret); 7558 if (own_trans) 7559 btrfs_end_transaction(trans, root); 7560 break; 7561 } 7562 7563 if (own_trans) 7564 btrfs_end_transaction(trans, root); 7565 } 7566 return ret; 7567 } 7568 7569 int btrfs_prealloc_file_range(struct inode *inode, int mode, 7570 u64 start, u64 num_bytes, u64 min_size, 7571 loff_t actual_len, u64 *alloc_hint) 7572 { 7573 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 7574 min_size, actual_len, alloc_hint, 7575 NULL); 7576 } 7577 7578 int btrfs_prealloc_file_range_trans(struct inode *inode, 7579 struct btrfs_trans_handle *trans, int mode, 7580 u64 start, u64 num_bytes, u64 min_size, 7581 loff_t actual_len, u64 *alloc_hint) 7582 { 7583 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 7584 min_size, actual_len, alloc_hint, trans); 7585 } 7586 7587 static int btrfs_set_page_dirty(struct page *page) 7588 { 7589 return __set_page_dirty_nobuffers(page); 7590 } 7591 7592 static int btrfs_permission(struct inode *inode, int mask) 7593 { 7594 struct btrfs_root *root = BTRFS_I(inode)->root; 7595 umode_t mode = inode->i_mode; 7596 7597 if (mask & MAY_WRITE && 7598 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { 7599 if (btrfs_root_readonly(root)) 7600 return -EROFS; 7601 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) 7602 return -EACCES; 7603 } 7604 return generic_permission(inode, mask); 7605 } 7606 7607 static const struct inode_operations btrfs_dir_inode_operations = { 7608 .getattr = btrfs_getattr, 7609 .lookup = btrfs_lookup, 7610 .create = btrfs_create, 7611 .unlink = btrfs_unlink, 7612 .link = btrfs_link, 7613 .mkdir = btrfs_mkdir, 7614 .rmdir = btrfs_rmdir, 7615 .rename = btrfs_rename, 7616 .symlink = btrfs_symlink, 7617 .setattr = btrfs_setattr, 7618 .mknod = btrfs_mknod, 7619 .setxattr = btrfs_setxattr, 7620 .getxattr = btrfs_getxattr, 7621 .listxattr = btrfs_listxattr, 7622 .removexattr = btrfs_removexattr, 7623 .permission = btrfs_permission, 7624 .get_acl = btrfs_get_acl, 7625 }; 7626 static const struct inode_operations btrfs_dir_ro_inode_operations = { 7627 .lookup = btrfs_lookup, 7628 .permission = btrfs_permission, 7629 .get_acl = btrfs_get_acl, 7630 }; 7631 7632 static const struct file_operations btrfs_dir_file_operations = { 7633 .llseek = generic_file_llseek, 7634 .read = generic_read_dir, 7635 .readdir = btrfs_real_readdir, 7636 .unlocked_ioctl = btrfs_ioctl, 7637 #ifdef CONFIG_COMPAT 7638 .compat_ioctl = btrfs_ioctl, 7639 #endif 7640 .release = btrfs_release_file, 7641 .fsync = btrfs_sync_file, 7642 }; 7643 7644 static struct extent_io_ops btrfs_extent_io_ops = { 7645 .fill_delalloc = run_delalloc_range, 7646 .submit_bio_hook = btrfs_submit_bio_hook, 7647 .merge_bio_hook = btrfs_merge_bio_hook, 7648 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 7649 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 7650 .writepage_start_hook = btrfs_writepage_start_hook, 7651 .set_bit_hook = btrfs_set_bit_hook, 7652 .clear_bit_hook = btrfs_clear_bit_hook, 7653 .merge_extent_hook = btrfs_merge_extent_hook, 7654 .split_extent_hook = btrfs_split_extent_hook, 7655 }; 7656 7657 /* 7658 * btrfs doesn't support the bmap operation because swapfiles 7659 * use bmap to make a mapping of extents in the file. They assume 7660 * these extents won't change over the life of the file and they 7661 * use the bmap result to do IO directly to the drive. 7662 * 7663 * the btrfs bmap call would return logical addresses that aren't 7664 * suitable for IO and they also will change frequently as COW 7665 * operations happen. So, swapfile + btrfs == corruption. 7666 * 7667 * For now we're avoiding this by dropping bmap. 7668 */ 7669 static const struct address_space_operations btrfs_aops = { 7670 .readpage = btrfs_readpage, 7671 .writepage = btrfs_writepage, 7672 .writepages = btrfs_writepages, 7673 .readpages = btrfs_readpages, 7674 .direct_IO = btrfs_direct_IO, 7675 .invalidatepage = btrfs_invalidatepage, 7676 .releasepage = btrfs_releasepage, 7677 .set_page_dirty = btrfs_set_page_dirty, 7678 .error_remove_page = generic_error_remove_page, 7679 }; 7680 7681 static const struct address_space_operations btrfs_symlink_aops = { 7682 .readpage = btrfs_readpage, 7683 .writepage = btrfs_writepage, 7684 .invalidatepage = btrfs_invalidatepage, 7685 .releasepage = btrfs_releasepage, 7686 }; 7687 7688 static const struct inode_operations btrfs_file_inode_operations = { 7689 .getattr = btrfs_getattr, 7690 .setattr = btrfs_setattr, 7691 .setxattr = btrfs_setxattr, 7692 .getxattr = btrfs_getxattr, 7693 .listxattr = btrfs_listxattr, 7694 .removexattr = btrfs_removexattr, 7695 .permission = btrfs_permission, 7696 .fiemap = btrfs_fiemap, 7697 .get_acl = btrfs_get_acl, 7698 .update_time = btrfs_update_time, 7699 }; 7700 static const struct inode_operations btrfs_special_inode_operations = { 7701 .getattr = btrfs_getattr, 7702 .setattr = btrfs_setattr, 7703 .permission = btrfs_permission, 7704 .setxattr = btrfs_setxattr, 7705 .getxattr = btrfs_getxattr, 7706 .listxattr = btrfs_listxattr, 7707 .removexattr = btrfs_removexattr, 7708 .get_acl = btrfs_get_acl, 7709 .update_time = btrfs_update_time, 7710 }; 7711 static const struct inode_operations btrfs_symlink_inode_operations = { 7712 .readlink = generic_readlink, 7713 .follow_link = page_follow_link_light, 7714 .put_link = page_put_link, 7715 .getattr = btrfs_getattr, 7716 .setattr = btrfs_setattr, 7717 .permission = btrfs_permission, 7718 .setxattr = btrfs_setxattr, 7719 .getxattr = btrfs_getxattr, 7720 .listxattr = btrfs_listxattr, 7721 .removexattr = btrfs_removexattr, 7722 .get_acl = btrfs_get_acl, 7723 .update_time = btrfs_update_time, 7724 }; 7725 7726 const struct dentry_operations btrfs_dentry_operations = { 7727 .d_delete = btrfs_dentry_delete, 7728 .d_release = btrfs_dentry_release, 7729 }; 7730