/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "volumes.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "compression.h"
#include "locking.h"

struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;

#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};
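/*
 * Note on the table above: the file-type bits of an inode mode are
 * masked by S_IFMT (0170000), so shifting by S_SHIFT (12) leaves a small
 * array index.  For example, S_IFREG is 0100000, and 0100000 >> 12 == 8,
 * so slot 8 maps regular files to BTRFS_FT_REG_FILE for dir items.
 */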
static void btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);

static int btrfs_init_inode_security(struct inode *inode, struct inode *dir)
{
	int err;

	err = btrfs_init_acl(inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(inode, dir);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				struct page **compressed_pages)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int err = 0;
	int ret;
	size_t cur_size = size;
	size_t datasize;
	unsigned long offset;
	int use_compress = 0;

	if (compressed_size && compressed_pages) {
		use_compress = 1;
		cur_size = compressed_size;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;
	btrfs_set_trans_block_group(trans, inode);

	key.objectid = inode->i_ino;
	key.offset = start;
	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
	datasize = btrfs_file_extent_calc_inline_size(cur_size);

	inode_add_bytes(inode, size);
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (ret) {
		err = ret;
		goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (use_compress) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
					 PAGE_CACHE_SIZE);

			kaddr = kmap_atomic(cpage, KM_USER0);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr, KM_USER0);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  BTRFS_COMPRESS_ZLIB);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_CACHE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page, KM_USER0);
		offset = start & (PAGE_CACHE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr, KM_USER0);
		page_cache_release(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	BTRFS_I(inode)->disk_i_size = inode->i_size;
	btrfs_update_inode(trans, root, inode);
	return 0;
fail:
	btrfs_free_path(path);
	return err;
}
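/*
 * Illustrative example of the inline rules enforced below: with a 4k
 * sectorsize and a small file, a write of bytes [0, 3000) may be stored
 * inline because start == 0, the data ends inside the first page, and
 * 3000 bytes is under both BTRFS_MAX_INLINE_DATA_SIZE() and the
 * max_inline mount limit.  A write at offset 8192 can never be inlined,
 * no matter how small, because start > 0.
 */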
/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct inode *inode, u64 start, u64 end,
				 size_t compressed_size,
				 struct page **compressed_pages)
{
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = (end + root->sectorsize - 1) &
			~((u64)root->sectorsize - 1);
	u64 hint_byte;
	u64 data_len = inline_len;
	int ret;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end >= PAGE_CACHE_SIZE ||
	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
	    (!compressed_size &&
	    (actual_end & (root->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > root->fs_info->max_inline) {
		return 1;
	}

	ret = btrfs_drop_extents(trans, root, inode, start,
				 aligned_end, aligned_end, start,
				 &hint_byte, 1);
	BUG_ON(ret);

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, root, inode, start,
				   inline_len, compressed_size,
				   compressed_pages);
	BUG_ON(ret);
	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
	return 0;
}

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	struct list_head list;
};

struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}
/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that pdflush sent them down.
 */
static noinline int compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 num_bytes;
	u64 orig_start;
	u64 disk_num_bytes;
	u64 blocksize = root->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long nr_pages_ret = 0;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned long max_compressed = 128 * 1024;
	unsigned long max_uncompressed = 128 * 1024;
	int i;
	int will_compress;

	orig_start = start;

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/* we want to make sure that amount of ram required to uncompress
	 * an extent is reasonable, so we limit the total size in ram
	 * of a compressed extent to 128k.  This is a crucial number
	 * because it also controls how easily we can spread reads across
	 * cpus for decompression.
	 *
	 * We also want to make sure the amount of IO required to do
	 * a random read is reasonably small, so we limit the size of
	 * a compressed extent to 128k.
	 */
	total_compressed = min(total_compressed, max_uncompressed);
	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
	num_bytes = max(blocksize, num_bytes);
	disk_num_bytes = num_bytes;
	total_in = 0;
	ret = 0;
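	/*
	 * Illustrative note: because of the 128k caps above, a 1MB dirty
	 * range is compressed as eight independent 128k units; the
	 * "goto again" loop at the bottom of this function advances
	 * through the range one unit at a time.
	 */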
	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
	    btrfs_test_opt(root, COMPRESS)) {
		WARN_ON(pages);
		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);

		ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
						total_compressed, pages,
						nr_pages, &nr_pages_ret,
						&total_in,
						&total_compressed,
						max_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_CACHE_SIZE - 1);
			struct page *page = pages[nr_pages_ret - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page, KM_USER0);
				memset(kaddr + offset, 0,
				       PAGE_CACHE_SIZE - offset);
				kunmap_atomic(kaddr, KM_USER0);
			}
			will_compress = 1;
		}
	}
	if (start == 0) {
		trans = btrfs_join_transaction(root, 1);
		BUG_ON(!trans);
		btrfs_set_trans_block_group(trans, inode);

		/* lets try to make an inline extent */
		if (ret || total_in < (actual_end - start)) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(trans, root, inode,
						    start, end, 0, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(trans, root, inode,
						    start, end,
						    total_compressed, pages);
		}
		btrfs_end_transaction(trans, root);
		if (ret == 0) {
			/*
			 * inline extent creation worked, we don't need
			 * to create any more async work items.  Unlock
			 * and free up our temp pages.
			 */
			extent_clear_unlock_delalloc(inode,
						     &BTRFS_I(inode)->io_tree,
						     start, end, NULL, 1, 0,
						     0, 1, 1, 1, 0);
			ret = 0;
			goto free_pages_out;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so the allocator does
		 * sane things
		 */
		total_compressed = (total_compressed + blocksize - 1) &
			~(blocksize - 1);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk
		 */
		total_in = (total_in + PAGE_CACHE_SIZE - 1) &
			~(PAGE_CACHE_SIZE - 1);
		if (total_compressed >= total_in) {
			will_compress = 0;
		} else {
			disk_num_bytes = total_compressed;
			num_bytes = total_in;
		}
	}
	if (!will_compress && pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages_ret; i++) {
			WARN_ON(pages[i]->mapping);
			page_cache_release(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages_ret = 0;

		/* flag the file so we don't compress in the future */
		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
	}
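	/*
	 * Illustrative example of the win check above: with 4k blocks,
	 * 64k of input that only shrinks to 62000 bytes still rounds up
	 * to 64k worth of blocks on disk, so it is not counted as a win
	 * and gets written uncompressed instead.
	 */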
	if (will_compress) {
		*num_added += 1;

		/* the async work queues will take care of doing actual
		 * allocation on disk for these compressed pages,
		 * and will submit them to the elevator.
		 */
		add_async_extent(async_cow, start, num_bytes,
				 total_compressed, pages, nr_pages_ret);

		if (start + num_bytes < end && start + num_bytes < actual_end) {
			start += num_bytes;
			pages = NULL;
			cond_resched();
			goto again;
		}
	} else {
cleanup_and_bail_uncompressed:
		/*
		 * No compression, but we still need to write the pages in
		 * the file we've been given so far.  redirty the locked
		 * page if it corresponds to our extent and set things up
		 * for the async work queue to run cow_file_range to do
		 * the normal delalloc dance
		 */
		if (page_offset(locked_page) >= start &&
		    page_offset(locked_page) <= end) {
			__set_page_dirty_nobuffers(locked_page);
			/* unlocked later on in the async handlers */
		}
		add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
		*num_added += 1;
	}

out:
	return 0;

free_pages_out:
	for (i = 0; i < nr_pages_ret; i++) {
		WARN_ON(pages[i]->mapping);
		page_cache_release(pages[i]);
	}
	kfree(pages);

	goto out;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline int submit_compressed_extents(struct inode *inode,
					      struct async_cow *async_cow)
{
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_trans_handle *trans;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree;
	int ret;

	if (list_empty(&async_cow->extents))
		return 0;

	trans = btrfs_join_transaction(root, 1);

	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
				    async_extent->start +
				    async_extent->ram_size - 1, GFP_NOFS);

			/* allocate blocks */
			cow_file_range(inode, async_cow->locked_page,
				       async_extent->start,
				       async_extent->start +
				       async_extent->ram_size - 1,
				       &page_started, &nr_written, 0);

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started)
				extent_write_locked_range(io_tree,
						  inode, async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  btrfs_get_extent,
						  WB_SYNC_ALL);
			kfree(async_extent);
			cond_resched();
			continue;
		}
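		/*
		 * From here on the extent really was compressed: reserve
		 * contiguous space for the compressed bytes, insert a
		 * pinned extent map and an ordered extent, then hand the
		 * compressed pages to the bio layer.
		 */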
		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1,
			    GFP_NOFS);
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		btrfs_drop_extent_cache(inode, async_extent->start,
					async_extent->start +
					async_extent->ram_size - 1, 0);

		ret = btrfs_reserve_extent(trans, root,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint,
					   (u64)-1, &ins, 1);
		BUG_ON(ret);
		em = alloc_extent_map(GFP_NOFS);
		em->start = async_extent->start;
		em->len = async_extent->ram_size;
		em->orig_start = em->start;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
		}

		ret = btrfs_add_ordered_extent(inode, async_extent->start,
					       ins.objectid,
					       async_extent->ram_size,
					       ins.offset,
					       BTRFS_ORDERED_COMPRESSED);
		BUG_ON(ret);

		btrfs_end_transaction(trans, root);

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode,
					     &BTRFS_I(inode)->io_tree,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     NULL, 1, 1, 0, 1, 1, 0, 0);

		ret = btrfs_submit_compressed_write(inode,
					    async_extent->start,
					    async_extent->ram_size,
					    ins.objectid,
					    ins.offset, async_extent->pages,
					    async_extent->nr_pages);

		BUG_ON(ret);
		trans = btrfs_join_transaction(root, 1);
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}

	btrfs_end_transaction(trans, root);
	return 0;
}
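/*
 * Note: the loop above ends the joined transaction before
 * btrfs_submit_compressed_write() and rejoins afterwards, which appears
 * intended to avoid holding a transaction open across the actual IO
 * submission.
 */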
/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written,
				   int unlock)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 disk_num_bytes;
	u64 cur_alloc_size;
	u64 blocksize = root->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	struct btrfs_key ins;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	int ret = 0;

	trans = btrfs_join_transaction(root, 1);
	BUG_ON(!trans);
	btrfs_set_trans_block_group(trans, inode);

	actual_end = min_t(u64, isize, end + 1);

	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
	num_bytes = max(blocksize, num_bytes);
	disk_num_bytes = num_bytes;
	ret = 0;

	if (start == 0) {
		/* lets try to make an inline extent */
		ret = cow_file_range_inline(trans, root, inode,
					    start, end, 0, NULL);
		if (ret == 0) {
			extent_clear_unlock_delalloc(inode,
						     &BTRFS_I(inode)->io_tree,
						     start, end, NULL, 1, 1,
						     1, 1, 1, 1, 0);
			*nr_written = *nr_written +
			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
			*page_started = 1;
			ret = 0;
			goto out;
		}
	}

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(&root->fs_info->super_copy));


	read_lock(&BTRFS_I(inode)->extent_tree.lock);
	em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
				   start, num_bytes);
	if (em) {
		alloc_hint = em->block_start;
		free_extent_map(em);
	}
	read_unlock(&BTRFS_I(inode)->extent_tree.lock);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

	while (disk_num_bytes > 0) {
		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
					   root->sectorsize, 0, alloc_hint,
					   (u64)-1, &ins, 1);
		BUG_ON(ret);

		em = alloc_extent_map(GFP_NOFS);
		em->start = start;
		em->orig_start = em->start;
		ram_size = ins.offset;
		em->len = ins.offset;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}

		cur_alloc_size = ins.offset;
		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		BUG_ON(ret);

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			BUG_ON(ret);
		}

		if (disk_num_bytes < cur_alloc_size)
			break;

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
		 */
		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
					     start, start + ram_size - 1,
					     locked_page, unlock, 1,
					     1, 0, 0, 0, 1);
		disk_num_bytes -= cur_alloc_size;
		num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
	}
out:
	ret = 0;
	btrfs_end_transaction(trans, root);

	return ret;
}
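/*
 * The three async_cow callbacks below run on the delalloc worker
 * threads: async_cow_start does the compression, async_cow_submit runs
 * in queue order and submits the IO, and async_cow_free releases the
 * async_cow struct once both have finished.
 */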
/*
 * work queue call back to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	int num_added = 0;
	async_cow = container_of(work, struct async_cow, work);

	compress_file_range(async_cow->inode, async_cow->locked_page,
			    async_cow->start, async_cow->end, async_cow,
			    &num_added);
	if (num_added == 0)
		async_cow->inode = NULL;
}

/*
 * work queue call back to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
		PAGE_CACHE_SHIFT;

	atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);

	if (atomic_read(&root->fs_info->async_delalloc_pages) <
	    5 * 1024 * 1024 &&
	    waitqueue_active(&root->fs_info->async_submit_wait))
		wake_up(&root->fs_info->async_submit_wait);

	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	async_cow = container_of(work, struct async_cow, work);
	kfree(async_cow);
}

static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;
	int limit = 10 * 1024 * 1024;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
			 EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		async_cow->inode = inode;
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;

		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
			cur_end = end;
		else
			cur_end = min(end, start + 512 * 1024 - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		async_cow->work.func = async_cow_start;
		async_cow->work.ordered_func = async_cow_submit;
		async_cow->work.ordered_free = async_cow_free;
		async_cow->work.flags = 0;

		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
			PAGE_CACHE_SHIFT;
		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

		btrfs_queue_worker(&root->fs_info->delalloc_workers,
				   &async_cow->work);

		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
			wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->async_delalloc_pages) <
			    limit));
		}

		while (atomic_read(&root->fs_info->async_submit_draining) &&
		      atomic_read(&root->fs_info->async_delalloc_pages)) {
			wait_event(root->fs_info->async_submit_wait,
			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
			   0));
		}

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}
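/*
 * Illustrative example of the chunking above: a 2MB delalloc range on a
 * compressible inode is queued as four 512k async_cow units.  Each unit
 * adds its page count to async_delalloc_pages, and submitters throttle
 * once roughly 10MB worth of pages are in flight.
 */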
static noinline int csum_exist_in_range(struct btrfs_root *root,
					u64 bytenr, u64 num_bytes)
{
	int ret;
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	return 1;
}

/*
 * called from the nocow writeback path.  This checks for snapshots or
 * COW copies of the extents that exist in the file, and COWs the file
 * as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct inode *inode,
				       struct page *locked_page,
			      u64 start, u64 end, int *page_started, int force,
			      unsigned long *nr_written)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key found_key;
	u64 cow_start;
	u64 cur_offset;
	u64 extent_end;
	u64 extent_offset;
	u64 disk_bytenr;
	u64 num_bytes;
	int extent_type;
	int ret;
	int type;
	int nocow;
	int check_prev = 1;

	path = btrfs_alloc_path();
	BUG_ON(!path);
	trans = btrfs_join_transaction(root, 1);
	BUG_ON(!trans);

	cow_start = (u64)-1;
	cur_offset = start;
	while (1) {
		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
					       cur_offset, 0);
		BUG_ON(ret < 0);
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == inode->i_ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				BUG_ON(1);
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		nocow = 0;
		disk_bytenr = 0;
		num_bytes = 0;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		if (found_key.objectid > inode->i_ino ||
		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			goto out_check;
		}

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
			if (extent_end <= start) {
				path->slots[0]++;
				goto next_slot;
			}
			if (disk_bytenr == 0)
				goto out_check;
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;
			if (btrfs_extent_readonly(root, disk_bytenr))
				goto out_check;
			if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
						  found_key.offset -
						  extent_offset, disk_bytenr))
				goto out_check;
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * force cow if csum exists in the range.
			 * this ensures that csums for a given extent are
			 * either valid or do not exist.
			 */
			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
				goto out_check;
			nocow = 1;
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = found_key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
			extent_end = ALIGN(extent_end, root->sectorsize);
		} else {
			BUG_ON(1);
		}
out_check:
		if (extent_end <= start) {
			path->slots[0]++;
			goto next_slot;
		}
		if (!nocow) {
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			path->slots[0]++;
			goto next_slot;
		}

		btrfs_release_path(root, path);
		if (cow_start != (u64)-1) {
			ret = cow_file_range(inode, locked_page, cow_start,
					found_key.offset - 1, page_started,
					nr_written, 1);
			BUG_ON(ret);
			cow_start = (u64)-1;
		}

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct extent_map *em;
			struct extent_map_tree *em_tree;
			em_tree = &BTRFS_I(inode)->extent_tree;
			em = alloc_extent_map(GFP_NOFS);
			em->start = cur_offset;
			em->orig_start = em->start;
			em->len = num_bytes;
			em->block_len = num_bytes;
			em->block_start = disk_bytenr;
			em->bdev = root->fs_info->fs_devices->latest_bdev;
			set_bit(EXTENT_FLAG_PINNED, &em->flags);
			while (1) {
				write_lock(&em_tree->lock);
				ret = add_extent_mapping(em_tree, em);
				write_unlock(&em_tree->lock);
				if (ret != -EEXIST) {
					free_extent_map(em);
					break;
				}
				btrfs_drop_extent_cache(inode, em->start,
						em->start + em->len - 1, 0);
			}
			type = BTRFS_ORDERED_PREALLOC;
		} else {
			type = BTRFS_ORDERED_NOCOW;
		}

		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
					       num_bytes, num_bytes, type);
		BUG_ON(ret);

		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
					cur_offset, cur_offset + num_bytes - 1,
					locked_page, 1, 1, 1, 0, 0, 0, 1);
		cur_offset = extent_end;
		if (cur_offset > end)
			break;
	}
	btrfs_release_path(root, path);

	if (cur_offset <= end && cow_start == (u64)-1)
		cow_start = cur_offset;
	if (cow_start != (u64)-1) {
		ret = cow_file_range(inode, locked_page, cow_start, end,
				     page_started, nr_written, 1);
		BUG_ON(ret);
	}

	ret = btrfs_end_transaction(trans, root);
	BUG_ON(ret);
	btrfs_free_path(path);
	return 0;
}
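/*
 * Summary of the dispatch below: NODATACOW inodes force the nocow path,
 * PREALLOC inodes try nocow but only reuse preallocated extents, mounts
 * without -o compress take the plain cow path, and everything else goes
 * through the async compression path.
 */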
/*
 * extent_io.c call back to do delayed allocation processing
 */
static int run_delalloc_range(struct inode *inode, struct page *locked_page,
			      u64 start, u64 end, int *page_started,
			      unsigned long *nr_written)
{
	int ret;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 1, nr_written);
	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
	else if (!btrfs_test_opt(root, COMPRESS))
		ret = cow_file_range(inode, locked_page, start, end,
				     page_started, nr_written, 1);
	else
		ret = cow_file_range_async(inode, locked_page, start, end,
					   page_started, nr_written);
	return ret;
}

/*
 * extent_io.c set_bit_hook, used to track delayed allocation
 * bytes in this file, and to maintain the list of inodes that
 * have pending delalloc work to be done.
 */
static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
			      unsigned long old, unsigned long bits)
{
	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
		spin_lock(&root->fs_info->delalloc_lock);
		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
		root->fs_info->delalloc_bytes += end - start + 1;
		if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
				      &root->fs_info->delalloc_inodes);
		}
		spin_unlock(&root->fs_info->delalloc_lock);
	}
	return 0;
}

/*
 * extent_io.c clear_bit_hook, see set_bit_hook for why
 */
static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
				unsigned long old, unsigned long bits)
{
	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;

		spin_lock(&root->fs_info->delalloc_lock);
		if (end - start + 1 > root->fs_info->delalloc_bytes) {
			printk(KERN_INFO "btrfs warning: delalloc account "
			       "%llu %llu\n",
			       (unsigned long long)end - start + 1,
			       (unsigned long long)
			       root->fs_info->delalloc_bytes);
			btrfs_delalloc_free_space(root, inode, (u64)-1);
			root->fs_info->delalloc_bytes = 0;
			BTRFS_I(inode)->delalloc_bytes = 0;
		} else {
			btrfs_delalloc_free_space(root, inode,
						  end - start + 1);
			root->fs_info->delalloc_bytes -= end - start + 1;
			BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
		}
		if (BTRFS_I(inode)->delalloc_bytes == 0 &&
		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
		}
		spin_unlock(&root->fs_info->delalloc_lock);
	}
	return 0;
}

/*
 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
 * we don't create bios that span stripes or chunks
 */
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
			 size_t size, struct bio *bio,
			 unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	struct btrfs_mapping_tree *map_tree;
	u64 logical = (u64)bio->bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;

	if (bio_flags & EXTENT_BIO_COMPRESSED)
		return 0;

	length = bio->bi_size;
	map_tree = &root->fs_info->mapping_tree;
	map_length = length;
	ret = btrfs_map_block(map_tree, READ, logical,
			      &map_length, NULL, 0);

	if (map_length < length + size)
		return 1;
	return 0;
}
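/*
 * Illustrative note: btrfs_map_block() above returns in map_length the
 * number of bytes that can be handled contiguously starting at logical;
 * if appending size more bytes would make the bio cross a stripe or
 * chunk boundary, the hook returns 1 and the caller starts a new bio.
 */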
/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.  All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;

	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
	BUG_ON(ret);
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.  All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
				   int mirror_num, unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
}

/*
 * extent_io.c submission hook.  This does the right thing for csum calculation
 * on write, or reading the csums from the tree before a read
 */
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
				 int mirror_num, unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	int skip_sum;

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
	BUG_ON(ret);

	if (!(rw & (1 << BIO_RW))) {
		if (bio_flags & EXTENT_BIO_COMPRESSED) {
			return btrfs_submit_compressed_read(inode, bio,
						    mirror_num, bio_flags);
		} else if (!skip_sum)
			btrfs_lookup_bio_sums(root, inode, bio, NULL);
		goto mapit;
	} else if (!skip_sum) {
		/* csum items have already been cloned */
		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
			goto mapit;
		/* we're doing a write, do the async checksumming */
		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
				   inode, rw, bio, mirror_num,
				   bio_flags, __btrfs_submit_bio_start,
				   __btrfs_submit_bio_done);
	}

mapit:
	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
}
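/*
 * Csum paths through the hook above: compressed reads verify sums
 * inside the compressed-read code, ordinary reads look the sums up
 * before mapping the bio, and writes (outside the relocation tree,
 * which has them cloned already) defer checksumming to the async
 * __btrfs_submit_bio_start/done pair.
 */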
/*
 * given a list of ordered sums record them in the inode.  This happens
 * at IO completion time based on sums calculated at bio submission time.
 */
static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
				      struct inode *inode, u64 file_offset,
				      struct list_head *list)
{
	struct btrfs_ordered_sum *sum;

	btrfs_set_trans_block_group(trans, inode);

	list_for_each_entry(sum, list, list) {
		btrfs_csum_file_blocks(trans,
		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
	}
	return 0;
}

int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
{
	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
		WARN_ON(1);
	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
				   GFP_NOFS);
}

/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
	struct page *page;
	struct btrfs_work work;
};

static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_ordered_extent *ordered;
	struct page *page;
	struct inode *inode;
	u64 page_start;
	u64 page_end;

	fixup = container_of(work, struct btrfs_writepage_fixup, work);
	page = fixup->page;
again:
	lock_page(page);
	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
		ClearPageChecked(page);
		goto out_page;
	}

	inode = page->mapping->host;
	page_start = page_offset(page);
	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;

	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);

	/* already ordered? We're done */
	if (PagePrivate2(page))
		goto out;

	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered) {
		unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
			      page_end, GFP_NOFS);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		goto again;
	}

	btrfs_set_extent_delalloc(inode, page_start, page_end);
	ClearPageChecked(page);
out:
	unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
out_page:
	unlock_page(page);
	page_cache_release(page);
}
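/*
 * Note: the fixup worker above drops the page reference taken by
 * btrfs_writepage_start_hook() below, and the PageChecked bit marks
 * pages that already have a fixup in flight so only one is queued.
 */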
/*
 * There are a few paths in the higher layers of the kernel that directly
 * set the page dirty bit without asking the filesystem if it is a
 * good idea.  This causes problems because we want to make sure COW
 * properly happens and the data=ordered rules are followed.
 *
 * In our case any range that doesn't have the ORDERED bit set
 * hasn't been properly setup for IO.  We kick off an async process
 * to fix it up.  The async helper will wait for ordered extents, set
 * the delalloc bit and make it safe to write the page.
 */
static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* this page is properly in the ordered list */
	if (TestClearPagePrivate2(page))
		return 0;

	if (PageChecked(page))
		return -EAGAIN;

	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
	if (!fixup)
		return -EAGAIN;

	SetPageChecked(page);
	page_cache_get(page);
	fixup->work.func = btrfs_writepage_fixup_worker;
	fixup->page = page;
	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
	return -EAGAIN;
}

static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
				       struct inode *inode, u64 file_pos,
				       u64 disk_bytenr, u64 disk_num_bytes,
				       u64 num_bytes, u64 ram_bytes,
				       u64 locked_end,
				       u8 compression, u8 encryption,
				       u16 other_encoding, int extent_type)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key ins;
	u64 hint;
	int ret;

	path = btrfs_alloc_path();
	BUG_ON(!path);

	path->leave_spinning = 1;

	/*
	 * we may be replacing one extent in the tree with another.
	 * The new extent is pinned in the extent map, and we don't want
	 * to drop it from the cache until it is completely in the btree.
	 *
	 * So, tell btrfs_drop_extents to leave this extent in the cache.
	 * the caller is expected to unpin it and allow it to be merged
	 * with the others.
	 */
	ret = btrfs_drop_extents(trans, root, inode, file_pos,
				 file_pos + num_bytes, locked_end,
				 file_pos, &hint, 0);
	BUG_ON(ret);

	ins.objectid = inode->i_ino;
	ins.offset = file_pos;
	ins.type = BTRFS_EXTENT_DATA_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
	BUG_ON(ret);
	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_set_file_extent_type(leaf, fi, extent_type);
	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
	btrfs_set_file_extent_offset(leaf, fi, 0);
	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
	btrfs_set_file_extent_compression(leaf, fi, compression);
	btrfs_set_file_extent_encryption(leaf, fi, encryption);
	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);

	btrfs_unlock_up_safe(path, 1);
	btrfs_set_lock_blocking(leaf);

	btrfs_mark_buffer_dirty(leaf);

	inode_add_bytes(inode, num_bytes);

	ins.objectid = disk_bytenr;
	ins.offset = disk_num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_alloc_reserved_file_extent(trans, root,
					       root->root_key.objectid,
					       inode->i_ino, file_pos, &ins);
	BUG_ON(ret);
	btrfs_free_path(path);

	return 0;
}
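/*
 * Field cheat sheet for the file extent item written above: disk_bytenr
 * and disk_num_bytes describe the allocated byte range on disk,
 * num_bytes is the number of file bytes the item covers, and ram_bytes
 * is the uncompressed size; for a compressed extent disk_num_bytes is
 * smaller than ram_bytes.
 */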
/*
 * helper function for btrfs_finish_ordered_io, this
 * just reads in some of the csum leaves to prime them into ram
 * before we start the transaction.  It limits the amount of btree
 * reads required while inside the transaction.
 */
static noinline void reada_csum(struct btrfs_root *root,
				struct btrfs_path *path,
				struct btrfs_ordered_extent *ordered_extent)
{
	struct btrfs_ordered_sum *sum;
	u64 bytenr;

	sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
			 list);
	bytenr = sum->sums[0].bytenr;

	/*
	 * we don't care about the results, the point of this search is
	 * just to get the btree leaves into ram
	 */
	btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
}

/* as ordered data IO finishes, this gets called so we can finish
 * an ordered extent if the range of bytes in the file it covers are
 * fully written.
 */
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct btrfs_ordered_extent *ordered_extent = NULL;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_path *path;
	int compressed = 0;
	int ret;

	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
	if (!ret)
		return 0;

	/*
	 * before we join the transaction, try to do some of our IO.
	 * This will limit the amount of IO that we have to do with
	 * the transaction running.  We're unlikely to need to do any
	 * IO if the file extents are new, the disk_i_size checks
	 * covers the most common case.
	 */
	if (start < BTRFS_I(inode)->disk_i_size) {
		path = btrfs_alloc_path();
		if (path) {
			ret = btrfs_lookup_file_extent(NULL, root, path,
						       inode->i_ino,
						       start, 0);
			ordered_extent = btrfs_lookup_ordered_extent(inode,
								     start);
			if (!list_empty(&ordered_extent->list)) {
				btrfs_release_path(root, path);
				reada_csum(root, path, ordered_extent);
			}
			btrfs_free_path(path);
		}
	}

	trans = btrfs_join_transaction(root, 1);

	if (!ordered_extent)
		ordered_extent = btrfs_lookup_ordered_extent(inode, start);
	BUG_ON(!ordered_extent);
	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
		goto nocow;

	lock_extent(io_tree, ordered_extent->file_offset,
		    ordered_extent->file_offset + ordered_extent->len - 1,
		    GFP_NOFS);

	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
		compressed = 1;
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
		BUG_ON(compressed);
		ret = btrfs_mark_extent_written(trans, root, inode,
						ordered_extent->file_offset,
						ordered_extent->file_offset +
						ordered_extent->len);
		BUG_ON(ret);
	} else {
		ret = insert_reserved_file_extent(trans, inode,
						ordered_extent->file_offset,
						ordered_extent->start,
						ordered_extent->disk_len,
						ordered_extent->len,
						ordered_extent->len,
						ordered_extent->file_offset +
						ordered_extent->len,
						compressed, 0, 0,
						BTRFS_FILE_EXTENT_REG);
		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
				   ordered_extent->file_offset,
				   ordered_extent->len);
		BUG_ON(ret);
	}
	unlock_extent(io_tree, ordered_extent->file_offset,
		    ordered_extent->file_offset + ordered_extent->len - 1,
		    GFP_NOFS);
nocow:
	add_pending_csums(trans, inode, ordered_extent->file_offset,
			  &ordered_extent->list);

	mutex_lock(&BTRFS_I(inode)->extent_mutex);
	btrfs_ordered_update_i_size(inode, ordered_extent);
	btrfs_update_inode(trans, root, inode);
	btrfs_remove_ordered_extent(inode, ordered_extent);
	mutex_unlock(&BTRFS_I(inode)->extent_mutex);

	/* once for us */
	btrfs_put_ordered_extent(ordered_extent);
	/* once for the tree */
	btrfs_put_ordered_extent(ordered_extent);

	btrfs_end_transaction(trans, root);
	return 0;
}

static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
				struct extent_state *state, int uptodate)
{
	ClearPagePrivate2(page);
	return btrfs_finish_ordered_io(page->mapping->host, start, end);
}

/*
 * When IO fails, either with EIO or csum verification fails, we
 * try other mirrors that might have a good copy of the data.  This
 * io_failure_record is used to record state as we go through all the
 * mirrors.  If another mirror has good data, the page is set up to date
 * and things continue.  If a good mirror can't be found, the original
 * bio end_io callback is called to indicate things have failed.
 */
struct io_failure_record {
	struct page *page;
	u64 start;
	u64 len;
	u64 logical;
	unsigned long bio_flags;
	int last_mirror;
};
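/*
 * Illustrative example: on a profile with two copies (e.g. RAID1),
 * last_mirror in the hook below walks mirror 1 and then mirror 2; once
 * it exceeds btrfs_num_copies() the record is dropped and the read
 * fails with -EIO.
 */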
static int btrfs_io_failed_hook(struct bio *failed_bio,
			 struct page *page, u64 start, u64 end,
			 struct extent_state *state)
{
	struct io_failure_record *failrec = NULL;
	u64 private;
	struct extent_map *em;
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct bio *bio;
	int num_copies;
	int ret;
	int rw;
	u64 logical;

	ret = get_state_private(failure_tree, start, &private);
	if (ret) {
		failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
		if (!failrec)
			return -ENOMEM;
		failrec->start = start;
		failrec->len = end - start + 1;
		failrec->last_mirror = 0;
		failrec->bio_flags = 0;

		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, failrec->len);
		if (em->start > start || em->start + em->len < start) {
			free_extent_map(em);
			em = NULL;
		}
		read_unlock(&em_tree->lock);

		if (!em || IS_ERR(em)) {
			kfree(failrec);
			return -EIO;
		}
		logical = start - em->start;
		logical = em->block_start + logical;
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			logical = em->block_start;
			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
		}
		failrec->logical = logical;
		free_extent_map(em);
		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
				EXTENT_DIRTY, GFP_NOFS);
		set_state_private(failure_tree, start,
				  (u64)(unsigned long)failrec);
	} else {
		failrec = (struct io_failure_record *)(unsigned long)private;
	}
	num_copies = btrfs_num_copies(
			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
			      failrec->logical, failrec->len);
	failrec->last_mirror++;
	if (!state) {
		spin_lock(&BTRFS_I(inode)->io_tree.lock);
		state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
						    failrec->start,
						    EXTENT_LOCKED);
		if (state && state->start != failrec->start)
			state = NULL;
		spin_unlock(&BTRFS_I(inode)->io_tree.lock);
	}
	if (!state || failrec->last_mirror > num_copies) {
		set_state_private(failure_tree, failrec->start, 0);
		clear_extent_bits(failure_tree, failrec->start,
				  failrec->start + failrec->len - 1,
				  EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
		kfree(failrec);
		return -EIO;
	}
	bio = bio_alloc(GFP_NOFS, 1);
	bio->bi_private = state;
	bio->bi_end_io = failed_bio->bi_end_io;
	bio->bi_sector = failrec->logical >> 9;
	bio->bi_bdev = failed_bio->bi_bdev;
	bio->bi_size = 0;

	bio_add_page(bio, page, failrec->len, start - page_offset(page));
	if (failed_bio->bi_rw & (1 << BIO_RW))
		rw = WRITE;
	else
		rw = READ;

	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
						      failrec->last_mirror,
						      failrec->bio_flags);
	return 0;
}

/*
 * each time an IO finishes, we do a fast check in the IO failure tree
 * to see if we need to process or clean up an io_failure_record
 */
static int btrfs_clean_io_failures(struct inode *inode, u64 start)
{
	u64 private;
	u64 private_failure;
	struct io_failure_record *failure;
	int ret;

	private = 0;
	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
			     (u64)-1, 1, EXTENT_DIRTY)) {
		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
					start, &private_failure);
		if (ret == 0) {
			failure = (struct io_failure_record *)(unsigned long)
				   private_failure;
			set_state_private(&BTRFS_I(inode)->io_failure_tree,
					  failure->start, 0);
			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
					  failure->start,
					  failure->start + failure->len - 1,
					  EXTENT_DIRTY | EXTENT_LOCKED,
					  GFP_NOFS);
			kfree(failure);
		}
	}
	return 0;
}
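/*
 * Note on the verification below: btrfs_csum_data() runs the data
 * checksum (crc32c in this era of the code) over the page contents and
 * btrfs_csum_final() folds it into the 32-bit value kept in the csum
 * tree; on a mismatch the page is poisoned and -EIO sends the bio
 * through the io_failure_record retry machinery above.
 */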
long)start, csum, 1852 (unsigned long long)private); 1853 } 1854 memset(kaddr + offset, 1, end - start + 1); 1855 flush_dcache_page(page); 1856 kunmap_atomic(kaddr, KM_USER0); 1857 if (private == 0) 1858 return 0; 1859 return -EIO; 1860 } 1861 1862 /* 1863 * This creates an orphan entry for the given inode in case something goes 1864 * wrong in the middle of an unlink/truncate. 1865 */ 1866 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 1867 { 1868 struct btrfs_root *root = BTRFS_I(inode)->root; 1869 int ret = 0; 1870 1871 spin_lock(&root->list_lock); 1872 1873 /* already on the orphan list, we're good */ 1874 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 1875 spin_unlock(&root->list_lock); 1876 return 0; 1877 } 1878 1879 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 1880 1881 spin_unlock(&root->list_lock); 1882 1883 /* 1884 * insert an orphan item to track this unlinked/truncated file 1885 */ 1886 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); 1887 1888 return ret; 1889 } 1890 1891 /* 1892 * We have done the truncate/delete so we can go ahead and remove the orphan 1893 * item for this particular inode. 1894 */ 1895 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 1896 { 1897 struct btrfs_root *root = BTRFS_I(inode)->root; 1898 int ret = 0; 1899 1900 spin_lock(&root->list_lock); 1901 1902 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 1903 spin_unlock(&root->list_lock); 1904 return 0; 1905 } 1906 1907 list_del_init(&BTRFS_I(inode)->i_orphan); 1908 if (!trans) { 1909 spin_unlock(&root->list_lock); 1910 return 0; 1911 } 1912 1913 spin_unlock(&root->list_lock); 1914 1915 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 1916 1917 return ret; 1918 } 1919 1920 /* 1921 * this cleans up any orphans that may be left on the list from the last use 1922 * of this root. 1923 */ 1924 void btrfs_orphan_cleanup(struct btrfs_root *root) 1925 { 1926 struct btrfs_path *path; 1927 struct extent_buffer *leaf; 1928 struct btrfs_item *item; 1929 struct btrfs_key key, found_key; 1930 struct btrfs_trans_handle *trans; 1931 struct inode *inode; 1932 int ret = 0, nr_unlink = 0, nr_truncate = 0; 1933 1934 path = btrfs_alloc_path(); 1935 if (!path) 1936 return; 1937 path->reada = -1; 1938 1939 key.objectid = BTRFS_ORPHAN_OBJECTID; 1940 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1941 key.offset = (u64)-1; 1942 1943 1944 while (1) { 1945 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1946 if (ret < 0) { 1947 printk(KERN_ERR "Error searching slot for orphan: %d" 1948 "\n", ret); 1949 break; 1950 } 1951 1952 /* 1953 * if ret == 0, it means we found what we were searching for, which 1954 * is weird, but possible, so only screw with path if we didn't 1955 * find the key and see if we have stuff that matches 1956 */ 1957 if (ret > 0) { 1958 if (path->slots[0] == 0) 1959 break; 1960 path->slots[0]--; 1961 } 1962 1963 /* pull out the item */ 1964 leaf = path->nodes[0]; 1965 item = btrfs_item_nr(leaf, path->slots[0]); 1966 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1967 1968 /* make sure the item matches what we want */ 1969 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 1970 break; 1971 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) 1972 break; 1973 1974 /* release the path since we're done with it */ 1975 btrfs_release_path(root, path); 1976 1977 /* 1978 * this is where we are basically btrfs_lookup, without the 1979 * crossing root thing. we store the inode number in the 1980 * offset of the orphan item.
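 *
 * (an orphan item for inode number N is keyed as
 * (BTRFS_ORPHAN_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, N), which is why
 * found_key.offset is copied into found_key.objectid below before
 * the key is handed to btrfs_iget)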
1981 */ 1982 found_key.objectid = found_key.offset; 1983 found_key.type = BTRFS_INODE_ITEM_KEY; 1984 found_key.offset = 0; 1985 inode = btrfs_iget(root->fs_info->sb, &found_key, root); 1986 if (IS_ERR(inode)) 1987 break; 1988 1989 /* 1990 * add this inode to the orphan list so btrfs_orphan_del does 1991 * the proper thing when we hit it 1992 */ 1993 spin_lock(&root->list_lock); 1994 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 1995 spin_unlock(&root->list_lock); 1996 1997 /* 1998 * if this is a bad inode, it means we actually succeeded in 1999 * removing the inode, but not the orphan record, which means 2000 * we need to manually delete the orphan since iput will just 2001 * do a destroy_inode 2002 */ 2003 if (is_bad_inode(inode)) { 2004 trans = btrfs_start_transaction(root, 1); 2005 btrfs_orphan_del(trans, inode); 2006 btrfs_end_transaction(trans, root); 2007 iput(inode); 2008 continue; 2009 } 2010 2011 /* if we have links, this was a truncate, let's do that */ 2012 if (inode->i_nlink) { 2013 nr_truncate++; 2014 btrfs_truncate(inode); 2015 } else { 2016 nr_unlink++; 2017 } 2018 2019 /* this will do delete_inode and everything for us */ 2020 iput(inode); 2021 } 2022 2023 if (nr_unlink) 2024 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2025 if (nr_truncate) 2026 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2027 2028 btrfs_free_path(path); 2029 } 2030 2031 /* 2032 * very simple check to peek ahead in the leaf looking for xattrs. If we 2033 * don't find any xattrs, we know there can't be any acls. 2034 * 2035 * slot is the slot the inode is in, objectid is the objectid of the inode 2036 */ 2037 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 2038 int slot, u64 objectid) 2039 { 2040 u32 nritems = btrfs_header_nritems(leaf); 2041 struct btrfs_key found_key; 2042 int scanned = 0; 2043 2044 slot++; 2045 while (slot < nritems) { 2046 btrfs_item_key_to_cpu(leaf, &found_key, slot); 2047 2048 /* we found a different objectid, there must not be acls */ 2049 if (found_key.objectid != objectid) 2050 return 0; 2051 2052 /* we found an xattr, assume we've got an acl */ 2053 if (found_key.type == BTRFS_XATTR_ITEM_KEY) 2054 return 1; 2055 2056 /* 2057 * we found a key greater than an xattr key, there can't 2058 * be any acls later on 2059 */ 2060 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 2061 return 0; 2062 2063 slot++; 2064 scanned++; 2065 2066 /* 2067 * it goes inode, inode backrefs, xattrs, extents, 2068 * so if there are a ton of hard links to an inode there can 2069 * be a lot of backrefs. Don't waste time searching too hard, 2070 * this is just an optimization 2071 */ 2072 if (scanned >= 8) 2073 break; 2074 } 2075 /* we hit the end of the leaf before we found an xattr or 2076 * something larger than an xattr.
We have to assume the inode 2077 * has acls 2078 */ 2079 return 1; 2080 } 2081 2082 /* 2083 * read an inode from the btree into the in-memory inode 2084 */ 2085 static void btrfs_read_locked_inode(struct inode *inode) 2086 { 2087 struct btrfs_path *path; 2088 struct extent_buffer *leaf; 2089 struct btrfs_inode_item *inode_item; 2090 struct btrfs_timespec *tspec; 2091 struct btrfs_root *root = BTRFS_I(inode)->root; 2092 struct btrfs_key location; 2093 int maybe_acls; 2094 u64 alloc_group_block; 2095 u32 rdev; 2096 int ret; 2097 2098 path = btrfs_alloc_path(); 2099 BUG_ON(!path); 2100 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2101 2102 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 2103 if (ret) 2104 goto make_bad; 2105 2106 leaf = path->nodes[0]; 2107 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2108 struct btrfs_inode_item); 2109 2110 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2111 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); 2112 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2113 inode->i_gid = btrfs_inode_gid(leaf, inode_item); 2114 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 2115 2116 tspec = btrfs_inode_atime(inode_item); 2117 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2118 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2119 2120 tspec = btrfs_inode_mtime(inode_item); 2121 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2122 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2123 2124 tspec = btrfs_inode_ctime(inode_item); 2125 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2126 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2127 2128 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2129 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2130 BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); 2131 inode->i_generation = BTRFS_I(inode)->generation; 2132 inode->i_rdev = 0; 2133 rdev = btrfs_inode_rdev(leaf, inode_item); 2134 2135 BTRFS_I(inode)->index_cnt = (u64)-1; 2136 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2137 2138 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2139 2140 /* 2141 * try to precache a NULL acl entry for files that don't have 2142 * any xattrs or acls 2143 */ 2144 maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); 2145 if (!maybe_acls) 2146 cache_no_acl(inode); 2147 2148 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2149 alloc_group_block, 0); 2150 btrfs_free_path(path); 2151 inode_item = NULL; 2152 2153 switch (inode->i_mode & S_IFMT) { 2154 case S_IFREG: 2155 inode->i_mapping->a_ops = &btrfs_aops; 2156 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2157 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 2158 inode->i_fop = &btrfs_file_operations; 2159 inode->i_op = &btrfs_file_inode_operations; 2160 break; 2161 case S_IFDIR: 2162 inode->i_fop = &btrfs_dir_file_operations; 2163 if (root == root->fs_info->tree_root) 2164 inode->i_op = &btrfs_dir_ro_inode_operations; 2165 else 2166 inode->i_op = &btrfs_dir_inode_operations; 2167 break; 2168 case S_IFLNK: 2169 inode->i_op = &btrfs_symlink_inode_operations; 2170 inode->i_mapping->a_ops = &btrfs_symlink_aops; 2171 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2172 break; 2173 default: 2174 inode->i_op = &btrfs_special_inode_operations; 2175 init_special_inode(inode, inode->i_mode, rdev); 2176 break; 2177 } 2178 2179 
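	/* sync the on-disk btrfs inode flags into the generic vfs i_flags */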
btrfs_update_iflags(inode); 2180 return; 2181 2182 make_bad: 2183 btrfs_free_path(path); 2184 make_bad_inode(inode); 2185 } 2186 2187 /* 2188 * given a leaf and an inode, copy the inode fields into the leaf 2189 */ 2190 static void fill_inode_item(struct btrfs_trans_handle *trans, 2191 struct extent_buffer *leaf, 2192 struct btrfs_inode_item *item, 2193 struct inode *inode) 2194 { 2195 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2196 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2197 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2198 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2199 btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 2200 2201 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), 2202 inode->i_atime.tv_sec); 2203 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), 2204 inode->i_atime.tv_nsec); 2205 2206 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), 2207 inode->i_mtime.tv_sec); 2208 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), 2209 inode->i_mtime.tv_nsec); 2210 2211 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), 2212 inode->i_ctime.tv_sec); 2213 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), 2214 inode->i_ctime.tv_nsec); 2215 2216 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2217 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 2218 btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); 2219 btrfs_set_inode_transid(leaf, item, trans->transid); 2220 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2221 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2222 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); 2223 } 2224 2225 /* 2226 * copy everything in the in-memory inode into the btree. 2227 */ 2228 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2229 struct btrfs_root *root, struct inode *inode) 2230 { 2231 struct btrfs_inode_item *inode_item; 2232 struct btrfs_path *path; 2233 struct extent_buffer *leaf; 2234 int ret; 2235 2236 path = btrfs_alloc_path(); 2237 BUG_ON(!path); 2238 path->leave_spinning = 1; 2239 ret = btrfs_lookup_inode(trans, root, path, 2240 &BTRFS_I(inode)->location, 1); 2241 if (ret) { 2242 if (ret > 0) 2243 ret = -ENOENT; 2244 goto failed; 2245 } 2246 2247 btrfs_unlock_up_safe(path, 1); 2248 leaf = path->nodes[0]; 2249 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2250 struct btrfs_inode_item); 2251 2252 fill_inode_item(trans, leaf, inode_item, inode); 2253 btrfs_mark_buffer_dirty(leaf); 2254 btrfs_set_inode_last_trans(trans, inode); 2255 ret = 0; 2256 failed: 2257 btrfs_free_path(path); 2258 return ret; 2259 } 2260 2261 2262 /* 2263 * unlink helper that gets used here in inode.c and in the tree logging 2264 * recovery code. 
It removes a link in a directory with a given name, and 2265 * also drops the back refs in the inode to the directory 2266 */ 2267 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2268 struct btrfs_root *root, 2269 struct inode *dir, struct inode *inode, 2270 const char *name, int name_len) 2271 { 2272 struct btrfs_path *path; 2273 int ret = 0; 2274 struct extent_buffer *leaf; 2275 struct btrfs_dir_item *di; 2276 struct btrfs_key key; 2277 u64 index; 2278 2279 path = btrfs_alloc_path(); 2280 if (!path) { 2281 ret = -ENOMEM; 2282 goto err; 2283 } 2284 2285 path->leave_spinning = 1; 2286 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2287 name, name_len, -1); 2288 if (IS_ERR(di)) { 2289 ret = PTR_ERR(di); 2290 goto err; 2291 } 2292 if (!di) { 2293 ret = -ENOENT; 2294 goto err; 2295 } 2296 leaf = path->nodes[0]; 2297 btrfs_dir_item_key_to_cpu(leaf, di, &key); 2298 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2299 if (ret) 2300 goto err; 2301 btrfs_release_path(root, path); 2302 2303 ret = btrfs_del_inode_ref(trans, root, name, name_len, 2304 inode->i_ino, 2305 dir->i_ino, &index); 2306 if (ret) { 2307 printk(KERN_INFO "btrfs failed to delete reference to %.*s, " 2308 "inode %lu parent %lu\n", name_len, name, 2309 inode->i_ino, dir->i_ino); 2310 goto err; 2311 } 2312 2313 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 2314 index, name, name_len, -1); 2315 if (IS_ERR(di)) { 2316 ret = PTR_ERR(di); 2317 goto err; 2318 } 2319 if (!di) { 2320 ret = -ENOENT; 2321 goto err; 2322 } 2323 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2324 btrfs_release_path(root, path); 2325 2326 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2327 inode, dir->i_ino); 2328 BUG_ON(ret != 0 && ret != -ENOENT); 2329 2330 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2331 dir, index); 2332 BUG_ON(ret); 2333 err: 2334 btrfs_free_path(path); 2335 if (ret) 2336 goto out; 2337 2338 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2339 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2340 btrfs_update_inode(trans, root, dir); 2341 btrfs_drop_nlink(inode); 2342 ret = btrfs_update_inode(trans, root, inode); 2343 out: 2344 return ret; 2345 } 2346 2347 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2348 { 2349 struct btrfs_root *root; 2350 struct btrfs_trans_handle *trans; 2351 struct inode *inode = dentry->d_inode; 2352 int ret; 2353 unsigned long nr = 0; 2354 2355 root = BTRFS_I(dir)->root; 2356 2357 trans = btrfs_start_transaction(root, 1); 2358 2359 btrfs_set_trans_block_group(trans, dir); 2360 2361 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 2362 2363 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2364 dentry->d_name.name, dentry->d_name.len); 2365 2366 if (inode->i_nlink == 0) 2367 ret = btrfs_orphan_add(trans, inode); 2368 2369 nr = trans->blocks_used; 2370 2371 btrfs_end_transaction_throttle(trans, root); 2372 btrfs_btree_balance_dirty(root, nr); 2373 return ret; 2374 } 2375 2376 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 2377 struct btrfs_root *root, 2378 struct inode *dir, u64 objectid, 2379 const char *name, int name_len) 2380 { 2381 struct btrfs_path *path; 2382 struct extent_buffer *leaf; 2383 struct btrfs_dir_item *di; 2384 struct btrfs_key key; 2385 u64 index; 2386 int ret; 2387 2388 path = btrfs_alloc_path(); 2389 if (!path) 2390 return -ENOMEM; 2391 2392 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2393 name, name_len, -1); 2394 BUG_ON(!di
|| IS_ERR(di)); 2395 2396 leaf = path->nodes[0]; 2397 btrfs_dir_item_key_to_cpu(leaf, di, &key); 2398 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 2399 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2400 BUG_ON(ret); 2401 btrfs_release_path(root, path); 2402 2403 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, 2404 objectid, root->root_key.objectid, 2405 dir->i_ino, &index, name, name_len); 2406 if (ret < 0) { 2407 BUG_ON(ret != -ENOENT); 2408 di = btrfs_search_dir_index_item(root, path, dir->i_ino, 2409 name, name_len); 2410 BUG_ON(!di || IS_ERR(di)); 2411 2412 leaf = path->nodes[0]; 2413 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2414 btrfs_release_path(root, path); 2415 index = key.offset; 2416 } 2417 2418 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 2419 index, name, name_len, -1); 2420 BUG_ON(!di || IS_ERR(di)); 2421 2422 leaf = path->nodes[0]; 2423 btrfs_dir_item_key_to_cpu(leaf, di, &key); 2424 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 2425 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2426 BUG_ON(ret); 2427 btrfs_release_path(root, path); 2428 2429 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2430 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2431 ret = btrfs_update_inode(trans, root, dir); 2432 BUG_ON(ret); 2433 dir->i_sb->s_dirt = 1; 2434 2435 btrfs_free_path(path); 2436 return 0; 2437 } 2438 2439 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 2440 { 2441 struct inode *inode = dentry->d_inode; 2442 int err = 0; 2443 int ret; 2444 struct btrfs_root *root = BTRFS_I(dir)->root; 2445 struct btrfs_trans_handle *trans; 2446 unsigned long nr = 0; 2447 2448 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 2449 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2450 return -ENOTEMPTY; 2451 2452 trans = btrfs_start_transaction(root, 1); 2453 btrfs_set_trans_block_group(trans, dir); 2454 2455 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 2456 err = btrfs_unlink_subvol(trans, root, dir, 2457 BTRFS_I(inode)->location.objectid, 2458 dentry->d_name.name, 2459 dentry->d_name.len); 2460 goto out; 2461 } 2462 2463 err = btrfs_orphan_add(trans, inode); 2464 if (err) 2465 goto out; 2466 2467 /* now the directory is empty */ 2468 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2469 dentry->d_name.name, dentry->d_name.len); 2470 if (!err) 2471 btrfs_i_size_write(inode, 0); 2472 out: 2473 nr = trans->blocks_used; 2474 ret = btrfs_end_transaction_throttle(trans, root); 2475 btrfs_btree_balance_dirty(root, nr); 2476 2477 if (ret && !err) 2478 err = ret; 2479 return err; 2480 } 2481 2482 #if 0 2483 /* 2484 * when truncating bytes in a file, it is possible to avoid reading 2485 * the leaves that contain only checksum items. This can be the 2486 * majority of the IO required to delete a large file, but it must 2487 * be done carefully. 2488 * 2489 * The keys in the level just above the leaves are checked to make sure 2490 * the lowest key in a given leaf is a csum key, and starts at an offset 2491 * after the new size. 2492 * 2493 * Then the key for the next leaf is checked to make sure it also has 2494 * a checksum item for the same file. If it does, we know our target leaf 2495 * contains only checksum items, and it can be safely freed without reading 2496 * it. 2497 * 2498 * This is just an optimization targeted at large files. It may do 2499 * nothing. It will return 0 unless things went badly. 
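 *
 * a rough sketch of the idea: if the level 1 node carries the keys
 * (ino CSUM_ITEM 64k) (ino CSUM_ITEM 128k) and new_size is below 64k,
 * the leaf under the first key is bracketed by csum keys for this
 * inode on both sides, so every item in it must be a csum item past
 * the new size and the whole leaf can be freed without reading it.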
2500 */ 2501 static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans, 2502 struct btrfs_root *root, 2503 struct btrfs_path *path, 2504 struct inode *inode, u64 new_size) 2505 { 2506 struct btrfs_key key; 2507 int ret; 2508 int nritems; 2509 struct btrfs_key found_key; 2510 struct btrfs_key other_key; 2511 struct btrfs_leaf_ref *ref; 2512 u64 leaf_gen; 2513 u64 leaf_start; 2514 2515 path->lowest_level = 1; 2516 key.objectid = inode->i_ino; 2517 key.type = BTRFS_CSUM_ITEM_KEY; 2518 key.offset = new_size; 2519 again: 2520 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2521 if (ret < 0) 2522 goto out; 2523 2524 if (path->nodes[1] == NULL) { 2525 ret = 0; 2526 goto out; 2527 } 2528 ret = 0; 2529 btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]); 2530 nritems = btrfs_header_nritems(path->nodes[1]); 2531 2532 if (!nritems) 2533 goto out; 2534 2535 if (path->slots[1] >= nritems) 2536 goto next_node; 2537 2538 /* did we find a key greater than anything we want to delete? */ 2539 if (found_key.objectid > inode->i_ino || 2540 (found_key.objectid == inode->i_ino && found_key.type > key.type)) 2541 goto out; 2542 2543 /* we check the next key in the node to make sure the leaf contains 2544 * only checksum items. This comparison doesn't work if our 2545 * leaf is the last one in the node 2546 */ 2547 if (path->slots[1] + 1 >= nritems) { 2548 next_node: 2549 /* search forward from the last key in the node, this 2550 * will bring us into the next node in the tree 2551 */ 2552 btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1); 2553 2554 /* unlikely, but we inc below, so check to be safe */ 2555 if (found_key.offset == (u64)-1) 2556 goto out; 2557 2558 /* search_forward needs a path with locks held, do the 2559 * search again for the original key. It is possible 2560 * this will race with a balance and return a path that 2561 * we could modify, but this drop is just an optimization 2562 * and is allowed to miss some leaves. 2563 */ 2564 btrfs_release_path(root, path); 2565 found_key.offset++; 2566 2567 /* setup a max key for search_forward */ 2568 other_key.offset = (u64)-1; 2569 other_key.type = key.type; 2570 other_key.objectid = key.objectid; 2571 2572 path->keep_locks = 1; 2573 ret = btrfs_search_forward(root, &found_key, &other_key, 2574 path, 0, 0); 2575 path->keep_locks = 0; 2576 if (ret || found_key.objectid != key.objectid || 2577 found_key.type != key.type) { 2578 ret = 0; 2579 goto out; 2580 } 2581 2582 key.offset = found_key.offset; 2583 btrfs_release_path(root, path); 2584 cond_resched(); 2585 goto again; 2586 } 2587 2588 /* we know there's one more slot after us in the tree, 2589 * read that key so we can verify it is also a checksum item 2590 */ 2591 btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1); 2592 2593 if (found_key.objectid < inode->i_ino) 2594 goto next_key; 2595 2596 if (found_key.type != key.type || found_key.offset < new_size) 2597 goto next_key; 2598 2599 /* 2600 * if the key for the next leaf isn't a csum key from this objectid, 2601 * we can't be sure there aren't good items inside this leaf.
2602 * Bail out 2603 */ 2604 if (other_key.objectid != inode->i_ino || other_key.type != key.type) 2605 goto out; 2606 2607 leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]); 2608 leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]); 2609 /* 2610 * it is safe to delete this leaf, it contains only 2611 * csum items from this inode at an offset >= new_size 2612 */ 2613 ret = btrfs_del_leaf(trans, root, path, leaf_start); 2614 BUG_ON(ret); 2615 2616 if (root->ref_cows && leaf_gen < trans->transid) { 2617 ref = btrfs_alloc_leaf_ref(root, 0); 2618 if (ref) { 2619 ref->root_gen = root->root_key.offset; 2620 ref->bytenr = leaf_start; 2621 ref->owner = 0; 2622 ref->generation = leaf_gen; 2623 ref->nritems = 0; 2624 2625 btrfs_sort_leaf_ref(ref); 2626 2627 ret = btrfs_add_leaf_ref(root, ref, 0); 2628 WARN_ON(ret); 2629 btrfs_free_leaf_ref(root, ref); 2630 } else { 2631 WARN_ON(1); 2632 } 2633 } 2634 next_key: 2635 btrfs_release_path(root, path); 2636 2637 if (other_key.objectid == inode->i_ino && 2638 other_key.type == key.type && other_key.offset > key.offset) { 2639 key.offset = other_key.offset; 2640 cond_resched(); 2641 goto again; 2642 } 2643 ret = 0; 2644 out: 2645 /* fixup any changes we've made to the path */ 2646 path->lowest_level = 0; 2647 path->keep_locks = 0; 2648 btrfs_release_path(root, path); 2649 return ret; 2650 } 2651 2652 #endif 2653 2654 /* 2655 * this can truncate away extent items, csum items and directory items. 2656 * It starts at a high offset and removes keys until it can't find 2657 * any higher than new_size 2658 * 2659 * csum items that cross the new i_size are truncated to the new size 2660 * as well. 2661 * 2662 * min_type is the minimum key type to truncate down to. If set to 0, this 2663 * will kill all the items on this inode, including the INODE_ITEM_KEY. 
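 *
 * for example, btrfs_delete_inode below passes min_type 0 to empty
 * the inode out completely, while a size-changing truncate would pass
 * BTRFS_EXTENT_DATA_KEY so the inode item, refs and xattrs survive.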
2664 */ 2665 noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 2666 struct btrfs_root *root, 2667 struct inode *inode, 2668 u64 new_size, u32 min_type) 2669 { 2670 int ret; 2671 struct btrfs_path *path; 2672 struct btrfs_key key; 2673 struct btrfs_key found_key; 2674 u32 found_type = (u8)-1; 2675 struct extent_buffer *leaf; 2676 struct btrfs_file_extent_item *fi; 2677 u64 extent_start = 0; 2678 u64 extent_num_bytes = 0; 2679 u64 extent_offset = 0; 2680 u64 item_end = 0; 2681 int found_extent; 2682 int del_item; 2683 int pending_del_nr = 0; 2684 int pending_del_slot = 0; 2685 int extent_type = -1; 2686 int encoding; 2687 u64 mask = root->sectorsize - 1; 2688 2689 if (root->ref_cows) 2690 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 2691 path = btrfs_alloc_path(); 2692 BUG_ON(!path); 2693 path->reada = -1; 2694 2695 /* FIXME, add redo link to tree so we don't leak on crash */ 2696 key.objectid = inode->i_ino; 2697 key.offset = (u64)-1; 2698 key.type = (u8)-1; 2699 2700 search_again: 2701 path->leave_spinning = 1; 2702 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2703 if (ret < 0) 2704 goto error; 2705 2706 if (ret > 0) { 2707 /* there are no items in the tree for us to truncate, we're 2708 * done 2709 */ 2710 if (path->slots[0] == 0) { 2711 ret = 0; 2712 goto error; 2713 } 2714 path->slots[0]--; 2715 } 2716 2717 while (1) { 2718 fi = NULL; 2719 leaf = path->nodes[0]; 2720 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2721 found_type = btrfs_key_type(&found_key); 2722 encoding = 0; 2723 2724 if (found_key.objectid != inode->i_ino) 2725 break; 2726 2727 if (found_type < min_type) 2728 break; 2729 2730 item_end = found_key.offset; 2731 if (found_type == BTRFS_EXTENT_DATA_KEY) { 2732 fi = btrfs_item_ptr(leaf, path->slots[0], 2733 struct btrfs_file_extent_item); 2734 extent_type = btrfs_file_extent_type(leaf, fi); 2735 encoding = btrfs_file_extent_compression(leaf, fi); 2736 encoding |= btrfs_file_extent_encryption(leaf, fi); 2737 encoding |= btrfs_file_extent_other_encoding(leaf, fi); 2738 2739 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 2740 item_end += 2741 btrfs_file_extent_num_bytes(leaf, fi); 2742 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2743 item_end += btrfs_file_extent_inline_len(leaf, 2744 fi); 2745 } 2746 item_end--; 2747 } 2748 if (item_end < new_size) { 2749 if (found_type == BTRFS_DIR_ITEM_KEY) 2750 found_type = BTRFS_INODE_ITEM_KEY; 2751 else if (found_type == BTRFS_EXTENT_ITEM_KEY) 2752 found_type = BTRFS_EXTENT_DATA_KEY; 2753 else if (found_type == BTRFS_EXTENT_DATA_KEY) 2754 found_type = BTRFS_XATTR_ITEM_KEY; 2755 else if (found_type == BTRFS_XATTR_ITEM_KEY) 2756 found_type = BTRFS_INODE_REF_KEY; 2757 else if (found_type) 2758 found_type--; 2759 else 2760 break; 2761 btrfs_set_key_type(&key, found_type); 2762 goto next; 2763 } 2764 if (found_key.offset >= new_size) 2765 del_item = 1; 2766 else 2767 del_item = 0; 2768 found_extent = 0; 2769 2770 /* FIXME, shrink the extent if the ref count is only 1 */ 2771 if (found_type != BTRFS_EXTENT_DATA_KEY) 2772 goto delete; 2773 2774 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 2775 u64 num_dec; 2776 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 2777 if (!del_item && !encoding) { 2778 u64 orig_num_bytes = 2779 btrfs_file_extent_num_bytes(leaf, fi); 2780 extent_num_bytes = new_size - 2781 found_key.offset + root->sectorsize - 1; 2782 extent_num_bytes = extent_num_bytes & 2783 ~((u64)root->sectorsize - 1); 2784 btrfs_set_file_extent_num_bytes(leaf, 
fi, 2785 extent_num_bytes); 2786 num_dec = (orig_num_bytes - 2787 extent_num_bytes); 2788 if (root->ref_cows && extent_start != 0) 2789 inode_sub_bytes(inode, num_dec); 2790 btrfs_mark_buffer_dirty(leaf); 2791 } else { 2792 extent_num_bytes = 2793 btrfs_file_extent_disk_num_bytes(leaf, 2794 fi); 2795 extent_offset = found_key.offset - 2796 btrfs_file_extent_offset(leaf, fi); 2797 2798 /* FIXME blocksize != 4096 */ 2799 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 2800 if (extent_start != 0) { 2801 found_extent = 1; 2802 if (root->ref_cows) 2803 inode_sub_bytes(inode, num_dec); 2804 } 2805 } 2806 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2807 /* 2808 * we can't truncate inline items that have had 2809 * special encodings 2810 */ 2811 if (!del_item && 2812 btrfs_file_extent_compression(leaf, fi) == 0 && 2813 btrfs_file_extent_encryption(leaf, fi) == 0 && 2814 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 2815 u32 size = new_size - found_key.offset; 2816 2817 if (root->ref_cows) { 2818 inode_sub_bytes(inode, item_end + 1 - 2819 new_size); 2820 } 2821 size = 2822 btrfs_file_extent_calc_inline_size(size); 2823 ret = btrfs_truncate_item(trans, root, path, 2824 size, 1); 2825 BUG_ON(ret); 2826 } else if (root->ref_cows) { 2827 inode_sub_bytes(inode, item_end + 1 - 2828 found_key.offset); 2829 } 2830 } 2831 delete: 2832 if (del_item) { 2833 if (!pending_del_nr) { 2834 /* no pending yet, add ourselves */ 2835 pending_del_slot = path->slots[0]; 2836 pending_del_nr = 1; 2837 } else if (pending_del_nr && 2838 path->slots[0] + 1 == pending_del_slot) { 2839 /* hop on the pending chunk */ 2840 pending_del_nr++; 2841 pending_del_slot = path->slots[0]; 2842 } else { 2843 BUG(); 2844 } 2845 } else { 2846 break; 2847 } 2848 if (found_extent && root->ref_cows) { 2849 btrfs_set_path_blocking(path); 2850 ret = btrfs_free_extent(trans, root, extent_start, 2851 extent_num_bytes, 0, 2852 btrfs_header_owner(leaf), 2853 inode->i_ino, extent_offset); 2854 BUG_ON(ret); 2855 } 2856 next: 2857 if (path->slots[0] == 0) { 2858 if (pending_del_nr) 2859 goto del_pending; 2860 btrfs_release_path(root, path); 2861 if (found_type == BTRFS_INODE_ITEM_KEY) 2862 break; 2863 goto search_again; 2864 } 2865 2866 path->slots[0]--; 2867 if (pending_del_nr && 2868 path->slots[0] + 1 != pending_del_slot) { 2869 struct btrfs_key debug; 2870 del_pending: 2871 btrfs_item_key_to_cpu(path->nodes[0], &debug, 2872 pending_del_slot); 2873 ret = btrfs_del_items(trans, root, path, 2874 pending_del_slot, 2875 pending_del_nr); 2876 BUG_ON(ret); 2877 pending_del_nr = 0; 2878 btrfs_release_path(root, path); 2879 if (found_type == BTRFS_INODE_ITEM_KEY) 2880 break; 2881 goto search_again; 2882 } 2883 } 2884 ret = 0; 2885 error: 2886 if (pending_del_nr) { 2887 ret = btrfs_del_items(trans, root, path, pending_del_slot, 2888 pending_del_nr); 2889 } 2890 btrfs_free_path(path); 2891 return ret; 2892 } 2893 2894 /* 2895 * taken from block_truncate_page, but does cow as it zeros out 2896 * any bytes left in the last page in the file. 
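 *
 * e.g. truncating to 3000 bytes with 4k pages leaves bytes 3000-4095
 * of the final page allocated; they are zeroed here via the regular
 * delalloc path so the old contents can't reappear past i_size.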
2897 */ 2898 static int btrfs_truncate_page(struct address_space *mapping, loff_t from) 2899 { 2900 struct inode *inode = mapping->host; 2901 struct btrfs_root *root = BTRFS_I(inode)->root; 2902 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2903 struct btrfs_ordered_extent *ordered; 2904 char *kaddr; 2905 u32 blocksize = root->sectorsize; 2906 pgoff_t index = from >> PAGE_CACHE_SHIFT; 2907 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2908 struct page *page; 2909 int ret = 0; 2910 u64 page_start; 2911 u64 page_end; 2912 2913 if ((offset & (blocksize - 1)) == 0) 2914 goto out; 2915 2916 ret = -ENOMEM; 2917 again: 2918 page = grab_cache_page(mapping, index); 2919 if (!page) 2920 goto out; 2921 2922 page_start = page_offset(page); 2923 page_end = page_start + PAGE_CACHE_SIZE - 1; 2924 2925 if (!PageUptodate(page)) { 2926 ret = btrfs_readpage(NULL, page); 2927 lock_page(page); 2928 if (page->mapping != mapping) { 2929 unlock_page(page); 2930 page_cache_release(page); 2931 goto again; 2932 } 2933 if (!PageUptodate(page)) { 2934 ret = -EIO; 2935 goto out_unlock; 2936 } 2937 } 2938 wait_on_page_writeback(page); 2939 2940 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 2941 set_page_extent_mapped(page); 2942 2943 ordered = btrfs_lookup_ordered_extent(inode, page_start); 2944 if (ordered) { 2945 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 2946 unlock_page(page); 2947 page_cache_release(page); 2948 btrfs_start_ordered_extent(inode, ordered, 1); 2949 btrfs_put_ordered_extent(ordered); 2950 goto again; 2951 } 2952 2953 btrfs_set_extent_delalloc(inode, page_start, page_end); 2954 ret = 0; 2955 if (offset != PAGE_CACHE_SIZE) { 2956 kaddr = kmap(page); 2957 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 2958 flush_dcache_page(page); 2959 kunmap(page); 2960 } 2961 ClearPageChecked(page); 2962 set_page_dirty(page); 2963 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 2964 2965 out_unlock: 2966 unlock_page(page); 2967 page_cache_release(page); 2968 out: 2969 return ret; 2970 } 2971 2972 int btrfs_cont_expand(struct inode *inode, loff_t size) 2973 { 2974 struct btrfs_trans_handle *trans; 2975 struct btrfs_root *root = BTRFS_I(inode)->root; 2976 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2977 struct extent_map *em; 2978 u64 mask = root->sectorsize - 1; 2979 u64 hole_start = (inode->i_size + mask) & ~mask; 2980 u64 block_end = (size + mask) & ~mask; 2981 u64 last_byte; 2982 u64 cur_offset; 2983 u64 hole_size; 2984 int err; 2985 2986 if (size <= hole_start) 2987 return 0; 2988 2989 err = btrfs_check_metadata_free_space(root); 2990 if (err) 2991 return err; 2992 2993 btrfs_truncate_page(inode->i_mapping, inode->i_size); 2994 2995 while (1) { 2996 struct btrfs_ordered_extent *ordered; 2997 btrfs_wait_ordered_range(inode, hole_start, 2998 block_end - hole_start); 2999 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3000 ordered = btrfs_lookup_ordered_extent(inode, hole_start); 3001 if (!ordered) 3002 break; 3003 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3004 btrfs_put_ordered_extent(ordered); 3005 } 3006 3007 trans = btrfs_start_transaction(root, 1); 3008 btrfs_set_trans_block_group(trans, inode); 3009 3010 cur_offset = hole_start; 3011 while (1) { 3012 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 3013 block_end - cur_offset, 0); 3014 BUG_ON(IS_ERR(em) || !em); 3015 last_byte = min(extent_map_end(em), block_end); 3016 last_byte = (last_byte + mask) & ~mask; 3017 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { 3018 u64 
hint_byte = 0; 3019 hole_size = last_byte - cur_offset; 3020 err = btrfs_drop_extents(trans, root, inode, 3021 cur_offset, 3022 cur_offset + hole_size, 3023 block_end, 3024 cur_offset, &hint_byte, 1); 3025 if (err) 3026 break; 3027 err = btrfs_insert_file_extent(trans, root, 3028 inode->i_ino, cur_offset, 0, 3029 0, hole_size, 0, hole_size, 3030 0, 0, 0); 3031 btrfs_drop_extent_cache(inode, hole_start, 3032 last_byte - 1, 0); 3033 } 3034 free_extent_map(em); 3035 cur_offset = last_byte; 3036 if (err || cur_offset >= block_end) 3037 break; 3038 } 3039 3040 btrfs_end_transaction(trans, root); 3041 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3042 return err; 3043 } 3044 3045 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3046 { 3047 struct inode *inode = dentry->d_inode; 3048 int err; 3049 3050 err = inode_change_ok(inode, attr); 3051 if (err) 3052 return err; 3053 3054 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3055 if (attr->ia_size > inode->i_size) { 3056 err = btrfs_cont_expand(inode, attr->ia_size); 3057 if (err) 3058 return err; 3059 } else if (inode->i_size > 0 && 3060 attr->ia_size == 0) { 3061 3062 /* we're truncating a file that used to have good 3063 * data down to zero. Make sure it gets into 3064 * the ordered flush list so that any new writes 3065 * get down to disk quickly. 3066 */ 3067 BTRFS_I(inode)->ordered_data_close = 1; 3068 } 3069 } 3070 3071 err = inode_setattr(inode, attr); 3072 3073 if (!err && ((attr->ia_valid & ATTR_MODE))) 3074 err = btrfs_acl_chmod(inode); 3075 return err; 3076 } 3077 3078 void btrfs_delete_inode(struct inode *inode) 3079 { 3080 struct btrfs_trans_handle *trans; 3081 struct btrfs_root *root = BTRFS_I(inode)->root; 3082 unsigned long nr; 3083 int ret; 3084 3085 truncate_inode_pages(&inode->i_data, 0); 3086 if (is_bad_inode(inode)) { 3087 btrfs_orphan_del(NULL, inode); 3088 goto no_delete; 3089 } 3090 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3091 3092 if (inode->i_nlink > 0) { 3093 BUG_ON(btrfs_root_refs(&root->root_item) != 0); 3094 goto no_delete; 3095 } 3096 3097 btrfs_i_size_write(inode, 0); 3098 trans = btrfs_join_transaction(root, 1); 3099 3100 btrfs_set_trans_block_group(trans, inode); 3101 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0); 3102 if (ret) { 3103 btrfs_orphan_del(NULL, inode); 3104 goto no_delete_lock; 3105 } 3106 3107 btrfs_orphan_del(trans, inode); 3108 3109 nr = trans->blocks_used; 3110 clear_inode(inode); 3111 3112 btrfs_end_transaction(trans, root); 3113 btrfs_btree_balance_dirty(root, nr); 3114 return; 3115 3116 no_delete_lock: 3117 nr = trans->blocks_used; 3118 btrfs_end_transaction(trans, root); 3119 btrfs_btree_balance_dirty(root, nr); 3120 no_delete: 3121 clear_inode(inode); 3122 } 3123 3124 /* 3125 * this returns the key found in the dir entry in the location pointer. 3126 * If no dir entries were found, location->objectid is 0. 
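 *
 * (callers such as btrfs_lookup_dentry treat objectid == 0 as a
 * negative lookup rather than an error)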
3127 */ 3128 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 3129 struct btrfs_key *location) 3130 { 3131 const char *name = dentry->d_name.name; 3132 int namelen = dentry->d_name.len; 3133 struct btrfs_dir_item *di; 3134 struct btrfs_path *path; 3135 struct btrfs_root *root = BTRFS_I(dir)->root; 3136 int ret = 0; 3137 3138 path = btrfs_alloc_path(); 3139 BUG_ON(!path); 3140 3141 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, 3142 namelen, 0); 3143 if (IS_ERR(di)) 3144 ret = PTR_ERR(di); 3145 3146 if (!di || IS_ERR(di)) 3147 goto out_err; 3148 3149 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 3150 out: 3151 btrfs_free_path(path); 3152 return ret; 3153 out_err: 3154 location->objectid = 0; 3155 goto out; 3156 } 3157 3158 /* 3159 * when we hit a tree root in a directory, the btrfs part of the inode 3160 * needs to be changed to reflect the root directory of the tree root. This 3161 * is kind of like crossing a mount point. 3162 */ 3163 static int fixup_tree_root_location(struct btrfs_root *root, 3164 struct inode *dir, 3165 struct dentry *dentry, 3166 struct btrfs_key *location, 3167 struct btrfs_root **sub_root) 3168 { 3169 struct btrfs_path *path; 3170 struct btrfs_root *new_root; 3171 struct btrfs_root_ref *ref; 3172 struct extent_buffer *leaf; 3173 int ret; 3174 int err = 0; 3175 3176 path = btrfs_alloc_path(); 3177 if (!path) { 3178 err = -ENOMEM; 3179 goto out; 3180 } 3181 3182 err = -ENOENT; 3183 ret = btrfs_find_root_ref(root->fs_info->tree_root, path, 3184 BTRFS_I(dir)->root->root_key.objectid, 3185 location->objectid); 3186 if (ret) { 3187 if (ret < 0) 3188 err = ret; 3189 goto out; 3190 } 3191 3192 leaf = path->nodes[0]; 3193 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 3194 if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || 3195 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 3196 goto out; 3197 3198 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 3199 (unsigned long)(ref + 1), 3200 dentry->d_name.len); 3201 if (ret) 3202 goto out; 3203 3204 btrfs_release_path(root->fs_info->tree_root, path); 3205 3206 new_root = btrfs_read_fs_root_no_name(root->fs_info, location); 3207 if (IS_ERR(new_root)) { 3208 err = PTR_ERR(new_root); 3209 goto out; 3210 } 3211 3212 if (btrfs_root_refs(&new_root->root_item) == 0) { 3213 err = -ENOENT; 3214 goto out; 3215 } 3216 3217 *sub_root = new_root; 3218 location->objectid = btrfs_root_dirid(&new_root->root_item); 3219 location->type = BTRFS_INODE_ITEM_KEY; 3220 location->offset = 0; 3221 err = 0; 3222 out: 3223 btrfs_free_path(path); 3224 return err; 3225 } 3226 3227 static void inode_tree_add(struct inode *inode) 3228 { 3229 struct btrfs_root *root = BTRFS_I(inode)->root; 3230 struct btrfs_inode *entry; 3231 struct rb_node **p; 3232 struct rb_node *parent; 3233 again: 3234 p = &root->inode_tree.rb_node; 3235 parent = NULL; 3236 3237 if (hlist_unhashed(&inode->i_hash)) 3238 return; 3239 3240 spin_lock(&root->inode_lock); 3241 while (*p) { 3242 parent = *p; 3243 entry = rb_entry(parent, struct btrfs_inode, rb_node); 3244 3245 if (inode->i_ino < entry->vfs_inode.i_ino) 3246 p = &parent->rb_left; 3247 else if (inode->i_ino > entry->vfs_inode.i_ino) 3248 p = &parent->rb_right; 3249 else { 3250 WARN_ON(!(entry->vfs_inode.i_state & 3251 (I_WILL_FREE | I_FREEING | I_CLEAR))); 3252 rb_erase(parent, &root->inode_tree); 3253 RB_CLEAR_NODE(parent); 3254 spin_unlock(&root->inode_lock); 3255 goto again; 3256 } 3257 } 3258 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); 
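	/* the walk above found our parent slot; recolor to rebalance the
	 * per-root rbtree, which is ordered by inode number
	 */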
3259 rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3260 spin_unlock(&root->inode_lock); 3261 } 3262 3263 static void inode_tree_del(struct inode *inode) 3264 { 3265 struct btrfs_root *root = BTRFS_I(inode)->root; 3266 int empty = 0; 3267 3268 spin_lock(&root->inode_lock); 3269 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 3270 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3271 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 3272 empty = RB_EMPTY_ROOT(&root->inode_tree); 3273 } 3274 spin_unlock(&root->inode_lock); 3275 3276 if (empty && btrfs_root_refs(&root->root_item) == 0) { 3277 synchronize_srcu(&root->fs_info->subvol_srcu); 3278 spin_lock(&root->inode_lock); 3279 empty = RB_EMPTY_ROOT(&root->inode_tree); 3280 spin_unlock(&root->inode_lock); 3281 if (empty) 3282 btrfs_add_dead_root(root); 3283 } 3284 } 3285 3286 int btrfs_invalidate_inodes(struct btrfs_root *root) 3287 { 3288 struct rb_node *node; 3289 struct rb_node *prev; 3290 struct btrfs_inode *entry; 3291 struct inode *inode; 3292 u64 objectid = 0; 3293 3294 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 3295 3296 spin_lock(&root->inode_lock); 3297 again: 3298 node = root->inode_tree.rb_node; 3299 prev = NULL; 3300 while (node) { 3301 prev = node; 3302 entry = rb_entry(node, struct btrfs_inode, rb_node); 3303 3304 if (objectid < entry->vfs_inode.i_ino) 3305 node = node->rb_left; 3306 else if (objectid > entry->vfs_inode.i_ino) 3307 node = node->rb_right; 3308 else 3309 break; 3310 } 3311 if (!node) { 3312 while (prev) { 3313 entry = rb_entry(prev, struct btrfs_inode, rb_node); 3314 if (objectid <= entry->vfs_inode.i_ino) { 3315 node = prev; 3316 break; 3317 } 3318 prev = rb_next(prev); 3319 } 3320 } 3321 while (node) { 3322 entry = rb_entry(node, struct btrfs_inode, rb_node); 3323 objectid = entry->vfs_inode.i_ino + 1; 3324 inode = igrab(&entry->vfs_inode); 3325 if (inode) { 3326 spin_unlock(&root->inode_lock); 3327 if (atomic_read(&inode->i_count) > 1) 3328 d_prune_aliases(inode); 3329 /* 3330 * btrfs_drop_inode will remove it from 3331 * the inode cache when its usage count 3332 * hits zero. 
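 *
 * note that root->inode_lock was dropped above: the final iput
 * may evict the inode and take this lock again via inode_tree_del,
 * so it must not run under the spinlock.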
3333 */ 3334 iput(inode); 3335 cond_resched(); 3336 spin_lock(&root->inode_lock); 3337 goto again; 3338 } 3339 3340 if (cond_resched_lock(&root->inode_lock)) 3341 goto again; 3342 3343 node = rb_next(node); 3344 } 3345 spin_unlock(&root->inode_lock); 3346 return 0; 3347 } 3348 3349 static noinline void init_btrfs_i(struct inode *inode) 3350 { 3351 struct btrfs_inode *bi = BTRFS_I(inode); 3352 3353 bi->generation = 0; 3354 bi->sequence = 0; 3355 bi->last_trans = 0; 3356 bi->logged_trans = 0; 3357 bi->delalloc_bytes = 0; 3358 bi->reserved_bytes = 0; 3359 bi->disk_i_size = 0; 3360 bi->flags = 0; 3361 bi->index_cnt = (u64)-1; 3362 bi->last_unlink_trans = 0; 3363 bi->ordered_data_close = 0; 3364 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3365 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3366 inode->i_mapping, GFP_NOFS); 3367 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, 3368 inode->i_mapping, GFP_NOFS); 3369 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3370 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); 3371 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 3372 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3373 mutex_init(&BTRFS_I(inode)->extent_mutex); 3374 mutex_init(&BTRFS_I(inode)->log_mutex); 3375 } 3376 3377 static int btrfs_init_locked_inode(struct inode *inode, void *p) 3378 { 3379 struct btrfs_iget_args *args = p; 3380 inode->i_ino = args->ino; 3381 init_btrfs_i(inode); 3382 BTRFS_I(inode)->root = args->root; 3383 btrfs_set_inode_space_info(args->root, inode); 3384 return 0; 3385 } 3386 3387 static int btrfs_find_actor(struct inode *inode, void *opaque) 3388 { 3389 struct btrfs_iget_args *args = opaque; 3390 return args->ino == inode->i_ino && 3391 args->root == BTRFS_I(inode)->root; 3392 } 3393 3394 static struct inode *btrfs_iget_locked(struct super_block *s, 3395 u64 objectid, 3396 struct btrfs_root *root) 3397 { 3398 struct inode *inode; 3399 struct btrfs_iget_args args; 3400 args.ino = objectid; 3401 args.root = root; 3402 3403 inode = iget5_locked(s, objectid, btrfs_find_actor, 3404 btrfs_init_locked_inode, 3405 (void *)&args); 3406 return inode; 3407 } 3408 3409 /* Get an inode object given its location and corresponding root. 
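 * A typical caller fills in roughly
 *   key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0;
 * and hands the key plus the owning root to btrfs_iget, as
 * btrfs_orphan_cleanup above does.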
3410 * The inode is read from disk and cached if it was not already in memory. 3411 */ 3412 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 3413 struct btrfs_root *root) 3414 { 3415 struct inode *inode; 3416 3417 inode = btrfs_iget_locked(s, location->objectid, root); 3418 if (!inode) 3419 return ERR_PTR(-ENOMEM); 3420 3421 if (inode->i_state & I_NEW) { 3422 BTRFS_I(inode)->root = root; 3423 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 3424 btrfs_read_locked_inode(inode); 3425 3426 inode_tree_add(inode); 3427 unlock_new_inode(inode); 3428 } 3429 3430 return inode; 3431 } 3432 3433 static struct inode *new_simple_dir(struct super_block *s, 3434 struct btrfs_key *key, 3435 struct btrfs_root *root) 3436 { 3437 struct inode *inode = new_inode(s); 3438 3439 if (!inode) 3440 return ERR_PTR(-ENOMEM); 3441 3442 init_btrfs_i(inode); 3443 3444 BTRFS_I(inode)->root = root; 3445 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 3446 BTRFS_I(inode)->dummy_inode = 1; 3447 3448 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 3449 inode->i_op = &simple_dir_inode_operations; 3450 inode->i_fop = &simple_dir_operations; 3451 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 3452 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 3453 3454 return inode; 3455 } 3456 3457 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 3458 { 3459 struct inode *inode; 3460 struct btrfs_root *root = BTRFS_I(dir)->root; 3461 struct btrfs_root *sub_root = root; 3462 struct btrfs_key location; 3463 int index; 3464 int ret; 3465 3466 dentry->d_op = &btrfs_dentry_operations; 3467 3468 if (dentry->d_name.len > BTRFS_NAME_LEN) 3469 return ERR_PTR(-ENAMETOOLONG); 3470 3471 ret = btrfs_inode_by_name(dir, dentry, &location); 3472 3473 if (ret < 0) 3474 return ERR_PTR(ret); 3475 3476 if (location.objectid == 0) 3477 return NULL; 3478 3479 if (location.type == BTRFS_INODE_ITEM_KEY) { 3480 inode = btrfs_iget(dir->i_sb, &location, root); 3481 return inode; 3482 } 3483 3484 BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); 3485 3486 index = srcu_read_lock(&root->fs_info->subvol_srcu); 3487 ret = fixup_tree_root_location(root, dir, dentry, 3488 &location, &sub_root); 3489 if (ret < 0) { 3490 if (ret != -ENOENT) 3491 inode = ERR_PTR(ret); 3492 else 3493 inode = new_simple_dir(dir->i_sb, &location, sub_root); 3494 } else { 3495 inode = btrfs_iget(dir->i_sb, &location, sub_root); 3496 } 3497 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 3498 3499 return inode; 3500 } 3501 3502 static int btrfs_dentry_delete(struct dentry *dentry) 3503 { 3504 struct btrfs_root *root; 3505 3506 if (!dentry->d_inode) 3507 return 0; 3508 3509 root = BTRFS_I(dentry->d_inode)->root; 3510 if (btrfs_root_refs(&root->root_item) == 0) 3511 return 1; 3512 return 0; 3513 } 3514 3515 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 3516 struct nameidata *nd) 3517 { 3518 struct inode *inode; 3519 3520 inode = btrfs_lookup_dentry(dir, dentry); 3521 if (IS_ERR(inode)) 3522 return ERR_CAST(inode); 3523 3524 return d_splice_alias(inode, dentry); 3525 } 3526 3527 static unsigned char btrfs_filetype_table[] = { 3528 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 3529 }; 3530 3531 static int btrfs_real_readdir(struct file *filp, void *dirent, 3532 filldir_t filldir) 3533 { 3534 struct inode *inode = filp->f_dentry->d_inode; 3535 struct btrfs_root *root = BTRFS_I(inode)->root; 3536 struct btrfs_item *item; 3537 struct btrfs_dir_item *di; 3538 struct
btrfs_key key; 3539 struct btrfs_key found_key; 3540 struct btrfs_path *path; 3541 int ret; 3542 u32 nritems; 3543 struct extent_buffer *leaf; 3544 int slot; 3545 int advance; 3546 unsigned char d_type; 3547 int over = 0; 3548 u32 di_cur; 3549 u32 di_total; 3550 u32 di_len; 3551 int key_type = BTRFS_DIR_INDEX_KEY; 3552 char tmp_name[32]; 3553 char *name_ptr; 3554 int name_len; 3555 3556 /* FIXME, use a real flag for deciding about the key type */ 3557 if (root->fs_info->tree_root == root) 3558 key_type = BTRFS_DIR_ITEM_KEY; 3559 3560 /* special case for "." */ 3561 if (filp->f_pos == 0) { 3562 over = filldir(dirent, ".", 1, 3563 1, inode->i_ino, 3564 DT_DIR); 3565 if (over) 3566 return 0; 3567 filp->f_pos = 1; 3568 } 3569 /* special case for .., just use the back ref */ 3570 if (filp->f_pos == 1) { 3571 u64 pino = parent_ino(filp->f_path.dentry); 3572 over = filldir(dirent, "..", 2, 3573 2, pino, DT_DIR); 3574 if (over) 3575 return 0; 3576 filp->f_pos = 2; 3577 } 3578 path = btrfs_alloc_path(); 3579 path->reada = 2; 3580 3581 btrfs_set_key_type(&key, key_type); 3582 key.offset = filp->f_pos; 3583 key.objectid = inode->i_ino; 3584 3585 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3586 if (ret < 0) 3587 goto err; 3588 advance = 0; 3589 3590 while (1) { 3591 leaf = path->nodes[0]; 3592 nritems = btrfs_header_nritems(leaf); 3593 slot = path->slots[0]; 3594 if (advance || slot >= nritems) { 3595 if (slot >= nritems - 1) { 3596 ret = btrfs_next_leaf(root, path); 3597 if (ret) 3598 break; 3599 leaf = path->nodes[0]; 3600 nritems = btrfs_header_nritems(leaf); 3601 slot = path->slots[0]; 3602 } else { 3603 slot++; 3604 path->slots[0]++; 3605 } 3606 } 3607 3608 advance = 1; 3609 item = btrfs_item_nr(leaf, slot); 3610 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3611 3612 if (found_key.objectid != key.objectid) 3613 break; 3614 if (btrfs_key_type(&found_key) != key_type) 3615 break; 3616 if (found_key.offset < filp->f_pos) 3617 continue; 3618 3619 filp->f_pos = found_key.offset; 3620 3621 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 3622 di_cur = 0; 3623 di_total = btrfs_item_size(leaf, item); 3624 3625 while (di_cur < di_total) { 3626 struct btrfs_key location; 3627 3628 name_len = btrfs_dir_name_len(leaf, di); 3629 if (name_len <= sizeof(tmp_name)) { 3630 name_ptr = tmp_name; 3631 } else { 3632 name_ptr = kmalloc(name_len, GFP_NOFS); 3633 if (!name_ptr) { 3634 ret = -ENOMEM; 3635 goto err; 3636 } 3637 } 3638 read_extent_buffer(leaf, name_ptr, 3639 (unsigned long)(di + 1), name_len); 3640 3641 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; 3642 btrfs_dir_item_key_to_cpu(leaf, di, &location); 3643 3644 /* is this a reference to our own snapshot? If so 3645 * skip it 3646 */ 3647 if (location.type == BTRFS_ROOT_ITEM_KEY && 3648 location.objectid == root->root_key.objectid) { 3649 over = 0; 3650 goto skip; 3651 } 3652 over = filldir(dirent, name_ptr, name_len, 3653 found_key.offset, location.objectid, 3654 d_type); 3655 3656 skip: 3657 if (name_ptr != tmp_name) 3658 kfree(name_ptr); 3659 3660 if (over) 3661 goto nopos; 3662 di_len = btrfs_dir_name_len(leaf, di) + 3663 btrfs_dir_data_len(leaf, di) + sizeof(*di); 3664 di_cur += di_len; 3665 di = (struct btrfs_dir_item *)((char *)di + di_len); 3666 } 3667 } 3668 3669 /* Reached end of directory/root. Bump pos past the last item. 
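 * DIR_INDEX offsets are sparse u64 sequence numbers with no sensible
 * successor to seek to, so f_pos is parked at the largest off_t to
 * keep a repeated readdir call from walking the whole tree again.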
*/ 3670 if (key_type == BTRFS_DIR_INDEX_KEY) 3671 filp->f_pos = INT_LIMIT(off_t); 3672 else 3673 filp->f_pos++; 3674 nopos: 3675 ret = 0; 3676 err: 3677 btrfs_free_path(path); 3678 return ret; 3679 } 3680 3681 int btrfs_write_inode(struct inode *inode, int wait) 3682 { 3683 struct btrfs_root *root = BTRFS_I(inode)->root; 3684 struct btrfs_trans_handle *trans; 3685 int ret = 0; 3686 3687 if (root->fs_info->btree_inode == inode) 3688 return 0; 3689 3690 if (wait) { 3691 trans = btrfs_join_transaction(root, 1); 3692 btrfs_set_trans_block_group(trans, inode); 3693 ret = btrfs_commit_transaction(trans, root); 3694 } 3695 return ret; 3696 } 3697 3698 /* 3699 * This is somewhat expensive, updating the tree every time the 3700 * inode changes. But, it is most likely to find the inode in cache. 3701 * FIXME, needs more benchmarking...there are no reasons other than performance 3702 * to keep or drop this code. 3703 */ 3704 void btrfs_dirty_inode(struct inode *inode) 3705 { 3706 struct btrfs_root *root = BTRFS_I(inode)->root; 3707 struct btrfs_trans_handle *trans; 3708 3709 trans = btrfs_join_transaction(root, 1); 3710 btrfs_set_trans_block_group(trans, inode); 3711 btrfs_update_inode(trans, root, inode); 3712 btrfs_end_transaction(trans, root); 3713 } 3714 3715 /* 3716 * find the highest existing sequence number in a directory 3717 * and then set the in-memory index_cnt variable to reflect 3718 * free sequence numbers 3719 */ 3720 static int btrfs_set_inode_index_count(struct inode *inode) 3721 { 3722 struct btrfs_root *root = BTRFS_I(inode)->root; 3723 struct btrfs_key key, found_key; 3724 struct btrfs_path *path; 3725 struct extent_buffer *leaf; 3726 int ret; 3727 3728 key.objectid = inode->i_ino; 3729 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 3730 key.offset = (u64)-1; 3731 3732 path = btrfs_alloc_path(); 3733 if (!path) 3734 return -ENOMEM; 3735 3736 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3737 if (ret < 0) 3738 goto out; 3739 /* FIXME: we should be able to handle this */ 3740 if (ret == 0) 3741 goto out; 3742 ret = 0; 3743 3744 /* 3745 * MAGIC NUMBER EXPLANATION: 3746 * since we search a directory based on f_pos we have to start at 2 3747 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody 3748 * else has to start at 2 3749 */ 3750 if (path->slots[0] == 0) { 3751 BTRFS_I(inode)->index_cnt = 2; 3752 goto out; 3753 } 3754 3755 path->slots[0]--; 3756 3757 leaf = path->nodes[0]; 3758 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3759 3760 if (found_key.objectid != inode->i_ino || 3761 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { 3762 BTRFS_I(inode)->index_cnt = 2; 3763 goto out; 3764 } 3765 3766 BTRFS_I(inode)->index_cnt = found_key.offset + 1; 3767 out: 3768 btrfs_free_path(path); 3769 return ret; 3770 } 3771 3772 /* 3773 * helper to find a free sequence number in a given directory. 
This current 3774 * code is very simple, later versions will do smarter things in the btree 3775 */ 3776 int btrfs_set_inode_index(struct inode *dir, u64 *index) 3777 { 3778 int ret = 0; 3779 3780 if (BTRFS_I(dir)->index_cnt == (u64)-1) { 3781 ret = btrfs_set_inode_index_count(dir); 3782 if (ret) 3783 return ret; 3784 } 3785 3786 *index = BTRFS_I(dir)->index_cnt; 3787 BTRFS_I(dir)->index_cnt++; 3788 3789 return ret; 3790 } 3791 3792 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 3793 struct btrfs_root *root, 3794 struct inode *dir, 3795 const char *name, int name_len, 3796 u64 ref_objectid, u64 objectid, 3797 u64 alloc_hint, int mode, u64 *index) 3798 { 3799 struct inode *inode; 3800 struct btrfs_inode_item *inode_item; 3801 struct btrfs_key *location; 3802 struct btrfs_path *path; 3803 struct btrfs_inode_ref *ref; 3804 struct btrfs_key key[2]; 3805 u32 sizes[2]; 3806 unsigned long ptr; 3807 int ret; 3808 int owner; 3809 3810 path = btrfs_alloc_path(); 3811 BUG_ON(!path); 3812 3813 inode = new_inode(root->fs_info->sb); 3814 if (!inode) 3815 return ERR_PTR(-ENOMEM); 3816 3817 if (dir) { 3818 ret = btrfs_set_inode_index(dir, index); 3819 if (ret) { 3820 iput(inode); 3821 return ERR_PTR(ret); 3822 } 3823 } 3824 /* 3825 * index_cnt is ignored for everything but a dir, 3826 * btrfs_set_inode_index_count has an explanation for the magic 3827 * number 3828 */ 3829 init_btrfs_i(inode); 3830 BTRFS_I(inode)->index_cnt = 2; 3831 BTRFS_I(inode)->root = root; 3832 BTRFS_I(inode)->generation = trans->transid; 3833 btrfs_set_inode_space_info(root, inode); 3834 3835 if (mode & S_IFDIR) 3836 owner = 0; 3837 else 3838 owner = 1; 3839 BTRFS_I(inode)->block_group = 3840 btrfs_find_block_group(root, 0, alloc_hint, owner); 3841 3842 key[0].objectid = objectid; 3843 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 3844 key[0].offset = 0; 3845 3846 key[1].objectid = objectid; 3847 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 3848 key[1].offset = ref_objectid; 3849 3850 sizes[0] = sizeof(struct btrfs_inode_item); 3851 sizes[1] = name_len + sizeof(*ref); 3852 3853 path->leave_spinning = 1; 3854 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 3855 if (ret != 0) 3856 goto fail; 3857 3858 inode->i_uid = current_fsuid(); 3859 3860 if (dir && (dir->i_mode & S_ISGID)) { 3861 inode->i_gid = dir->i_gid; 3862 if (S_ISDIR(mode)) 3863 mode |= S_ISGID; 3864 } else 3865 inode->i_gid = current_fsgid(); 3866 3867 inode->i_mode = mode; 3868 inode->i_ino = objectid; 3869 inode_set_bytes(inode, 0); 3870 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 3871 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3872 struct btrfs_inode_item); 3873 fill_inode_item(trans, path->nodes[0], inode_item, inode); 3874 3875 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 3876 struct btrfs_inode_ref); 3877 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 3878 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 3879 ptr = (unsigned long)(ref + 1); 3880 write_extent_buffer(path->nodes[0], name, ptr, name_len); 3881 3882 btrfs_mark_buffer_dirty(path->nodes[0]); 3883 btrfs_free_path(path); 3884 3885 location = &BTRFS_I(inode)->location; 3886 location->objectid = objectid; 3887 location->offset = 0; 3888 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 3889 3890 btrfs_inherit_iflags(inode, dir); 3891 3892 if ((mode & S_IFREG)) { 3893 if (btrfs_test_opt(root, NODATASUM)) 3894 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 3895 if (btrfs_test_opt(root,
NODATACOW))
3896 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
3897 }
3898
3899 insert_inode_hash(inode);
3900 inode_tree_add(inode);
3901 return inode;
3902 fail:
3903 if (dir)
3904 BTRFS_I(dir)->index_cnt--;
3905 btrfs_free_path(path);
3906 iput(inode);
3907 return ERR_PTR(ret);
3908 }
3909
3910 static inline u8 btrfs_inode_type(struct inode *inode)
3911 {
3912 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
3913 }
3914
3915 /*
3916 * utility function to add 'inode' into 'parent_inode' with
3917 * a given name and a given sequence number.
3918 * if 'add_backref' is true, also insert a backref from the
3919 * inode to the parent directory.
3920 */
3921 int btrfs_add_link(struct btrfs_trans_handle *trans,
3922 struct inode *parent_inode, struct inode *inode,
3923 const char *name, int name_len, int add_backref, u64 index)
3924 {
3925 int ret = 0;
3926 struct btrfs_key key;
3927 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
3928
3929 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
3930 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
3931 } else {
3932 key.objectid = inode->i_ino;
3933 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
3934 key.offset = 0;
3935 }
3936
3937 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
3938 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
3939 key.objectid, root->root_key.objectid,
3940 parent_inode->i_ino,
3941 index, name, name_len);
3942 } else if (add_backref) {
3943 ret = btrfs_insert_inode_ref(trans, root,
3944 name, name_len, inode->i_ino,
3945 parent_inode->i_ino, index);
3946 }
3947
3948 if (ret == 0) {
3949 ret = btrfs_insert_dir_item(trans, root, name, name_len,
3950 parent_inode->i_ino, &key,
3951 btrfs_inode_type(inode), index);
3952 BUG_ON(ret);
3953
3954 btrfs_i_size_write(parent_inode, parent_inode->i_size +
3955 name_len * 2);
3956 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
3957 ret = btrfs_update_inode(trans, root, parent_inode);
3958 }
3959 return ret;
3960 }
3961
3962 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
3963 struct dentry *dentry, struct inode *inode,
3964 int backref, u64 index)
3965 {
3966 int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3967 inode, dentry->d_name.name,
3968 dentry->d_name.len, backref, index);
3969 if (!err) {
3970 d_instantiate(dentry, inode);
3971 return 0;
3972 }
3973 if (err > 0)
3974 err = -EEXIST;
3975 return err;
3976 }
3977
3978 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3979 int mode, dev_t rdev)
3980 {
3981 struct btrfs_trans_handle *trans;
3982 struct btrfs_root *root = BTRFS_I(dir)->root;
3983 struct inode *inode = NULL;
3984 int err;
3985 int drop_inode = 0;
3986 u64 objectid;
3987 unsigned long nr = 0;
3988 u64 index = 0;
3989
3990 if (!new_valid_dev(rdev))
3991 return -EINVAL;
3992
3993 err = btrfs_check_metadata_free_space(root);
3994 if (err)
3995 goto fail;
3996
3997 trans = btrfs_start_transaction(root, 1);
3998 btrfs_set_trans_block_group(trans, dir);
3999
4000 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4001 if (err) {
4002 err = -ENOSPC;
4003 goto out_unlock;
4004 }
4005
4006 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4007 dentry->d_name.len,
4008 dentry->d_parent->d_inode->i_ino, objectid,
4009 BTRFS_I(dir)->block_group, mode, &index);
4010 err = PTR_ERR(inode);
4011 if (IS_ERR(inode))
4012 goto out_unlock;
4013
4014 err = btrfs_init_inode_security(inode, dir);
4015 if (err) {
4016 drop_inode = 1;
4017 goto
out_unlock; 4018 } 4019 4020 btrfs_set_trans_block_group(trans, inode); 4021 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 4022 if (err) 4023 drop_inode = 1; 4024 else { 4025 inode->i_op = &btrfs_special_inode_operations; 4026 init_special_inode(inode, inode->i_mode, rdev); 4027 btrfs_update_inode(trans, root, inode); 4028 } 4029 btrfs_update_inode_block_group(trans, inode); 4030 btrfs_update_inode_block_group(trans, dir); 4031 out_unlock: 4032 nr = trans->blocks_used; 4033 btrfs_end_transaction_throttle(trans, root); 4034 fail: 4035 if (drop_inode) { 4036 inode_dec_link_count(inode); 4037 iput(inode); 4038 } 4039 btrfs_btree_balance_dirty(root, nr); 4040 return err; 4041 } 4042 4043 static int btrfs_create(struct inode *dir, struct dentry *dentry, 4044 int mode, struct nameidata *nd) 4045 { 4046 struct btrfs_trans_handle *trans; 4047 struct btrfs_root *root = BTRFS_I(dir)->root; 4048 struct inode *inode = NULL; 4049 int err; 4050 int drop_inode = 0; 4051 unsigned long nr = 0; 4052 u64 objectid; 4053 u64 index = 0; 4054 4055 err = btrfs_check_metadata_free_space(root); 4056 if (err) 4057 goto fail; 4058 trans = btrfs_start_transaction(root, 1); 4059 btrfs_set_trans_block_group(trans, dir); 4060 4061 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4062 if (err) { 4063 err = -ENOSPC; 4064 goto out_unlock; 4065 } 4066 4067 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4068 dentry->d_name.len, 4069 dentry->d_parent->d_inode->i_ino, 4070 objectid, BTRFS_I(dir)->block_group, mode, 4071 &index); 4072 err = PTR_ERR(inode); 4073 if (IS_ERR(inode)) 4074 goto out_unlock; 4075 4076 err = btrfs_init_inode_security(inode, dir); 4077 if (err) { 4078 drop_inode = 1; 4079 goto out_unlock; 4080 } 4081 4082 btrfs_set_trans_block_group(trans, inode); 4083 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 4084 if (err) 4085 drop_inode = 1; 4086 else { 4087 inode->i_mapping->a_ops = &btrfs_aops; 4088 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 4089 inode->i_fop = &btrfs_file_operations; 4090 inode->i_op = &btrfs_file_inode_operations; 4091 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4092 } 4093 btrfs_update_inode_block_group(trans, inode); 4094 btrfs_update_inode_block_group(trans, dir); 4095 out_unlock: 4096 nr = trans->blocks_used; 4097 btrfs_end_transaction_throttle(trans, root); 4098 fail: 4099 if (drop_inode) { 4100 inode_dec_link_count(inode); 4101 iput(inode); 4102 } 4103 btrfs_btree_balance_dirty(root, nr); 4104 return err; 4105 } 4106 4107 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 4108 struct dentry *dentry) 4109 { 4110 struct btrfs_trans_handle *trans; 4111 struct btrfs_root *root = BTRFS_I(dir)->root; 4112 struct inode *inode = old_dentry->d_inode; 4113 u64 index; 4114 unsigned long nr = 0; 4115 int err; 4116 int drop_inode = 0; 4117 4118 if (inode->i_nlink == 0) 4119 return -ENOENT; 4120 4121 btrfs_inc_nlink(inode); 4122 err = btrfs_check_metadata_free_space(root); 4123 if (err) 4124 goto fail; 4125 err = btrfs_set_inode_index(dir, &index); 4126 if (err) 4127 goto fail; 4128 4129 trans = btrfs_start_transaction(root, 1); 4130 4131 btrfs_set_trans_block_group(trans, dir); 4132 atomic_inc(&inode->i_count); 4133 4134 err = btrfs_add_nondir(trans, dentry, inode, 1, index); 4135 4136 if (err) { 4137 drop_inode = 1; 4138 } else { 4139 btrfs_update_inode_block_group(trans, dir); 4140 err = btrfs_update_inode(trans, root, inode); 4141 BUG_ON(err); 4142 btrfs_log_new_name(trans, inode, NULL, 
dentry->d_parent);
4143 }
4144
4145 nr = trans->blocks_used;
4146 btrfs_end_transaction_throttle(trans, root);
4147 fail:
4148 if (drop_inode) {
4149 inode_dec_link_count(inode);
4150 iput(inode);
4151 }
4152 btrfs_btree_balance_dirty(root, nr);
4153 return err;
4154 }
4155
4156 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4157 {
4158 struct inode *inode = NULL;
4159 struct btrfs_trans_handle *trans;
4160 struct btrfs_root *root = BTRFS_I(dir)->root;
4161 int err = 0;
4162 int drop_on_err = 0;
4163 u64 objectid = 0;
4164 u64 index = 0;
4165 unsigned long nr = 1;
4166
4167 err = btrfs_check_metadata_free_space(root);
4168 if (err)
4169 goto out_unlock;
4170
4171 trans = btrfs_start_transaction(root, 1);
4172
4173 if (IS_ERR(trans)) {
4174 err = PTR_ERR(trans);
4175 goto out_unlock;
4176 }
4177 btrfs_set_trans_block_group(trans, dir);
4178
4179 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4180 if (err) {
4181 err = -ENOSPC;
4182 goto out_fail;
4183 }
4184
4185 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4186 dentry->d_name.len,
4187 dentry->d_parent->d_inode->i_ino, objectid,
4188 BTRFS_I(dir)->block_group, S_IFDIR | mode,
4189 &index);
4190 if (IS_ERR(inode)) {
4191 err = PTR_ERR(inode);
4192 goto out_fail;
4193 }
4194
4195 drop_on_err = 1;
4196
4197 err = btrfs_init_inode_security(inode, dir);
4198 if (err)
4199 goto out_fail;
4200
4201 inode->i_op = &btrfs_dir_inode_operations;
4202 inode->i_fop = &btrfs_dir_file_operations;
4203 btrfs_set_trans_block_group(trans, inode);
4204
4205 btrfs_i_size_write(inode, 0);
4206 err = btrfs_update_inode(trans, root, inode);
4207 if (err)
4208 goto out_fail;
4209
4210 err = btrfs_add_link(trans, dentry->d_parent->d_inode,
4211 inode, dentry->d_name.name,
4212 dentry->d_name.len, 0, index);
4213 if (err)
4214 goto out_fail;
4215
4216 d_instantiate(dentry, inode);
4217 drop_on_err = 0;
4218 btrfs_update_inode_block_group(trans, inode);
4219 btrfs_update_inode_block_group(trans, dir);
4220
4221 out_fail:
4222 nr = trans->blocks_used;
4223 btrfs_end_transaction_throttle(trans, root);
4224
4225 out_unlock:
4226 if (drop_on_err)
4227 iput(inode);
4228 btrfs_btree_balance_dirty(root, nr);
4229 return err;
4230 }
4231
4232 /* helper for btrfs_get_extent. Given an existing extent in the tree,
4233 * and an extent that you want to insert, deal with overlap and insert
4234 * the new extent into the tree.
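 *
 * a worked example (illustrative numbers only): if the new em covers
 * file range [0, 16k) but the tree already holds an extent over
 * [0, 8k), the caller passes map_start = 8k and map_len = 8k, and this
 * helper clips the new em to [8k, 16k), advancing em->block_start by
 * start_diff = 8k so the disk mapping still lines up before the
 * insert is retried.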
4235 */ 4236 static int merge_extent_mapping(struct extent_map_tree *em_tree, 4237 struct extent_map *existing, 4238 struct extent_map *em, 4239 u64 map_start, u64 map_len) 4240 { 4241 u64 start_diff; 4242 4243 BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); 4244 start_diff = map_start - em->start; 4245 em->start = map_start; 4246 em->len = map_len; 4247 if (em->block_start < EXTENT_MAP_LAST_BYTE && 4248 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 4249 em->block_start += start_diff; 4250 em->block_len -= start_diff; 4251 } 4252 return add_extent_mapping(em_tree, em); 4253 } 4254 4255 static noinline int uncompress_inline(struct btrfs_path *path, 4256 struct inode *inode, struct page *page, 4257 size_t pg_offset, u64 extent_offset, 4258 struct btrfs_file_extent_item *item) 4259 { 4260 int ret; 4261 struct extent_buffer *leaf = path->nodes[0]; 4262 char *tmp; 4263 size_t max_size; 4264 unsigned long inline_size; 4265 unsigned long ptr; 4266 4267 WARN_ON(pg_offset != 0); 4268 max_size = btrfs_file_extent_ram_bytes(leaf, item); 4269 inline_size = btrfs_file_extent_inline_item_len(leaf, 4270 btrfs_item_nr(leaf, path->slots[0])); 4271 tmp = kmalloc(inline_size, GFP_NOFS); 4272 ptr = btrfs_file_extent_inline_start(item); 4273 4274 read_extent_buffer(leaf, tmp, ptr, inline_size); 4275 4276 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 4277 ret = btrfs_zlib_decompress(tmp, page, extent_offset, 4278 inline_size, max_size); 4279 if (ret) { 4280 char *kaddr = kmap_atomic(page, KM_USER0); 4281 unsigned long copy_size = min_t(u64, 4282 PAGE_CACHE_SIZE - pg_offset, 4283 max_size - extent_offset); 4284 memset(kaddr + pg_offset, 0, copy_size); 4285 kunmap_atomic(kaddr, KM_USER0); 4286 } 4287 kfree(tmp); 4288 return 0; 4289 } 4290 4291 /* 4292 * a bit scary, this does extent mapping from logical file offset to the disk. 4293 * the ugly parts come from merging extents from the disk with the in-ram 4294 * representation. This gets more complex because of the data=ordered code, 4295 * where the in-ram extents might be locked pending data=ordered completion. 4296 * 4297 * This also copies inline extents directly into the page. 
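 *
 * a minimal sketch of a caller (this mirrors the lookup that
 * btrfs_fallocate() does later in this file; error handling elided):
 *
 *	struct extent_map *em;
 *	int hole;
 *
 *	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	hole = em->block_start == EXTENT_MAP_HOLE;
 *	free_extent_map(em);
 *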
4298 */ 4299 4300 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 4301 size_t pg_offset, u64 start, u64 len, 4302 int create) 4303 { 4304 int ret; 4305 int err = 0; 4306 u64 bytenr; 4307 u64 extent_start = 0; 4308 u64 extent_end = 0; 4309 u64 objectid = inode->i_ino; 4310 u32 found_type; 4311 struct btrfs_path *path = NULL; 4312 struct btrfs_root *root = BTRFS_I(inode)->root; 4313 struct btrfs_file_extent_item *item; 4314 struct extent_buffer *leaf; 4315 struct btrfs_key found_key; 4316 struct extent_map *em = NULL; 4317 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4318 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4319 struct btrfs_trans_handle *trans = NULL; 4320 int compressed; 4321 4322 again: 4323 read_lock(&em_tree->lock); 4324 em = lookup_extent_mapping(em_tree, start, len); 4325 if (em) 4326 em->bdev = root->fs_info->fs_devices->latest_bdev; 4327 read_unlock(&em_tree->lock); 4328 4329 if (em) { 4330 if (em->start > start || em->start + em->len <= start) 4331 free_extent_map(em); 4332 else if (em->block_start == EXTENT_MAP_INLINE && page) 4333 free_extent_map(em); 4334 else 4335 goto out; 4336 } 4337 em = alloc_extent_map(GFP_NOFS); 4338 if (!em) { 4339 err = -ENOMEM; 4340 goto out; 4341 } 4342 em->bdev = root->fs_info->fs_devices->latest_bdev; 4343 em->start = EXTENT_MAP_HOLE; 4344 em->orig_start = EXTENT_MAP_HOLE; 4345 em->len = (u64)-1; 4346 em->block_len = (u64)-1; 4347 4348 if (!path) { 4349 path = btrfs_alloc_path(); 4350 BUG_ON(!path); 4351 } 4352 4353 ret = btrfs_lookup_file_extent(trans, root, path, 4354 objectid, start, trans != NULL); 4355 if (ret < 0) { 4356 err = ret; 4357 goto out; 4358 } 4359 4360 if (ret != 0) { 4361 if (path->slots[0] == 0) 4362 goto not_found; 4363 path->slots[0]--; 4364 } 4365 4366 leaf = path->nodes[0]; 4367 item = btrfs_item_ptr(leaf, path->slots[0], 4368 struct btrfs_file_extent_item); 4369 /* are we inside the extent that was found? 
*/ 4370 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4371 found_type = btrfs_key_type(&found_key); 4372 if (found_key.objectid != objectid || 4373 found_type != BTRFS_EXTENT_DATA_KEY) { 4374 goto not_found; 4375 } 4376 4377 found_type = btrfs_file_extent_type(leaf, item); 4378 extent_start = found_key.offset; 4379 compressed = btrfs_file_extent_compression(leaf, item); 4380 if (found_type == BTRFS_FILE_EXTENT_REG || 4381 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 4382 extent_end = extent_start + 4383 btrfs_file_extent_num_bytes(leaf, item); 4384 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 4385 size_t size; 4386 size = btrfs_file_extent_inline_len(leaf, item); 4387 extent_end = (extent_start + size + root->sectorsize - 1) & 4388 ~((u64)root->sectorsize - 1); 4389 } 4390 4391 if (start >= extent_end) { 4392 path->slots[0]++; 4393 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 4394 ret = btrfs_next_leaf(root, path); 4395 if (ret < 0) { 4396 err = ret; 4397 goto out; 4398 } 4399 if (ret > 0) 4400 goto not_found; 4401 leaf = path->nodes[0]; 4402 } 4403 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4404 if (found_key.objectid != objectid || 4405 found_key.type != BTRFS_EXTENT_DATA_KEY) 4406 goto not_found; 4407 if (start + len <= found_key.offset) 4408 goto not_found; 4409 em->start = start; 4410 em->len = found_key.offset - start; 4411 goto not_found_em; 4412 } 4413 4414 if (found_type == BTRFS_FILE_EXTENT_REG || 4415 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 4416 em->start = extent_start; 4417 em->len = extent_end - extent_start; 4418 em->orig_start = extent_start - 4419 btrfs_file_extent_offset(leaf, item); 4420 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 4421 if (bytenr == 0) { 4422 em->block_start = EXTENT_MAP_HOLE; 4423 goto insert; 4424 } 4425 if (compressed) { 4426 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 4427 em->block_start = bytenr; 4428 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 4429 item); 4430 } else { 4431 bytenr += btrfs_file_extent_offset(leaf, item); 4432 em->block_start = bytenr; 4433 em->block_len = em->len; 4434 if (found_type == BTRFS_FILE_EXTENT_PREALLOC) 4435 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 4436 } 4437 goto insert; 4438 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 4439 unsigned long ptr; 4440 char *map; 4441 size_t size; 4442 size_t extent_offset; 4443 size_t copy_size; 4444 4445 em->block_start = EXTENT_MAP_INLINE; 4446 if (!page || create) { 4447 em->start = extent_start; 4448 em->len = extent_end - extent_start; 4449 goto out; 4450 } 4451 4452 size = btrfs_file_extent_inline_len(leaf, item); 4453 extent_offset = page_offset(page) + pg_offset - extent_start; 4454 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 4455 size - extent_offset); 4456 em->start = extent_start + extent_offset; 4457 em->len = (copy_size + root->sectorsize - 1) & 4458 ~((u64)root->sectorsize - 1); 4459 em->orig_start = EXTENT_MAP_INLINE; 4460 if (compressed) 4461 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 4462 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 4463 if (create == 0 && !PageUptodate(page)) { 4464 if (btrfs_file_extent_compression(leaf, item) == 4465 BTRFS_COMPRESS_ZLIB) { 4466 ret = uncompress_inline(path, inode, page, 4467 pg_offset, 4468 extent_offset, item); 4469 BUG_ON(ret); 4470 } else { 4471 map = kmap(page); 4472 read_extent_buffer(leaf, map + pg_offset, ptr, 4473 copy_size); 4474 if (pg_offset + copy_size < PAGE_CACHE_SIZE) { 4475 memset(map + pg_offset + copy_size, 0, 4476 
PAGE_CACHE_SIZE - pg_offset -
4477 copy_size);
4478 }
4479 kunmap(page);
4480 }
4481 flush_dcache_page(page);
4482 } else if (create && PageUptodate(page)) {
4483 if (!trans) {
4485 free_extent_map(em);
4486 em = NULL;
4487 btrfs_release_path(root, path);
4488 trans = btrfs_join_transaction(root, 1);
4489 goto again;
4490 }
4491 map = kmap(page);
4492 write_extent_buffer(leaf, map + pg_offset, ptr,
4493 copy_size);
4494 kunmap(page);
4495 btrfs_mark_buffer_dirty(leaf);
4496 }
4497 set_extent_uptodate(io_tree, em->start,
4498 extent_map_end(em) - 1, GFP_NOFS);
4499 goto insert;
4500 } else {
4501 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
4502 WARN_ON(1);
4503 }
4504 not_found:
4505 em->start = start;
4506 em->len = len;
4507 not_found_em:
4508 em->block_start = EXTENT_MAP_HOLE;
4509 set_bit(EXTENT_FLAG_VACANCY, &em->flags);
4510 insert:
4511 btrfs_release_path(root, path);
4512 if (em->start > start || extent_map_end(em) <= start) {
4513 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
4514 "[%llu %llu]\n", (unsigned long long)em->start,
4515 (unsigned long long)em->len,
4516 (unsigned long long)start,
4517 (unsigned long long)len);
4518 err = -EIO;
4519 goto out;
4520 }
4521
4522 err = 0;
4523 write_lock(&em_tree->lock);
4524 ret = add_extent_mapping(em_tree, em);
4525 /* it is possible that someone inserted the extent into the tree
4526 * while we had the lock dropped. It is also possible that
4527 * an overlapping map exists in the tree
4528 */
4529 if (ret == -EEXIST) {
4530 struct extent_map *existing;
4531
4532 ret = 0;
4533
4534 existing = lookup_extent_mapping(em_tree, start, len);
4535 if (existing && (existing->start > start ||
4536 existing->start + existing->len <= start)) {
4537 free_extent_map(existing);
4538 existing = NULL;
4539 }
4540 if (!existing) {
4541 existing = lookup_extent_mapping(em_tree, em->start,
4542 em->len);
4543 if (existing) {
4544 err = merge_extent_mapping(em_tree, existing,
4545 em, start,
4546 root->sectorsize);
4547 free_extent_map(existing);
4548 if (err) {
4549 free_extent_map(em);
4550 em = NULL;
4551 }
4552 } else {
4553 err = -EIO;
4554 free_extent_map(em);
4555 em = NULL;
4556 }
4557 } else {
4558 free_extent_map(em);
4559 em = existing;
4560 err = 0;
4561 }
4562 }
4563 write_unlock(&em_tree->lock);
4564 out:
4565 if (path)
4566 btrfs_free_path(path);
4567 if (trans) {
4568 ret = btrfs_end_transaction(trans, root);
4569 if (!err)
4570 err = ret;
4571 }
4572 if (err) {
4573 free_extent_map(em);
4574 return ERR_PTR(err);
4575 }
4576 return em;
4577 }
4578
4579 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4580 const struct iovec *iov, loff_t offset,
4581 unsigned long nr_segs)
4582 {
4583 return -EINVAL;
4584 }
4585
4586 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4587 __u64 start, __u64 len)
4588 {
4589 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
4590 }
4591
4592 int btrfs_readpage(struct file *file, struct page *page)
4593 {
4594 struct extent_io_tree *tree;
4595 tree = &BTRFS_I(page->mapping->host)->io_tree;
4596 return extent_read_full_page(tree, page, btrfs_get_extent);
4597 }
4598
4599 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
4600 {
4601 struct extent_io_tree *tree;
4602
4604 if (current->flags & PF_MEMALLOC) {
4605 redirty_page_for_writepage(wbc, page);
4606 unlock_page(page);
4607 return 0;
4608 }
4609 tree = &BTRFS_I(page->mapping->host)->io_tree;
4610 return
extent_write_full_page(tree, page, btrfs_get_extent, wbc); 4611 } 4612 4613 int btrfs_writepages(struct address_space *mapping, 4614 struct writeback_control *wbc) 4615 { 4616 struct extent_io_tree *tree; 4617 4618 tree = &BTRFS_I(mapping->host)->io_tree; 4619 return extent_writepages(tree, mapping, btrfs_get_extent, wbc); 4620 } 4621 4622 static int 4623 btrfs_readpages(struct file *file, struct address_space *mapping, 4624 struct list_head *pages, unsigned nr_pages) 4625 { 4626 struct extent_io_tree *tree; 4627 tree = &BTRFS_I(mapping->host)->io_tree; 4628 return extent_readpages(tree, mapping, pages, nr_pages, 4629 btrfs_get_extent); 4630 } 4631 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) 4632 { 4633 struct extent_io_tree *tree; 4634 struct extent_map_tree *map; 4635 int ret; 4636 4637 tree = &BTRFS_I(page->mapping->host)->io_tree; 4638 map = &BTRFS_I(page->mapping->host)->extent_tree; 4639 ret = try_release_extent_mapping(map, tree, page, gfp_flags); 4640 if (ret == 1) { 4641 ClearPagePrivate(page); 4642 set_page_private(page, 0); 4643 page_cache_release(page); 4644 } 4645 return ret; 4646 } 4647 4648 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) 4649 { 4650 if (PageWriteback(page) || PageDirty(page)) 4651 return 0; 4652 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); 4653 } 4654 4655 static void btrfs_invalidatepage(struct page *page, unsigned long offset) 4656 { 4657 struct extent_io_tree *tree; 4658 struct btrfs_ordered_extent *ordered; 4659 u64 page_start = page_offset(page); 4660 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 4661 4662 4663 /* 4664 * we have the page locked, so new writeback can't start, 4665 * and the dirty bit won't be cleared while we are here. 4666 * 4667 * Wait for IO on this page so that we can safely clear 4668 * the PagePrivate2 bit and do ordered accounting 4669 */ 4670 wait_on_page_writeback(page); 4671 4672 tree = &BTRFS_I(page->mapping->host)->io_tree; 4673 if (offset) { 4674 btrfs_releasepage(page, GFP_NOFS); 4675 return; 4676 } 4677 lock_extent(tree, page_start, page_end, GFP_NOFS); 4678 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 4679 page_offset(page)); 4680 if (ordered) { 4681 /* 4682 * IO on this page will never be started, so we need 4683 * to account for any ordered extents now 4684 */ 4685 clear_extent_bit(tree, page_start, page_end, 4686 EXTENT_DIRTY | EXTENT_DELALLOC | 4687 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 4688 /* 4689 * whoever cleared the private bit is responsible 4690 * for the finish_ordered_io 4691 */ 4692 if (TestClearPagePrivate2(page)) { 4693 btrfs_finish_ordered_io(page->mapping->host, 4694 page_start, page_end); 4695 } 4696 btrfs_put_ordered_extent(ordered); 4697 lock_extent(tree, page_start, page_end, GFP_NOFS); 4698 } 4699 clear_extent_bit(tree, page_start, page_end, 4700 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 4701 1, 1, NULL, GFP_NOFS); 4702 __btrfs_releasepage(page, GFP_NOFS); 4703 4704 ClearPageChecked(page); 4705 if (PagePrivate(page)) { 4706 ClearPagePrivate(page); 4707 set_page_private(page, 0); 4708 page_cache_release(page); 4709 } 4710 } 4711 4712 /* 4713 * btrfs_page_mkwrite() is not allowed to change the file size as it gets 4714 * called from a page fault handler when a page is first dirtied. Hence we must 4715 * be careful to check for EOF conditions here. 
We set the page up correctly 4716 * for a written page which means we get ENOSPC checking when writing into 4717 * holes and correct delalloc and unwritten extent mapping on filesystems that 4718 * support these features. 4719 * 4720 * We are not allowed to take the i_mutex here so we have to play games to 4721 * protect against truncate races as the page could now be beyond EOF. Because 4722 * vmtruncate() writes the inode size before removing pages, once we have the 4723 * page lock we can determine safely if the page is beyond EOF. If it is not 4724 * beyond EOF, then the page is guaranteed safe against truncation until we 4725 * unlock the page. 4726 */ 4727 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 4728 { 4729 struct page *page = vmf->page; 4730 struct inode *inode = fdentry(vma->vm_file)->d_inode; 4731 struct btrfs_root *root = BTRFS_I(inode)->root; 4732 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4733 struct btrfs_ordered_extent *ordered; 4734 char *kaddr; 4735 unsigned long zero_start; 4736 loff_t size; 4737 int ret; 4738 u64 page_start; 4739 u64 page_end; 4740 4741 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 4742 if (ret) { 4743 if (ret == -ENOMEM) 4744 ret = VM_FAULT_OOM; 4745 else /* -ENOSPC, -EIO, etc */ 4746 ret = VM_FAULT_SIGBUS; 4747 goto out; 4748 } 4749 4750 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 4751 again: 4752 lock_page(page); 4753 size = i_size_read(inode); 4754 page_start = page_offset(page); 4755 page_end = page_start + PAGE_CACHE_SIZE - 1; 4756 4757 if ((page->mapping != inode->i_mapping) || 4758 (page_start >= size)) { 4759 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 4760 /* page got truncated out from underneath us */ 4761 goto out_unlock; 4762 } 4763 wait_on_page_writeback(page); 4764 4765 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 4766 set_page_extent_mapped(page); 4767 4768 /* 4769 * we can't set the delalloc bits if there are pending ordered 4770 * extents. 
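 *
 * (an ordered extent here means writeback is already in flight for
 * part of this range, and re-marking the range delalloc underneath
 * that IO would confuse the ordered accounting).
 *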
Drop our locks and wait for them to finish 4771 */ 4772 ordered = btrfs_lookup_ordered_extent(inode, page_start); 4773 if (ordered) { 4774 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4775 unlock_page(page); 4776 btrfs_start_ordered_extent(inode, ordered, 1); 4777 btrfs_put_ordered_extent(ordered); 4778 goto again; 4779 } 4780 4781 btrfs_set_extent_delalloc(inode, page_start, page_end); 4782 ret = 0; 4783 4784 /* page is wholly or partially inside EOF */ 4785 if (page_start + PAGE_CACHE_SIZE > size) 4786 zero_start = size & ~PAGE_CACHE_MASK; 4787 else 4788 zero_start = PAGE_CACHE_SIZE; 4789 4790 if (zero_start != PAGE_CACHE_SIZE) { 4791 kaddr = kmap(page); 4792 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); 4793 flush_dcache_page(page); 4794 kunmap(page); 4795 } 4796 ClearPageChecked(page); 4797 set_page_dirty(page); 4798 SetPageUptodate(page); 4799 4800 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 4801 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4802 4803 out_unlock: 4804 if (!ret) 4805 return VM_FAULT_LOCKED; 4806 unlock_page(page); 4807 out: 4808 return ret; 4809 } 4810 4811 static void btrfs_truncate(struct inode *inode) 4812 { 4813 struct btrfs_root *root = BTRFS_I(inode)->root; 4814 int ret; 4815 struct btrfs_trans_handle *trans; 4816 unsigned long nr; 4817 u64 mask = root->sectorsize - 1; 4818 4819 if (!S_ISREG(inode->i_mode)) 4820 return; 4821 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 4822 return; 4823 4824 btrfs_truncate_page(inode->i_mapping, inode->i_size); 4825 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 4826 4827 trans = btrfs_start_transaction(root, 1); 4828 4829 /* 4830 * setattr is responsible for setting the ordered_data_close flag, 4831 * but that is only tested during the last file release. That 4832 * could happen well after the next commit, leaving a great big 4833 * window where new writes may get lost if someone chooses to write 4834 * to this file after truncating to zero 4835 * 4836 * The inode doesn't have any dirty data here, and so if we commit 4837 * this is a noop. If someone immediately starts writing to the inode 4838 * it is very likely we'll catch some of their writes in this 4839 * transaction, and the commit will find this file on the ordered 4840 * data list with good things to send down. 4841 * 4842 * This is a best effort solution, there is still a window where 4843 * using truncate to replace the contents of the file will 4844 * end up with a zero length file after a crash. 4845 */ 4846 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) 4847 btrfs_add_ordered_operation(trans, root, inode); 4848 4849 btrfs_set_trans_block_group(trans, inode); 4850 btrfs_i_size_write(inode, inode->i_size); 4851 4852 ret = btrfs_orphan_add(trans, inode); 4853 if (ret) 4854 goto out; 4855 /* FIXME, add redo link to tree so we don't leak on crash */ 4856 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 4857 BTRFS_EXTENT_DATA_KEY); 4858 btrfs_update_inode(trans, root, inode); 4859 4860 ret = btrfs_orphan_del(trans, inode); 4861 BUG_ON(ret); 4862 4863 out: 4864 nr = trans->blocks_used; 4865 ret = btrfs_end_transaction_throttle(trans, root); 4866 BUG_ON(ret); 4867 btrfs_btree_balance_dirty(root, nr); 4868 } 4869 4870 /* 4871 * create a new subvolume directory/inode (helper for the ioctl). 
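 *
 * sketch of the expected call site (hypothetical, simplified; the real
 * ioctl caller also creates the root item and the directory entry for
 * the new subvolume):
 *
 *	ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
 *				       alloc_hint);
 *	if (ret)
 *		return ret;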
4872 */ 4873 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 4874 struct btrfs_root *new_root, 4875 u64 new_dirid, u64 alloc_hint) 4876 { 4877 struct inode *inode; 4878 int err; 4879 u64 index = 0; 4880 4881 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, 4882 new_dirid, alloc_hint, S_IFDIR | 0700, &index); 4883 if (IS_ERR(inode)) 4884 return PTR_ERR(inode); 4885 inode->i_op = &btrfs_dir_inode_operations; 4886 inode->i_fop = &btrfs_dir_file_operations; 4887 4888 inode->i_nlink = 1; 4889 btrfs_i_size_write(inode, 0); 4890 4891 err = btrfs_update_inode(trans, new_root, inode); 4892 BUG_ON(err); 4893 4894 iput(inode); 4895 return 0; 4896 } 4897 4898 /* helper function for file defrag and space balancing. This 4899 * forces readahead on a given range of bytes in an inode 4900 */ 4901 unsigned long btrfs_force_ra(struct address_space *mapping, 4902 struct file_ra_state *ra, struct file *file, 4903 pgoff_t offset, pgoff_t last_index) 4904 { 4905 pgoff_t req_size = last_index - offset + 1; 4906 4907 page_cache_sync_readahead(mapping, ra, file, offset, req_size); 4908 return offset + req_size; 4909 } 4910 4911 struct inode *btrfs_alloc_inode(struct super_block *sb) 4912 { 4913 struct btrfs_inode *ei; 4914 4915 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 4916 if (!ei) 4917 return NULL; 4918 ei->last_trans = 0; 4919 ei->logged_trans = 0; 4920 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 4921 INIT_LIST_HEAD(&ei->i_orphan); 4922 INIT_LIST_HEAD(&ei->ordered_operations); 4923 return &ei->vfs_inode; 4924 } 4925 4926 void btrfs_destroy_inode(struct inode *inode) 4927 { 4928 struct btrfs_ordered_extent *ordered; 4929 struct btrfs_root *root = BTRFS_I(inode)->root; 4930 4931 WARN_ON(!list_empty(&inode->i_dentry)); 4932 WARN_ON(inode->i_data.nrpages); 4933 4934 /* 4935 * Make sure we're properly removed from the ordered operation 4936 * lists. 
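 *
 * (the smp_mb() below lets us test the list head without taking
 * ordered_extent_lock first; the lock is only taken when the inode
 * still looks like it is on a list).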
4937 */
4938 smp_mb();
4939 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
4940 spin_lock(&root->fs_info->ordered_extent_lock);
4941 list_del_init(&BTRFS_I(inode)->ordered_operations);
4942 spin_unlock(&root->fs_info->ordered_extent_lock);
4943 }
4944
4945 spin_lock(&root->list_lock);
4946 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4947 printk(KERN_ERR "BTRFS: inode %lu still on the orphan"
4948 " list\n", inode->i_ino);
4949 dump_stack();
4950 }
4951 spin_unlock(&root->list_lock);
4952
4953 while (1) {
4954 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
4955 if (!ordered)
4956 break;
4957 else {
4958 printk(KERN_ERR "btrfs found ordered "
4959 "extent %llu %llu on inode cleanup\n",
4960 (unsigned long long)ordered->file_offset,
4961 (unsigned long long)ordered->len);
4962 btrfs_remove_ordered_extent(inode, ordered);
4963 btrfs_put_ordered_extent(ordered); /* for our lookup ref */
4964 btrfs_put_ordered_extent(ordered); /* for the tree's ref */
4965 }
4966 }
4967 inode_tree_del(inode);
4968 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4969 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4970 }
4971
4972 void btrfs_drop_inode(struct inode *inode)
4973 {
4974 struct btrfs_root *root = BTRFS_I(inode)->root;
4975
4976 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
4977 generic_delete_inode(inode);
4978 else
4979 generic_drop_inode(inode);
4980 }
4981
4982 static void init_once(void *foo)
4983 {
4984 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
4985
4986 inode_init_once(&ei->vfs_inode);
4987 }
4988
4989 void btrfs_destroy_cachep(void)
4990 {
4991 if (btrfs_inode_cachep)
4992 kmem_cache_destroy(btrfs_inode_cachep);
4993 if (btrfs_trans_handle_cachep)
4994 kmem_cache_destroy(btrfs_trans_handle_cachep);
4995 if (btrfs_transaction_cachep)
4996 kmem_cache_destroy(btrfs_transaction_cachep);
4997 if (btrfs_path_cachep)
4998 kmem_cache_destroy(btrfs_path_cachep);
4999 }
5000
5001 int btrfs_init_cachep(void)
5002 {
5003 btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
5004 sizeof(struct btrfs_inode), 0,
5005 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
5006 if (!btrfs_inode_cachep)
5007 goto fail;
5008
5009 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
5010 sizeof(struct btrfs_trans_handle), 0,
5011 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
5012 if (!btrfs_trans_handle_cachep)
5013 goto fail;
5014
5015 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
5016 sizeof(struct btrfs_transaction), 0,
5017 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
5018 if (!btrfs_transaction_cachep)
5019 goto fail;
5020
5021 btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
5022 sizeof(struct btrfs_path), 0,
5023 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
5024 if (!btrfs_path_cachep)
5025 goto fail;
5026
5027 return 0;
5028 fail:
5029 btrfs_destroy_cachep();
5030 return -ENOMEM;
5031 }
5032
5033 static int btrfs_getattr(struct vfsmount *mnt,
5034 struct dentry *dentry, struct kstat *stat)
5035 {
5036 struct inode *inode = dentry->d_inode;
5037 generic_fillattr(inode, stat);
5038 stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
5039 stat->blksize = PAGE_CACHE_SIZE;
5040 stat->blocks = (inode_get_bytes(inode) +
5041 BTRFS_I(inode)->delalloc_bytes) >> 9;
5042 return 0;
5043 }
5044
5045 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5046 struct inode *new_dir, struct dentry *new_dentry)
5047 {
5048 struct btrfs_trans_handle *trans;
5049 struct btrfs_root *root = BTRFS_I(old_dir)->root;
5050 struct
btrfs_root *dest = BTRFS_I(new_dir)->root; 5051 struct inode *new_inode = new_dentry->d_inode; 5052 struct inode *old_inode = old_dentry->d_inode; 5053 struct timespec ctime = CURRENT_TIME; 5054 u64 index = 0; 5055 u64 root_objectid; 5056 int ret; 5057 5058 if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 5059 return -EPERM; 5060 5061 /* we only allow rename subvolume link between subvolumes */ 5062 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 5063 return -EXDEV; 5064 5065 if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || 5066 (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) 5067 return -ENOTEMPTY; 5068 5069 if (S_ISDIR(old_inode->i_mode) && new_inode && 5070 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 5071 return -ENOTEMPTY; 5072 5073 ret = btrfs_check_metadata_free_space(root); 5074 if (ret) 5075 return ret; 5076 5077 /* 5078 * we're using rename to replace one file with another. 5079 * and the replacement file is large. Start IO on it now so 5080 * we don't add too much work to the end of the transaction 5081 */ 5082 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && 5083 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 5084 filemap_flush(old_inode->i_mapping); 5085 5086 /* close the racy window with snapshot create/destroy ioctl */ 5087 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 5088 down_read(&root->fs_info->subvol_sem); 5089 5090 trans = btrfs_start_transaction(root, 1); 5091 btrfs_set_trans_block_group(trans, new_dir); 5092 5093 if (dest != root) 5094 btrfs_record_root_in_trans(trans, dest); 5095 5096 ret = btrfs_set_inode_index(new_dir, &index); 5097 if (ret) 5098 goto out_fail; 5099 5100 if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 5101 /* force full log commit if subvolume involved. */ 5102 root->fs_info->last_trans_log_full_commit = trans->transid; 5103 } else { 5104 ret = btrfs_insert_inode_ref(trans, dest, 5105 new_dentry->d_name.name, 5106 new_dentry->d_name.len, 5107 old_inode->i_ino, 5108 new_dir->i_ino, index); 5109 if (ret) 5110 goto out_fail; 5111 /* 5112 * this is an ugly little race, but the rename is required 5113 * to make sure that if we crash, the inode is either at the 5114 * old name or the new one. pinning the log transaction lets 5115 * us make sure we don't allow a log commit to come in after 5116 * we unlink the name but before we add the new name back in. 5117 */ 5118 btrfs_pin_log_trans(root); 5119 } 5120 /* 5121 * make sure the inode gets flushed if it is replacing 5122 * something. 
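 *
 * (this pairs with the filemap_flush() earlier in this function:
 * putting old_inode on the ordered operations list makes the
 * transaction commit finish that IO before the rename is final).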
5123 */ 5124 if (new_inode && new_inode->i_size && 5125 old_inode && S_ISREG(old_inode->i_mode)) { 5126 btrfs_add_ordered_operation(trans, root, old_inode); 5127 } 5128 5129 old_dir->i_ctime = old_dir->i_mtime = ctime; 5130 new_dir->i_ctime = new_dir->i_mtime = ctime; 5131 old_inode->i_ctime = ctime; 5132 5133 if (old_dentry->d_parent != new_dentry->d_parent) 5134 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 5135 5136 if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 5137 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; 5138 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, 5139 old_dentry->d_name.name, 5140 old_dentry->d_name.len); 5141 } else { 5142 btrfs_inc_nlink(old_dentry->d_inode); 5143 ret = btrfs_unlink_inode(trans, root, old_dir, 5144 old_dentry->d_inode, 5145 old_dentry->d_name.name, 5146 old_dentry->d_name.len); 5147 } 5148 BUG_ON(ret); 5149 5150 if (new_inode) { 5151 new_inode->i_ctime = CURRENT_TIME; 5152 if (unlikely(new_inode->i_ino == 5153 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 5154 root_objectid = BTRFS_I(new_inode)->location.objectid; 5155 ret = btrfs_unlink_subvol(trans, dest, new_dir, 5156 root_objectid, 5157 new_dentry->d_name.name, 5158 new_dentry->d_name.len); 5159 BUG_ON(new_inode->i_nlink == 0); 5160 } else { 5161 ret = btrfs_unlink_inode(trans, dest, new_dir, 5162 new_dentry->d_inode, 5163 new_dentry->d_name.name, 5164 new_dentry->d_name.len); 5165 } 5166 BUG_ON(ret); 5167 if (new_inode->i_nlink == 0) { 5168 ret = btrfs_orphan_add(trans, new_dentry->d_inode); 5169 BUG_ON(ret); 5170 } 5171 } 5172 5173 ret = btrfs_add_link(trans, new_dir, old_inode, 5174 new_dentry->d_name.name, 5175 new_dentry->d_name.len, 0, index); 5176 BUG_ON(ret); 5177 5178 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 5179 btrfs_log_new_name(trans, old_inode, old_dir, 5180 new_dentry->d_parent); 5181 btrfs_end_log_trans(root); 5182 } 5183 out_fail: 5184 btrfs_end_transaction_throttle(trans, root); 5185 5186 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 5187 up_read(&root->fs_info->subvol_sem); 5188 return ret; 5189 } 5190 5191 /* 5192 * some fairly slow code that needs optimization. This walks the list 5193 * of all the inodes with pending delalloc and forces them to disk. 
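 *
 * note the igrab()/iput() pattern in the loop below: delalloc_lock
 * must be dropped around filemap_flush(), so a reference is taken to
 * keep the inode alive while it is being flushed.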
5194 */ 5195 int btrfs_start_delalloc_inodes(struct btrfs_root *root) 5196 { 5197 struct list_head *head = &root->fs_info->delalloc_inodes; 5198 struct btrfs_inode *binode; 5199 struct inode *inode; 5200 5201 if (root->fs_info->sb->s_flags & MS_RDONLY) 5202 return -EROFS; 5203 5204 spin_lock(&root->fs_info->delalloc_lock); 5205 while (!list_empty(head)) { 5206 binode = list_entry(head->next, struct btrfs_inode, 5207 delalloc_inodes); 5208 inode = igrab(&binode->vfs_inode); 5209 if (!inode) 5210 list_del_init(&binode->delalloc_inodes); 5211 spin_unlock(&root->fs_info->delalloc_lock); 5212 if (inode) { 5213 filemap_flush(inode->i_mapping); 5214 iput(inode); 5215 } 5216 cond_resched(); 5217 spin_lock(&root->fs_info->delalloc_lock); 5218 } 5219 spin_unlock(&root->fs_info->delalloc_lock); 5220 5221 /* the filemap_flush will queue IO into the worker threads, but 5222 * we have to make sure the IO is actually started and that 5223 * ordered extents get created before we return 5224 */ 5225 atomic_inc(&root->fs_info->async_submit_draining); 5226 while (atomic_read(&root->fs_info->nr_async_submits) || 5227 atomic_read(&root->fs_info->async_delalloc_pages)) { 5228 wait_event(root->fs_info->async_submit_wait, 5229 (atomic_read(&root->fs_info->nr_async_submits) == 0 && 5230 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 5231 } 5232 atomic_dec(&root->fs_info->async_submit_draining); 5233 return 0; 5234 } 5235 5236 static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 5237 const char *symname) 5238 { 5239 struct btrfs_trans_handle *trans; 5240 struct btrfs_root *root = BTRFS_I(dir)->root; 5241 struct btrfs_path *path; 5242 struct btrfs_key key; 5243 struct inode *inode = NULL; 5244 int err; 5245 int drop_inode = 0; 5246 u64 objectid; 5247 u64 index = 0 ; 5248 int name_len; 5249 int datasize; 5250 unsigned long ptr; 5251 struct btrfs_file_extent_item *ei; 5252 struct extent_buffer *leaf; 5253 unsigned long nr = 0; 5254 5255 name_len = strlen(symname) + 1; 5256 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 5257 return -ENAMETOOLONG; 5258 5259 err = btrfs_check_metadata_free_space(root); 5260 if (err) 5261 goto out_fail; 5262 5263 trans = btrfs_start_transaction(root, 1); 5264 btrfs_set_trans_block_group(trans, dir); 5265 5266 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 5267 if (err) { 5268 err = -ENOSPC; 5269 goto out_unlock; 5270 } 5271 5272 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 5273 dentry->d_name.len, 5274 dentry->d_parent->d_inode->i_ino, objectid, 5275 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, 5276 &index); 5277 err = PTR_ERR(inode); 5278 if (IS_ERR(inode)) 5279 goto out_unlock; 5280 5281 err = btrfs_init_inode_security(inode, dir); 5282 if (err) { 5283 drop_inode = 1; 5284 goto out_unlock; 5285 } 5286 5287 btrfs_set_trans_block_group(trans, inode); 5288 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 5289 if (err) 5290 drop_inode = 1; 5291 else { 5292 inode->i_mapping->a_ops = &btrfs_aops; 5293 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5294 inode->i_fop = &btrfs_file_operations; 5295 inode->i_op = &btrfs_file_inode_operations; 5296 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 5297 } 5298 btrfs_update_inode_block_group(trans, inode); 5299 btrfs_update_inode_block_group(trans, dir); 5300 if (drop_inode) 5301 goto out_unlock; 5302 5303 path = btrfs_alloc_path(); 5304 BUG_ON(!path); 5305 key.objectid = inode->i_ino; 5306 key.offset = 0; 5307 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 5308 
datasize = btrfs_file_extent_calc_inline_size(name_len); 5309 err = btrfs_insert_empty_item(trans, root, path, &key, 5310 datasize); 5311 if (err) { 5312 drop_inode = 1; 5313 goto out_unlock; 5314 } 5315 leaf = path->nodes[0]; 5316 ei = btrfs_item_ptr(leaf, path->slots[0], 5317 struct btrfs_file_extent_item); 5318 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 5319 btrfs_set_file_extent_type(leaf, ei, 5320 BTRFS_FILE_EXTENT_INLINE); 5321 btrfs_set_file_extent_encryption(leaf, ei, 0); 5322 btrfs_set_file_extent_compression(leaf, ei, 0); 5323 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 5324 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); 5325 5326 ptr = btrfs_file_extent_inline_start(ei); 5327 write_extent_buffer(leaf, symname, ptr, name_len); 5328 btrfs_mark_buffer_dirty(leaf); 5329 btrfs_free_path(path); 5330 5331 inode->i_op = &btrfs_symlink_inode_operations; 5332 inode->i_mapping->a_ops = &btrfs_symlink_aops; 5333 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5334 inode_set_bytes(inode, name_len); 5335 btrfs_i_size_write(inode, name_len - 1); 5336 err = btrfs_update_inode(trans, root, inode); 5337 if (err) 5338 drop_inode = 1; 5339 5340 out_unlock: 5341 nr = trans->blocks_used; 5342 btrfs_end_transaction_throttle(trans, root); 5343 out_fail: 5344 if (drop_inode) { 5345 inode_dec_link_count(inode); 5346 iput(inode); 5347 } 5348 btrfs_btree_balance_dirty(root, nr); 5349 return err; 5350 } 5351 5352 static int prealloc_file_range(struct btrfs_trans_handle *trans, 5353 struct inode *inode, u64 start, u64 end, 5354 u64 locked_end, u64 alloc_hint, int mode) 5355 { 5356 struct btrfs_root *root = BTRFS_I(inode)->root; 5357 struct btrfs_key ins; 5358 u64 alloc_size; 5359 u64 cur_offset = start; 5360 u64 num_bytes = end - start; 5361 int ret = 0; 5362 5363 while (num_bytes > 0) { 5364 alloc_size = min(num_bytes, root->fs_info->max_extent); 5365 ret = btrfs_reserve_extent(trans, root, alloc_size, 5366 root->sectorsize, 0, alloc_hint, 5367 (u64)-1, &ins, 1); 5368 if (ret) { 5369 WARN_ON(1); 5370 goto out; 5371 } 5372 ret = insert_reserved_file_extent(trans, inode, 5373 cur_offset, ins.objectid, 5374 ins.offset, ins.offset, 5375 ins.offset, locked_end, 5376 0, 0, 0, 5377 BTRFS_FILE_EXTENT_PREALLOC); 5378 BUG_ON(ret); 5379 btrfs_drop_extent_cache(inode, cur_offset, 5380 cur_offset + ins.offset -1, 0); 5381 num_bytes -= ins.offset; 5382 cur_offset += ins.offset; 5383 alloc_hint = ins.objectid + ins.offset; 5384 } 5385 out: 5386 if (cur_offset > start) { 5387 inode->i_ctime = CURRENT_TIME; 5388 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 5389 if (!(mode & FALLOC_FL_KEEP_SIZE) && 5390 cur_offset > i_size_read(inode)) 5391 btrfs_i_size_write(inode, cur_offset); 5392 ret = btrfs_update_inode(trans, root, inode); 5393 BUG_ON(ret); 5394 } 5395 5396 return ret; 5397 } 5398 5399 static long btrfs_fallocate(struct inode *inode, int mode, 5400 loff_t offset, loff_t len) 5401 { 5402 u64 cur_offset; 5403 u64 last_byte; 5404 u64 alloc_start; 5405 u64 alloc_end; 5406 u64 alloc_hint = 0; 5407 u64 locked_end; 5408 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 5409 struct extent_map *em; 5410 struct btrfs_trans_handle *trans; 5411 struct btrfs_root *root; 5412 int ret; 5413 5414 alloc_start = offset & ~mask; 5415 alloc_end = (offset + len + mask) & ~mask; 5416 5417 /* 5418 * wait for ordered IO before we have any locks. We'll loop again 5419 * below with the locks held. 
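 *
 * (ordering note: the extent lock below nests inside the running
 * transaction, and waiting on ordered IO while holding either one
 * could deadlock against the endio path, so the wait happens first,
 * with no locks held).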
5420 */ 5421 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); 5422 5423 mutex_lock(&inode->i_mutex); 5424 if (alloc_start > inode->i_size) { 5425 ret = btrfs_cont_expand(inode, alloc_start); 5426 if (ret) 5427 goto out; 5428 } 5429 5430 root = BTRFS_I(inode)->root; 5431 5432 ret = btrfs_check_data_free_space(root, inode, 5433 alloc_end - alloc_start); 5434 if (ret) 5435 goto out; 5436 5437 locked_end = alloc_end - 1; 5438 while (1) { 5439 struct btrfs_ordered_extent *ordered; 5440 5441 trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); 5442 if (!trans) { 5443 ret = -EIO; 5444 goto out_free; 5445 } 5446 5447 /* the extent lock is ordered inside the running 5448 * transaction 5449 */ 5450 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5451 GFP_NOFS); 5452 ordered = btrfs_lookup_first_ordered_extent(inode, 5453 alloc_end - 1); 5454 if (ordered && 5455 ordered->file_offset + ordered->len > alloc_start && 5456 ordered->file_offset < alloc_end) { 5457 btrfs_put_ordered_extent(ordered); 5458 unlock_extent(&BTRFS_I(inode)->io_tree, 5459 alloc_start, locked_end, GFP_NOFS); 5460 btrfs_end_transaction(trans, BTRFS_I(inode)->root); 5461 5462 /* 5463 * we can't wait on the range with the transaction 5464 * running or with the extent lock held 5465 */ 5466 btrfs_wait_ordered_range(inode, alloc_start, 5467 alloc_end - alloc_start); 5468 } else { 5469 if (ordered) 5470 btrfs_put_ordered_extent(ordered); 5471 break; 5472 } 5473 } 5474 5475 cur_offset = alloc_start; 5476 while (1) { 5477 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 5478 alloc_end - cur_offset, 0); 5479 BUG_ON(IS_ERR(em) || !em); 5480 last_byte = min(extent_map_end(em), alloc_end); 5481 last_byte = (last_byte + mask) & ~mask; 5482 if (em->block_start == EXTENT_MAP_HOLE) { 5483 ret = prealloc_file_range(trans, inode, cur_offset, 5484 last_byte, locked_end + 1, 5485 alloc_hint, mode); 5486 if (ret < 0) { 5487 free_extent_map(em); 5488 break; 5489 } 5490 } 5491 if (em->block_start <= EXTENT_MAP_LAST_BYTE) 5492 alloc_hint = em->block_start; 5493 free_extent_map(em); 5494 5495 cur_offset = last_byte; 5496 if (cur_offset >= alloc_end) { 5497 ret = 0; 5498 break; 5499 } 5500 } 5501 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5502 GFP_NOFS); 5503 5504 btrfs_end_transaction(trans, BTRFS_I(inode)->root); 5505 out_free: 5506 btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start); 5507 out: 5508 mutex_unlock(&inode->i_mutex); 5509 return ret; 5510 } 5511 5512 static int btrfs_set_page_dirty(struct page *page) 5513 { 5514 return __set_page_dirty_nobuffers(page); 5515 } 5516 5517 static int btrfs_permission(struct inode *inode, int mask) 5518 { 5519 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 5520 return -EACCES; 5521 return generic_permission(inode, mask, btrfs_check_acl); 5522 } 5523 5524 static const struct inode_operations btrfs_dir_inode_operations = { 5525 .getattr = btrfs_getattr, 5526 .lookup = btrfs_lookup, 5527 .create = btrfs_create, 5528 .unlink = btrfs_unlink, 5529 .link = btrfs_link, 5530 .mkdir = btrfs_mkdir, 5531 .rmdir = btrfs_rmdir, 5532 .rename = btrfs_rename, 5533 .symlink = btrfs_symlink, 5534 .setattr = btrfs_setattr, 5535 .mknod = btrfs_mknod, 5536 .setxattr = btrfs_setxattr, 5537 .getxattr = btrfs_getxattr, 5538 .listxattr = btrfs_listxattr, 5539 .removexattr = btrfs_removexattr, 5540 .permission = btrfs_permission, 5541 }; 5542 static const struct inode_operations btrfs_dir_ro_inode_operations = { 5543 .lookup = 
btrfs_lookup, 5544 .permission = btrfs_permission, 5545 }; 5546 5547 static struct file_operations btrfs_dir_file_operations = { 5548 .llseek = generic_file_llseek, 5549 .read = generic_read_dir, 5550 .readdir = btrfs_real_readdir, 5551 .unlocked_ioctl = btrfs_ioctl, 5552 #ifdef CONFIG_COMPAT 5553 .compat_ioctl = btrfs_ioctl, 5554 #endif 5555 .release = btrfs_release_file, 5556 .fsync = btrfs_sync_file, 5557 }; 5558 5559 static struct extent_io_ops btrfs_extent_io_ops = { 5560 .fill_delalloc = run_delalloc_range, 5561 .submit_bio_hook = btrfs_submit_bio_hook, 5562 .merge_bio_hook = btrfs_merge_bio_hook, 5563 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 5564 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 5565 .writepage_start_hook = btrfs_writepage_start_hook, 5566 .readpage_io_failed_hook = btrfs_io_failed_hook, 5567 .set_bit_hook = btrfs_set_bit_hook, 5568 .clear_bit_hook = btrfs_clear_bit_hook, 5569 }; 5570 5571 /* 5572 * btrfs doesn't support the bmap operation because swapfiles 5573 * use bmap to make a mapping of extents in the file. They assume 5574 * these extents won't change over the life of the file and they 5575 * use the bmap result to do IO directly to the drive. 5576 * 5577 * the btrfs bmap call would return logical addresses that aren't 5578 * suitable for IO and they also will change frequently as COW 5579 * operations happen. So, swapfile + btrfs == corruption. 5580 * 5581 * For now we're avoiding this by dropping bmap. 5582 */ 5583 static const struct address_space_operations btrfs_aops = { 5584 .readpage = btrfs_readpage, 5585 .writepage = btrfs_writepage, 5586 .writepages = btrfs_writepages, 5587 .readpages = btrfs_readpages, 5588 .sync_page = block_sync_page, 5589 .direct_IO = btrfs_direct_IO, 5590 .invalidatepage = btrfs_invalidatepage, 5591 .releasepage = btrfs_releasepage, 5592 .set_page_dirty = btrfs_set_page_dirty, 5593 .error_remove_page = generic_error_remove_page, 5594 }; 5595 5596 static const struct address_space_operations btrfs_symlink_aops = { 5597 .readpage = btrfs_readpage, 5598 .writepage = btrfs_writepage, 5599 .invalidatepage = btrfs_invalidatepage, 5600 .releasepage = btrfs_releasepage, 5601 }; 5602 5603 static const struct inode_operations btrfs_file_inode_operations = { 5604 .truncate = btrfs_truncate, 5605 .getattr = btrfs_getattr, 5606 .setattr = btrfs_setattr, 5607 .setxattr = btrfs_setxattr, 5608 .getxattr = btrfs_getxattr, 5609 .listxattr = btrfs_listxattr, 5610 .removexattr = btrfs_removexattr, 5611 .permission = btrfs_permission, 5612 .fallocate = btrfs_fallocate, 5613 .fiemap = btrfs_fiemap, 5614 }; 5615 static const struct inode_operations btrfs_special_inode_operations = { 5616 .getattr = btrfs_getattr, 5617 .setattr = btrfs_setattr, 5618 .permission = btrfs_permission, 5619 .setxattr = btrfs_setxattr, 5620 .getxattr = btrfs_getxattr, 5621 .listxattr = btrfs_listxattr, 5622 .removexattr = btrfs_removexattr, 5623 }; 5624 static const struct inode_operations btrfs_symlink_inode_operations = { 5625 .readlink = generic_readlink, 5626 .follow_link = page_follow_link_light, 5627 .put_link = page_put_link, 5628 .permission = btrfs_permission, 5629 .setxattr = btrfs_setxattr, 5630 .getxattr = btrfs_getxattr, 5631 .listxattr = btrfs_listxattr, 5632 .removexattr = btrfs_removexattr, 5633 }; 5634 5635 struct dentry_operations btrfs_dentry_operations = { 5636 .d_delete = btrfs_dentry_delete, 5637 }; 5638