/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "volumes.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "compression.h"
#include "locking.h"

struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;

#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};

static void btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);
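
/*
 * helper called when a new inode is created: set up the ACLs inherited
 * from the parent directory and the initial security xattr
 */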
static int btrfs_init_inode_security(struct inode *inode, struct inode *dir)
{
	int err;

	err = btrfs_init_acl(inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(inode, dir);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				struct page **compressed_pages)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int err = 0;
	int ret;
	size_t cur_size = size;
	size_t datasize;
	unsigned long offset;
	int use_compress = 0;

	if (compressed_size && compressed_pages) {
		use_compress = 1;
		cur_size = compressed_size;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;
	btrfs_set_trans_block_group(trans, inode);

	key.objectid = inode->i_ino;
	key.offset = start;
	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
	datasize = btrfs_file_extent_calc_inline_size(cur_size);

	inode_add_bytes(inode, size);
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (ret) {
		err = ret;
		goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (use_compress) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
					 PAGE_CACHE_SIZE);

			kaddr = kmap_atomic(cpage, KM_USER0);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr, KM_USER0);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  BTRFS_COMPRESS_ZLIB);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_CACHE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page, KM_USER0);
		offset = start & (PAGE_CACHE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr, KM_USER0);
		page_cache_release(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	BTRFS_I(inode)->disk_i_size = inode->i_size;
	btrfs_update_inode(trans, root, inode);
	return 0;
fail:
	btrfs_free_path(path);
	return err;
}

/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct inode *inode, u64 start, u64 end,
				 size_t compressed_size,
				 struct page **compressed_pages)
{
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = (end + root->sectorsize - 1) &
			~((u64)root->sectorsize - 1);
	u64 hint_byte;
	u64 data_len = inline_len;
	int ret;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end >= PAGE_CACHE_SIZE ||
	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
	    (!compressed_size &&
	    (actual_end & (root->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > root->fs_info->max_inline) {
		return 1;
	}

	ret = btrfs_drop_extents(trans, root, inode, start,
				 aligned_end, aligned_end, start,
				 &hint_byte, 1);
	BUG_ON(ret);

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, root, inode, start,
				   inline_len, compressed_size,
				   compressed_pages);
	BUG_ON(ret);
	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
	return 0;
}
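
/*
 * an async_extent describes one range handed to the compression
 * workers: either a set of compressed pages ready for phase two, or
 * (pages == NULL) a range that must fall back to the regular
 * uncompressed COW path.  An async_cow is the work item that carries
 * a list of them for one delalloc range.
 */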
struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	struct list_head list;
};

struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent);
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that pdflush sent them down.
 */
static noinline int compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 num_bytes;
	u64 orig_start;
	u64 disk_num_bytes;
	u64 blocksize = root->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long nr_pages_ret = 0;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned long max_compressed = 128 * 1024;
	unsigned long max_uncompressed = 128 * 1024;
	int i;
	int will_compress;

	orig_start = start;

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/* we want to make sure that amount of ram required to uncompress
	 * an extent is reasonable, so we limit the total size in ram
	 * of a compressed extent to 128k.  This is a crucial number
	 * because it also controls how easily we can spread reads across
	 * cpus for decompression.
	 *
	 * We also want to make sure the amount of IO required to do
	 * a random read is reasonably small, so we limit the size of
	 * a compressed extent to 128k.
	 */
	total_compressed = min(total_compressed, max_uncompressed);
	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
	num_bytes = max(blocksize, num_bytes);
	disk_num_bytes = num_bytes;
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
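	/*
	 * a nonzero return from btrfs_zlib_compress_pages below is
	 * expected when the range either failed to compress or didn't
	 * shrink, so will_compress stays zero and we fall through to
	 * the uncompressed paths
	 */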
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
	    btrfs_test_opt(root, COMPRESS)) {
		WARN_ON(pages);
		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
		BUG_ON(!pages);

		ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
						total_compressed, pages,
						nr_pages, &nr_pages_ret,
						&total_in,
						&total_compressed,
						max_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_CACHE_SIZE - 1);
			struct page *page = pages[nr_pages_ret - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page, KM_USER0);
				memset(kaddr + offset, 0,
				       PAGE_CACHE_SIZE - offset);
				kunmap_atomic(kaddr, KM_USER0);
			}
			will_compress = 1;
		}
	}
	if (start == 0) {
		trans = btrfs_join_transaction(root, 1);
		BUG_ON(!trans);
		btrfs_set_trans_block_group(trans, inode);

		/* lets try to make an inline extent */
		if (ret || total_in < (actual_end - start)) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(trans, root, inode,
						    start, end, 0, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(trans, root, inode,
						    start, end,
						    total_compressed, pages);
		}
		btrfs_end_transaction(trans, root);
		if (ret == 0) {
			/*
			 * inline extent creation worked, we don't need
			 * to create any more async work items.  Unlock
			 * and free up our temp pages.
			 */
			extent_clear_unlock_delalloc(inode,
			     &BTRFS_I(inode)->io_tree,
			     start, end, NULL,
			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
			     EXTENT_CLEAR_DELALLOC |
			     EXTENT_CLEAR_ACCOUNTING |
			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
			ret = 0;
			goto free_pages_out;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so the allocator does
		 * sane things
		 */
		total_compressed = (total_compressed + blocksize - 1) &
			~(blocksize - 1);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk
		 */
		total_in = (total_in + PAGE_CACHE_SIZE - 1) &
			~(PAGE_CACHE_SIZE - 1);
		if (total_compressed >= total_in) {
			will_compress = 0;
		} else {
			disk_num_bytes = total_compressed;
			num_bytes = total_in;
		}
	}
	if (!will_compress && pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages_ret; i++) {
			WARN_ON(pages[i]->mapping);
			page_cache_release(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages_ret = 0;

		/* flag the file so we don't compress in the future */
		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
	}
	if (will_compress) {
		*num_added += 1;

		/* the async work queues will take care of doing actual
		 * allocation on disk for these compressed pages,
		 * and will submit them to the elevator.
		 */
		add_async_extent(async_cow, start, num_bytes,
				 total_compressed, pages, nr_pages_ret);

		if (start + num_bytes < end && start + num_bytes < actual_end) {
			start += num_bytes;
			pages = NULL;
			cond_resched();
			goto again;
		}
	} else {
cleanup_and_bail_uncompressed:
		/*
		 * No compression, but we still need to write the pages in
		 * the file we've been given so far.  redirty the locked
		 * page if it corresponds to our extent and set things up
		 * for the async work queue to run cow_file_range to do
		 * the normal delalloc dance
		 */
		if (page_offset(locked_page) >= start &&
		    page_offset(locked_page) <= end) {
			__set_page_dirty_nobuffers(locked_page);
			/* unlocked later on in the async handlers */
		}
		add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
		*num_added += 1;
	}

out:
	return 0;

free_pages_out:
	for (i = 0; i < nr_pages_ret; i++) {
		WARN_ON(pages[i]->mapping);
		page_cache_release(pages[i]);
	}
	kfree(pages);

	goto out;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline int submit_compressed_extents(struct inode *inode,
					      struct async_cow *async_cow)
{
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_trans_handle *trans;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree;
	int ret = 0;

	if (list_empty(&async_cow->extents))
		return 0;

	trans = btrfs_join_transaction(root, 1);

	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
				    async_extent->start +
				    async_extent->ram_size - 1, GFP_NOFS);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0);

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(io_tree,
						  inode, async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  btrfs_get_extent,
						  WB_SYNC_ALL);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1,
			    GFP_NOFS);
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		btrfs_drop_extent_cache(inode, async_extent->start,
					async_extent->start +
					async_extent->ram_size - 1, 0);

		ret = btrfs_reserve_extent(trans, root,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint,
					   (u64)-1, &ins, 1);
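		/*
		 * if the reservation failed (enospc, most likely), free
		 * the compressed pages and retry; with ->pages set to NULL
		 * the retry label above sends this range down the
		 * uncompressed cow_file_range path instead
		 */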
		if (ret) {
			int i;
			for (i = 0; i < async_extent->nr_pages; i++) {
				WARN_ON(async_extent->pages[i]->mapping);
				page_cache_release(async_extent->pages[i]);
			}
			kfree(async_extent->pages);
			async_extent->nr_pages = 0;
			async_extent->pages = NULL;
			unlock_extent(io_tree, async_extent->start,
				      async_extent->start +
				      async_extent->ram_size - 1, GFP_NOFS);
			goto retry;
		}

		em = alloc_extent_map(GFP_NOFS);
		em->start = async_extent->start;
		em->len = async_extent->ram_size;
		em->orig_start = em->start;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
		}

		ret = btrfs_add_ordered_extent(inode, async_extent->start,
					       ins.objectid,
					       async_extent->ram_size,
					       ins.offset,
					       BTRFS_ORDERED_COMPRESSED);
		BUG_ON(ret);

		btrfs_end_transaction(trans, root);

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode,
				&BTRFS_I(inode)->io_tree,
				async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_CLEAR_UNLOCK_PAGE |
				EXTENT_CLEAR_UNLOCK |
				EXTENT_CLEAR_DELALLOC |
				EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);

		ret = btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages);

		BUG_ON(ret);
		trans = btrfs_join_transaction(root, 1);
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}

	btrfs_end_transaction(trans, root);
	return 0;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
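/*
 * when unlock == 0 (the compressed fallback path above), locked_page is
 * left locked for extent_write_locked_range to handle; the writepage
 * callers pass unlock == 1
 */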
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written,
				   int unlock)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 disk_num_bytes;
	u64 cur_alloc_size;
	u64 blocksize = root->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	struct btrfs_key ins;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	int ret = 0;

	trans = btrfs_join_transaction(root, 1);
	BUG_ON(!trans);
	btrfs_set_trans_block_group(trans, inode);

	actual_end = min_t(u64, isize, end + 1);

	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
	num_bytes = max(blocksize, num_bytes);
	disk_num_bytes = num_bytes;
	ret = 0;

	if (start == 0) {
		/* lets try to make an inline extent */
		ret = cow_file_range_inline(trans, root, inode,
					    start, end, 0, NULL);
		if (ret == 0) {
			extent_clear_unlock_delalloc(inode,
				     &BTRFS_I(inode)->io_tree,
				     start, end, NULL,
				     EXTENT_CLEAR_UNLOCK_PAGE |
				     EXTENT_CLEAR_UNLOCK |
				     EXTENT_CLEAR_DELALLOC |
				     EXTENT_CLEAR_ACCOUNTING |
				     EXTENT_CLEAR_DIRTY |
				     EXTENT_SET_WRITEBACK |
				     EXTENT_END_WRITEBACK);
			*nr_written = *nr_written +
			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
			*page_started = 1;
			ret = 0;
			goto out;
		}
	}

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(&root->fs_info->super_copy));


	read_lock(&BTRFS_I(inode)->extent_tree.lock);
	em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
				   start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&BTRFS_I(inode)->extent_tree.lock);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

	while (disk_num_bytes > 0) {
		unsigned long op;

		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
					   root->sectorsize, 0, alloc_hint,
					   (u64)-1, &ins, 1);
		BUG_ON(ret);

		em = alloc_extent_map(GFP_NOFS);
		em->start = start;
		em->orig_start = em->start;
		ram_size = ins.offset;
		em->len = ins.offset;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}

		cur_alloc_size = ins.offset;
		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		BUG_ON(ret);

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			BUG_ON(ret);
		}

		if (disk_num_bytes < cur_alloc_size)
			break;

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
		 */
		op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
		op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
			EXTENT_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
					     start, start + ram_size - 1,
					     locked_page, op);
		disk_num_bytes -= cur_alloc_size;
		num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
	}
out:
	ret = 0;
	btrfs_end_transaction(trans, root);

	return ret;
}

/*
 * work queue call back to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	int num_added = 0;
	async_cow = container_of(work, struct async_cow, work);

	compress_file_range(async_cow->inode, async_cow->locked_page,
			    async_cow->start, async_cow->end, async_cow,
			    &num_added);
	if (num_added == 0)
		async_cow->inode = NULL;
}

/*
 * work queue call back to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
		PAGE_CACHE_SHIFT;

	atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);

	if (atomic_read(&root->fs_info->async_delalloc_pages) <
	    5 * 1024 * 1024 &&
	    waitqueue_active(&root->fs_info->async_submit_wait))
		wake_up(&root->fs_info->async_submit_wait);

	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	async_cow = container_of(work, struct async_cow, work);
	kfree(async_cow);
}

static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;
	int limit = 10 * 1024 * 1024;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL, GFP_NOFS);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		BUG_ON(!async_cow);
		async_cow->inode = inode;
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;

		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
			cur_end = end;
		else
			cur_end = min(end, start + 512 * 1024 - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		async_cow->work.func = async_cow_start;
		async_cow->work.ordered_func = async_cow_submit;
		async_cow->work.ordered_free = async_cow_free;
		async_cow->work.flags = 0;

		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
			PAGE_CACHE_SHIFT;
		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

		btrfs_queue_worker(&root->fs_info->delalloc_workers,
				   &async_cow->work);

		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
			wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->async_delalloc_pages) <
			    limit));
		}
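
		/*
		 * while someone is draining the async queue (flushing all
		 * delalloc for a sync), don't race ahead of them; wait
		 * until every queued page has been processed
		 */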
		while (atomic_read(&root->fs_info->async_submit_draining) &&
		      atomic_read(&root->fs_info->async_delalloc_pages)) {
			wait_event(root->fs_info->async_submit_wait,
			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
			   0));
		}

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}

static noinline int csum_exist_in_range(struct btrfs_root *root,
					u64 bytenr, u64 num_bytes)
{
	int ret;
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	return 1;
}

/*
 * when nocow writeback calls back, this checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct inode *inode,
				       struct page *locked_page,
			      u64 start, u64 end, int *page_started, int force,
			      unsigned long *nr_written)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key found_key;
	u64 cow_start;
	u64 cur_offset;
	u64 extent_end;
	u64 extent_offset;
	u64 disk_bytenr;
	u64 num_bytes;
	int extent_type;
	int ret;
	int type;
	int nocow;
	int check_prev = 1;

	path = btrfs_alloc_path();
	BUG_ON(!path);
	trans = btrfs_join_transaction(root, 1);
	BUG_ON(!trans);

	cow_start = (u64)-1;
	cur_offset = start;
	while (1) {
		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
					       cur_offset, 0);
		BUG_ON(ret < 0);
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == inode->i_ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				BUG_ON(1);
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		nocow = 0;
		disk_bytenr = 0;
		num_bytes = 0;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		if (found_key.objectid > inode->i_ino ||
		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto out_check;
		}

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
			if (extent_end <= start) {
				path->slots[0]++;
				goto next_slot;
			}
			if (disk_bytenr == 0)
				goto out_check;
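			/*
			 * compressed or otherwise encoded extents can't be
			 * overwritten in place, they have to go through the
			 * normal COW path
			 */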
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;
			if (btrfs_extent_readonly(root, disk_bytenr))
				goto out_check;
			if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
						  found_key.offset -
						  extent_offset, disk_bytenr))
				goto out_check;
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * force cow if csum exists in the range.
			 * this ensures that csums for a given extent are
			 * either valid or do not exist.
			 */
			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
				goto out_check;
			nocow = 1;
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = found_key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
			extent_end = ALIGN(extent_end, root->sectorsize);
		} else {
			BUG_ON(1);
		}
out_check:
		if (extent_end <= start) {
			path->slots[0]++;
			goto next_slot;
		}
		if (!nocow) {
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			path->slots[0]++;
			goto next_slot;
		}

		btrfs_release_path(root, path);
		if (cow_start != (u64)-1) {
			ret = cow_file_range(inode, locked_page, cow_start,
					found_key.offset - 1, page_started,
					nr_written, 1);
			BUG_ON(ret);
			cow_start = (u64)-1;
		}

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct extent_map *em;
			struct extent_map_tree *em_tree;
			em_tree = &BTRFS_I(inode)->extent_tree;
			em = alloc_extent_map(GFP_NOFS);
			em->start = cur_offset;
			em->orig_start = em->start;
			em->len = num_bytes;
			em->block_len = num_bytes;
			em->block_start = disk_bytenr;
			em->bdev = root->fs_info->fs_devices->latest_bdev;
			set_bit(EXTENT_FLAG_PINNED, &em->flags);
			while (1) {
				write_lock(&em_tree->lock);
				ret = add_extent_mapping(em_tree, em);
				write_unlock(&em_tree->lock);
				if (ret != -EEXIST) {
					free_extent_map(em);
					break;
				}
				btrfs_drop_extent_cache(inode, em->start,
						em->start + em->len - 1, 0);
			}
			type = BTRFS_ORDERED_PREALLOC;
		} else {
			type = BTRFS_ORDERED_NOCOW;
		}

		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
					       num_bytes, num_bytes, type);
		BUG_ON(ret);

		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
				cur_offset, cur_offset + num_bytes - 1,
				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
				EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
				EXTENT_SET_PRIVATE2);
		cur_offset = extent_end;
		if (cur_offset > end)
			break;
	}
	btrfs_release_path(root, path);

	if (cur_offset <= end && cow_start == (u64)-1)
		cow_start = cur_offset;
	if (cow_start != (u64)-1) {
		ret = cow_file_range(inode, locked_page, cow_start, end,
				     page_started, nr_written, 1);
		BUG_ON(ret);
	}

	ret = btrfs_end_transaction(trans, root);
	BUG_ON(ret);
	btrfs_free_path(path);
	return 0;
}

/*
 * extent_io.c call back to do delayed allocation processing
 */
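/*
 * four cases, roughly: inodes flagged NODATACOW try to reuse any existing
 * extent (force = 1), preallocated inodes only reuse prealloc extents
 * (force = 0), plain writes go straight to cow_file_range, and compressed
 * writes go through the async worker queues
 */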
static int run_delalloc_range(struct inode *inode, struct page *locked_page,
			      u64 start, u64 end, int *page_started,
			      unsigned long *nr_written)
{
	int ret;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 1, nr_written);
	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
	else if (!btrfs_test_opt(root, COMPRESS))
		ret = cow_file_range(inode, locked_page, start, end,
				     page_started, nr_written, 1);
	else
		ret = cow_file_range_async(inode, locked_page, start, end,
					   page_started, nr_written);
	return ret;
}

static int btrfs_split_extent_hook(struct inode *inode,
				   struct extent_state *orig, u64 split)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 size;

	if (!(orig->state & EXTENT_DELALLOC))
		return 0;

	size = orig->end - orig->start + 1;
	if (size > root->fs_info->max_extent) {
		u64 num_extents;
		u64 new_size;

		new_size = orig->end - split + 1;
		num_extents = div64_u64(size + root->fs_info->max_extent - 1,
					root->fs_info->max_extent);

		/*
		 * if we break a large extent up then leave
		 * outstanding_extents be, since we've already accounted
		 * for the large extent.
		 */
		if (div64_u64(new_size + root->fs_info->max_extent - 1,
			      root->fs_info->max_extent) < num_extents)
			return 0;
	}

	spin_lock(&BTRFS_I(inode)->accounting_lock);
	BTRFS_I(inode)->outstanding_extents++;
	spin_unlock(&BTRFS_I(inode)->accounting_lock);

	return 0;
}

/*
 * extent_io.c merge_extent_hook, used to track merged delayed allocation
 * extents so we can keep track of new extents that are just merged onto old
 * extents, such as when we are doing sequential writes, so we can properly
 * account for the metadata space we'll need.
 */
static int btrfs_merge_extent_hook(struct inode *inode,
				   struct extent_state *new,
				   struct extent_state *other)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 new_size, old_size;
	u64 num_extents;

	/* not delalloc, ignore it */
	if (!(other->state & EXTENT_DELALLOC))
		return 0;

	old_size = other->end - other->start + 1;
	if (new->start < other->start)
		new_size = other->end - new->start + 1;
	else
		new_size = new->end - other->start + 1;

	/* we're not bigger than the max, unreserve the space and go */
	if (new_size <= root->fs_info->max_extent) {
		spin_lock(&BTRFS_I(inode)->accounting_lock);
		BTRFS_I(inode)->outstanding_extents--;
		spin_unlock(&BTRFS_I(inode)->accounting_lock);
		return 0;
	}

	/*
	 * If we grew by another max_extent, just return, we want to keep that
	 * reserved amount.
	 */
	num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
				root->fs_info->max_extent);
	if (div64_u64(new_size + root->fs_info->max_extent - 1,
		      root->fs_info->max_extent) > num_extents)
		return 0;

	spin_lock(&BTRFS_I(inode)->accounting_lock);
	BTRFS_I(inode)->outstanding_extents--;
	spin_unlock(&BTRFS_I(inode)->accounting_lock);

	return 0;
}

/*
 * extent_io.c set_bit_hook, used to track delayed allocation
 * bytes in this file, and to maintain the list of inodes that
 * have pending delalloc work to be done.
 */
static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
			      unsigned long old, unsigned long bits)
{

	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;

		spin_lock(&BTRFS_I(inode)->accounting_lock);
		BTRFS_I(inode)->outstanding_extents++;
		spin_unlock(&BTRFS_I(inode)->accounting_lock);
		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
		spin_lock(&root->fs_info->delalloc_lock);
		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
		root->fs_info->delalloc_bytes += end - start + 1;
		if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
				      &root->fs_info->delalloc_inodes);
		}
		spin_unlock(&root->fs_info->delalloc_lock);
	}
	return 0;
}

/*
 * extent_io.c clear_bit_hook, see set_bit_hook for why
 */
static int btrfs_clear_bit_hook(struct inode *inode,
				struct extent_state *state, unsigned long bits)
{
	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;

		if (bits & EXTENT_DO_ACCOUNTING) {
			spin_lock(&BTRFS_I(inode)->accounting_lock);
			BTRFS_I(inode)->outstanding_extents--;
			spin_unlock(&BTRFS_I(inode)->accounting_lock);
			btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
		}

		spin_lock(&root->fs_info->delalloc_lock);
		if (state->end - state->start + 1 >
		    root->fs_info->delalloc_bytes) {
			printk(KERN_INFO "btrfs warning: delalloc account "
			       "%llu %llu\n",
			       (unsigned long long)
			       state->end - state->start + 1,
			       (unsigned long long)
			       root->fs_info->delalloc_bytes);
			btrfs_delalloc_free_space(root, inode, (u64)-1);
			root->fs_info->delalloc_bytes = 0;
			BTRFS_I(inode)->delalloc_bytes = 0;
		} else {
			btrfs_delalloc_free_space(root, inode,
						  state->end -
						  state->start + 1);
			root->fs_info->delalloc_bytes -= state->end -
				state->start + 1;
			BTRFS_I(inode)->delalloc_bytes -= state->end -
				state->start + 1;
		}
		if (BTRFS_I(inode)->delalloc_bytes == 0 &&
		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
		}
		spin_unlock(&root->fs_info->delalloc_lock);
	}
	return 0;
}

/*
 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
 * we don't create bios that span stripes or chunks
 */
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
			 size_t size, struct bio *bio,
			 unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	struct btrfs_mapping_tree *map_tree;
	u64 logical = (u64)bio->bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;

	if (bio_flags & EXTENT_BIO_COMPRESSED)
		return 0;

	length = bio->bi_size;
	map_tree = &root->fs_info->mapping_tree;
	map_length = length;
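	/*
	 * btrfs_map_block caps map_length at the number of bytes left on
	 * the stripe this bio starts in; if the bio plus the new page
	 * would run past that, tell the caller to start a new bio
	 */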
	ret = btrfs_map_block(map_tree, READ, logical,
			      &map_length, NULL, 0);

	if (map_length < length + size)
		return 1;
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.   All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;

	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
	BUG_ON(ret);
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.   All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
			  int mirror_num, unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
}

/*
 * extent_io.c submission hook. This does the right thing for csum calculation
 * on write, or reading the csums from the tree before a read
 */
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
			  int mirror_num, unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	int skip_sum;

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
	BUG_ON(ret);

	if (!(rw & (1 << BIO_RW))) {
		if (bio_flags & EXTENT_BIO_COMPRESSED) {
			return btrfs_submit_compressed_read(inode, bio,
						    mirror_num, bio_flags);
		} else if (!skip_sum)
			btrfs_lookup_bio_sums(root, inode, bio, NULL);
		goto mapit;
	} else if (!skip_sum) {
		/* csum items have already been cloned */
		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
			goto mapit;
		/* we're doing a write, do the async checksumming */
		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
				   inode, rw, bio, mirror_num,
				   bio_flags, __btrfs_submit_bio_start,
				   __btrfs_submit_bio_done);
	}

mapit:
	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
}

/*
 * given a list of ordered sums record them in the inode.  This happens
 * at IO completion time based on sums calculated at bio submission time.
 */
static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
			     struct inode *inode, u64 file_offset,
			     struct list_head *list)
{
	struct btrfs_ordered_sum *sum;

	btrfs_set_trans_block_group(trans, inode);

	list_for_each_entry(sum, list, list) {
		btrfs_csum_file_blocks(trans,
		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
	}
	return 0;
}

int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
{
	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
		WARN_ON(1);
	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
				   GFP_NOFS);
}

/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
	struct page *page;
	struct btrfs_work work;
};

static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_ordered_extent *ordered;
	struct page *page;
	struct inode *inode;
	u64 page_start;
	u64 page_end;

	fixup = container_of(work, struct btrfs_writepage_fixup, work);
	page = fixup->page;
again:
	lock_page(page);
	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
		ClearPageChecked(page);
		goto out_page;
	}

	inode = page->mapping->host;
	page_start = page_offset(page);
	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;

	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);

	/* already ordered? We're done */
	if (PagePrivate2(page))
		goto out;

	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered) {
		unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
			      page_end, GFP_NOFS);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		goto again;
	}

	btrfs_set_extent_delalloc(inode, page_start, page_end);
	ClearPageChecked(page);
out:
	unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
out_page:
	unlock_page(page);
	page_cache_release(page);
}

/*
 * There are a few paths in the higher layers of the kernel that directly
 * set the page dirty bit without asking the filesystem if it is a
 * good idea.  This causes problems because we want to make sure COW
 * properly happens and the data=ordered rules are followed.
 *
 * In our case any range that doesn't have the ORDERED bit set
 * hasn't been properly setup for IO.  We kick off an async process
 * to fix it up.  The async helper will wait for ordered extents, set
 * the delalloc bit and make it safe to write the page.
 */
static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* this page is properly in the ordered list */
	if (TestClearPagePrivate2(page))
		return 0;

	if (PageChecked(page))
		return -EAGAIN;

	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
	if (!fixup)
		return -EAGAIN;

	SetPageChecked(page);
	page_cache_get(page);
	fixup->work.func = btrfs_writepage_fixup_worker;
	fixup->page = page;
	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
	return -EAGAIN;
}
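
/*
 * insert the file extent item and the backref for a finished ordered
 * extent into the btree.  The extent was reserved earlier, so this is
 * also where the reserved extent is finally claimed by the file.
 */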
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
				       struct inode *inode, u64 file_pos,
				       u64 disk_bytenr, u64 disk_num_bytes,
				       u64 num_bytes, u64 ram_bytes,
				       u64 locked_end,
				       u8 compression, u8 encryption,
				       u16 other_encoding, int extent_type)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key ins;
	u64 hint;
	int ret;

	path = btrfs_alloc_path();
	BUG_ON(!path);

	path->leave_spinning = 1;

	/*
	 * we may be replacing one extent in the tree with another.
	 * The new extent is pinned in the extent map, and we don't want
	 * to drop it from the cache until it is completely in the btree.
	 *
	 * So, tell btrfs_drop_extents to leave this extent in the cache.
	 * the caller is expected to unpin it and allow it to be merged
	 * with the others.
	 */
	ret = btrfs_drop_extents(trans, root, inode, file_pos,
				 file_pos + num_bytes, locked_end,
				 file_pos, &hint, 0);
	BUG_ON(ret);

	ins.objectid = inode->i_ino;
	ins.offset = file_pos;
	ins.type = BTRFS_EXTENT_DATA_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
	BUG_ON(ret);
	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_set_file_extent_type(leaf, fi, extent_type);
	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
	btrfs_set_file_extent_offset(leaf, fi, 0);
	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
	btrfs_set_file_extent_compression(leaf, fi, compression);
	btrfs_set_file_extent_encryption(leaf, fi, encryption);
	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);

	btrfs_unlock_up_safe(path, 1);
	btrfs_set_lock_blocking(leaf);

	btrfs_mark_buffer_dirty(leaf);

	inode_add_bytes(inode, num_bytes);

	ins.objectid = disk_bytenr;
	ins.offset = disk_num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_alloc_reserved_file_extent(trans, root,
					       root->root_key.objectid,
					       inode->i_ino, file_pos, &ins);
	BUG_ON(ret);
	btrfs_free_path(path);

	return 0;
}

/*
 * helper function for btrfs_finish_ordered_io, this
 * just reads in some of the csum leaves to prime them into ram
 * before we start the transaction.  It limits the amount of btree
 * reads required while inside the transaction.
 */
static noinline void reada_csum(struct btrfs_root *root,
				struct btrfs_path *path,
				struct btrfs_ordered_extent *ordered_extent)
{
	struct btrfs_ordered_sum *sum;
	u64 bytenr;

	sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
			 list);
	bytenr = sum->sums[0].bytenr;

	/*
	 * we don't care about the results, the point of this search is
	 * just to get the btree leaves into ram
	 */
	btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
}

/* as ordered data IO finishes, this gets called so we can finish
 * an ordered extent if the range of bytes in the file it covers are
 * fully written.
 */
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct btrfs_ordered_extent *ordered_extent = NULL;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_path *path;
	int compressed = 0;
	int ret;

	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
	if (!ret)
		return 0;

	/*
	 * before we join the transaction, try to do some of our IO.
	 * This will limit the amount of IO that we have to do with
	 * the transaction running.  We're unlikely to need to do any
	 * IO if the file extents are new, the disk_i_size checks
	 * covers the most common case.
	 */
	if (start < BTRFS_I(inode)->disk_i_size) {
		path = btrfs_alloc_path();
		if (path) {
			ret = btrfs_lookup_file_extent(NULL, root, path,
						       inode->i_ino,
						       start, 0);
			ordered_extent = btrfs_lookup_ordered_extent(inode,
								     start);
			if (!list_empty(&ordered_extent->list)) {
				btrfs_release_path(root, path);
				reada_csum(root, path, ordered_extent);
			}
			btrfs_free_path(path);
		}
	}

	trans = btrfs_join_transaction(root, 1);

	if (!ordered_extent)
		ordered_extent = btrfs_lookup_ordered_extent(inode, start);
	BUG_ON(!ordered_extent);
	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
		goto nocow;

	lock_extent(io_tree, ordered_extent->file_offset,
		    ordered_extent->file_offset + ordered_extent->len - 1,
		    GFP_NOFS);

	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
		compressed = 1;
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
		BUG_ON(compressed);
		ret = btrfs_mark_extent_written(trans, root, inode,
						ordered_extent->file_offset,
						ordered_extent->file_offset +
						ordered_extent->len);
		BUG_ON(ret);
	} else {
		ret = insert_reserved_file_extent(trans, inode,
						ordered_extent->file_offset,
						ordered_extent->start,
						ordered_extent->disk_len,
						ordered_extent->len,
						ordered_extent->len,
						ordered_extent->file_offset +
						ordered_extent->len,
						compressed, 0, 0,
						BTRFS_FILE_EXTENT_REG);
		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
				   ordered_extent->file_offset,
				   ordered_extent->len);
		BUG_ON(ret);
	}
	unlock_extent(io_tree, ordered_extent->file_offset,
		      ordered_extent->file_offset + ordered_extent->len - 1,
		      GFP_NOFS);
nocow:
	add_pending_csums(trans, inode, ordered_extent->file_offset,
			  &ordered_extent->list);

	mutex_lock(&BTRFS_I(inode)->extent_mutex);
	btrfs_ordered_update_i_size(inode, ordered_extent);
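	/*
	 * ordered_update_i_size may have moved disk_i_size forward; write
	 * the inode item out inside this transaction before the ordered
	 * extent is dropped from the tree
	 */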
	btrfs_update_inode(trans, root, inode);
	btrfs_remove_ordered_extent(inode, ordered_extent);
	mutex_unlock(&BTRFS_I(inode)->extent_mutex);

	/* once for us */
	btrfs_put_ordered_extent(ordered_extent);
	/* once for the tree */
	btrfs_put_ordered_extent(ordered_extent);

	btrfs_end_transaction(trans, root);
	return 0;
}

static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
				struct extent_state *state, int uptodate)
{
	ClearPagePrivate2(page);
	return btrfs_finish_ordered_io(page->mapping->host, start, end);
}

/*
 * When IO fails, either with EIO or csum verification fails, we
 * try other mirrors that might have a good copy of the data.  This
 * io_failure_record is used to record state as we go through all the
 * mirrors.  If another mirror has good data, the page is set up to date
 * and things continue.  If a good mirror can't be found, the original
 * bio end_io callback is called to indicate things have failed.
 */
struct io_failure_record {
	struct page *page;
	u64 start;
	u64 len;
	u64 logical;
	unsigned long bio_flags;
	int last_mirror;
};

static int btrfs_io_failed_hook(struct bio *failed_bio,
			 struct page *page, u64 start, u64 end,
			 struct extent_state *state)
{
	struct io_failure_record *failrec = NULL;
	u64 private;
	struct extent_map *em;
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct bio *bio;
	int num_copies;
	int ret;
	int rw;
	u64 logical;

	ret = get_state_private(failure_tree, start, &private);
	if (ret) {
		failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
		if (!failrec)
			return -ENOMEM;
		failrec->start = start;
		failrec->len = end - start + 1;
		failrec->last_mirror = 0;
		failrec->bio_flags = 0;

		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, failrec->len);
		if (em && (em->start > start ||
			   em->start + em->len < start)) {
			free_extent_map(em);
			em = NULL;
		}
		read_unlock(&em_tree->lock);

		if (!em || IS_ERR(em)) {
			kfree(failrec);
			return -EIO;
		}
		logical = start - em->start;
		logical = em->block_start + logical;
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			logical = em->block_start;
			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
		}
		failrec->logical = logical;
		free_extent_map(em);
		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
				EXTENT_DIRTY, GFP_NOFS);
		set_state_private(failure_tree, start,
				  (u64)(unsigned long)failrec);
	} else {
		failrec = (struct io_failure_record *)(unsigned long)private;
	}
	num_copies = btrfs_num_copies(
			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
			      failrec->logical, failrec->len);
	failrec->last_mirror++;
	if (!state) {
		spin_lock(&BTRFS_I(inode)->io_tree.lock);
		state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
						    failrec->start,
						    EXTENT_LOCKED);
		if (state && state->start != failrec->start)
			state = NULL;
		spin_unlock(&BTRFS_I(inode)->io_tree.lock);
	}
	if (!state || failrec->last_mirror > num_copies) {
		set_state_private(failure_tree, failrec->start, 0);
		clear_extent_bits(failure_tree, failrec->start,
				  failrec->start + failrec->len - 1,
				  EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
		kfree(failrec);
		return -EIO;
	}
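	/* build a fresh bio aimed at the next mirror and resubmit it
	 * through the normal submission hook
	 */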
failrec->start + failrec->len - 1, 1886 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 1887 kfree(failrec); 1888 return -EIO; 1889 } 1890 bio = bio_alloc(GFP_NOFS, 1); 1891 bio->bi_private = state; 1892 bio->bi_end_io = failed_bio->bi_end_io; 1893 bio->bi_sector = failrec->logical >> 9; 1894 bio->bi_bdev = failed_bio->bi_bdev; 1895 bio->bi_size = 0; 1896 1897 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 1898 if (failed_bio->bi_rw & (1 << BIO_RW)) 1899 rw = WRITE; 1900 else 1901 rw = READ; 1902 1903 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1904 failrec->last_mirror, 1905 failrec->bio_flags); 1906 return 0; 1907 } 1908 1909 /* 1910 * each time an IO finishes, we do a fast check in the IO failure tree 1911 * to see if we need to process or clean up an io_failure_record 1912 */ 1913 static int btrfs_clean_io_failures(struct inode *inode, u64 start) 1914 { 1915 u64 private; 1916 u64 private_failure; 1917 struct io_failure_record *failure; 1918 int ret; 1919 1920 private = 0; 1921 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 1922 (u64)-1, 1, EXTENT_DIRTY)) { 1923 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, 1924 start, &private_failure); 1925 if (ret == 0) { 1926 failure = (struct io_failure_record *)(unsigned long) 1927 private_failure; 1928 set_state_private(&BTRFS_I(inode)->io_failure_tree, 1929 failure->start, 0); 1930 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, 1931 failure->start, 1932 failure->start + failure->len - 1, 1933 EXTENT_DIRTY | EXTENT_LOCKED, 1934 GFP_NOFS); 1935 kfree(failure); 1936 } 1937 } 1938 return 0; 1939 } 1940 1941 /* 1942 * when reads are done, we need to check csums to verify the data is correct 1943 * if there's a match, we allow the bio to finish. If not, we go through 1944 * the io_failure_record routines to find good copies 1945 */ 1946 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 1947 struct extent_state *state) 1948 { 1949 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); 1950 struct inode *inode = page->mapping->host; 1951 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1952 char *kaddr; 1953 u64 private = ~(u32)0; 1954 int ret; 1955 struct btrfs_root *root = BTRFS_I(inode)->root; 1956 u32 csum = ~(u32)0; 1957 1958 if (PageChecked(page)) { 1959 ClearPageChecked(page); 1960 goto good; 1961 } 1962 1963 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 1964 return 0; 1965 1966 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 1967 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 1968 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, 1969 GFP_NOFS); 1970 return 0; 1971 } 1972 1973 if (state && state->start == start) { 1974 private = state->private; 1975 ret = 0; 1976 } else { 1977 ret = get_state_private(io_tree, start, &private); 1978 } 1979 kaddr = kmap_atomic(page, KM_USER0); 1980 if (ret) 1981 goto zeroit; 1982 1983 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); 1984 btrfs_csum_final(csum, (char *)&csum); 1985 if (csum != private) 1986 goto zeroit; 1987 1988 kunmap_atomic(kaddr, KM_USER0); 1989 good: 1990 /* if the io failure tree for this inode is non-empty, 1991 * check to see if we've recovered from a failed IO 1992 */ 1993 btrfs_clean_io_failures(inode, start); 1994 return 0; 1995 1996 zeroit: 1997 if (printk_ratelimit()) { 1998 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " 1999 "private %llu\n", page->mapping->host->i_ino, 2000 (unsigned long 
long)start, csum, 2001 (unsigned long long)private); 2002 } 2003 memset(kaddr + offset, 1, end - start + 1); 2004 flush_dcache_page(page); 2005 kunmap_atomic(kaddr, KM_USER0); 2006 if (private == 0) 2007 return 0; 2008 return -EIO; 2009 } 2010 2011 /* 2012 * This creates an orphan entry for the given inode in case something goes 2013 * wrong in the middle of an unlink/truncate. 2014 */ 2015 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2016 { 2017 struct btrfs_root *root = BTRFS_I(inode)->root; 2018 int ret = 0; 2019 2020 spin_lock(&root->list_lock); 2021 2022 /* already on the orphan list, we're good */ 2023 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2024 spin_unlock(&root->list_lock); 2025 return 0; 2026 } 2027 2028 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2029 2030 spin_unlock(&root->list_lock); 2031 2032 /* 2033 * insert an orphan item to track this unlinked/truncated file 2034 */ 2035 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); 2036 2037 return ret; 2038 } 2039 2040 /* 2041 * We have done the truncate/delete so we can go ahead and remove the orphan 2042 * item for this particular inode. 2043 */ 2044 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2045 { 2046 struct btrfs_root *root = BTRFS_I(inode)->root; 2047 int ret = 0; 2048 2049 spin_lock(&root->list_lock); 2050 2051 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2052 spin_unlock(&root->list_lock); 2053 return 0; 2054 } 2055 2056 list_del_init(&BTRFS_I(inode)->i_orphan); 2057 if (!trans) { 2058 spin_unlock(&root->list_lock); 2059 return 0; 2060 } 2061 2062 spin_unlock(&root->list_lock); 2063 2064 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 2065 2066 return ret; 2067 } 2068 2069 /* 2070 * this cleans up any orphans that may be left on the list from the last use 2071 * of this root. 2072 */ 2073 void btrfs_orphan_cleanup(struct btrfs_root *root) 2074 { 2075 struct btrfs_path *path; 2076 struct extent_buffer *leaf; 2077 struct btrfs_item *item; 2078 struct btrfs_key key, found_key; 2079 struct btrfs_trans_handle *trans; 2080 struct inode *inode; 2081 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2082 2083 path = btrfs_alloc_path(); 2084 if (!path) 2085 return; 2086 path->reada = -1; 2087 2088 key.objectid = BTRFS_ORPHAN_OBJECTID; 2089 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 2090 key.offset = (u64)-1; 2091 2092 2093 while (1) { 2094 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2095 if (ret < 0) { 2096 printk(KERN_ERR "Error searching slot for orphan: %d" 2097 "\n", ret); 2098 break; 2099 } 2100 2101 /* 2102 * if ret == 0 means we found what we were searching for, which 2103 * is weird, but possible, so only screw with path if we didnt 2104 * find the key and see if we have stuff that matches 2105 */ 2106 if (ret > 0) { 2107 if (path->slots[0] == 0) 2108 break; 2109 path->slots[0]--; 2110 } 2111 2112 /* pull out the item */ 2113 leaf = path->nodes[0]; 2114 item = btrfs_item_nr(leaf, path->slots[0]); 2115 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2116 2117 /* make sure the item matches what we want */ 2118 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 2119 break; 2120 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) 2121 break; 2122 2123 /* release the path since we're done with it */ 2124 btrfs_release_path(root, path); 2125 2126 /* 2127 * this is where we are basically btrfs_lookup, without the 2128 * crossing root thing. we store the inode number in the 2129 * offset of the orphan item. 
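 * The orphan item key is (BTRFS_ORPHAN_OBJECTID, ORPHAN_ITEM_KEY,
 * inode number), so moving the offset into the objectid below gives
 * us the key of the inode item itself.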
2130  */
2131 		found_key.objectid = found_key.offset;
2132 		found_key.type = BTRFS_INODE_ITEM_KEY;
2133 		found_key.offset = 0;
2134 		inode = btrfs_iget(root->fs_info->sb, &found_key, root);
2135 		if (IS_ERR(inode))
2136 			break;
2137 
2138 		/*
2139 		 * add this inode to the orphan list so btrfs_orphan_del does
2140 		 * the proper thing when we hit it
2141 		 */
2142 		spin_lock(&root->list_lock);
2143 		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2144 		spin_unlock(&root->list_lock);
2145 
2146 		/*
2147 		 * if this is a bad inode, it means we actually succeeded in
2148 		 * removing the inode, but not the orphan record, which means
2149 		 * we need to manually delete the orphan since iput will just
2150 		 * do a destroy_inode
2151 		 */
2152 		if (is_bad_inode(inode)) {
2153 			trans = btrfs_start_transaction(root, 1);
2154 			btrfs_orphan_del(trans, inode);
2155 			btrfs_end_transaction(trans, root);
2156 			iput(inode);
2157 			continue;
2158 		}
2159 
2160 		/* if we have links, this was a truncate, let's do that */
2161 		if (inode->i_nlink) {
2162 			nr_truncate++;
2163 			btrfs_truncate(inode);
2164 		} else {
2165 			nr_unlink++;
2166 		}
2167 
2168 		/* this will do delete_inode and everything for us */
2169 		iput(inode);
2170 	}
2171 
2172 	if (nr_unlink)
2173 		printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2174 	if (nr_truncate)
2175 		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2176 
2177 	btrfs_free_path(path);
2178 }
2179 
2180 /*
2181  * very simple check to peek ahead in the leaf looking for xattrs.  If we
2182  * don't find any xattrs, we know there can't be any acls.
2183  *
2184  * slot is the slot the inode is in, objectid is the objectid of the inode
2185  */
2186 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
2187 					  int slot, u64 objectid)
2188 {
2189 	u32 nritems = btrfs_header_nritems(leaf);
2190 	struct btrfs_key found_key;
2191 	int scanned = 0;
2192 
2193 	slot++;
2194 	while (slot < nritems) {
2195 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
2196 
2197 		/* we found a different objectid, there must not be acls */
2198 		if (found_key.objectid != objectid)
2199 			return 0;
2200 
2201 		/* we found an xattr, assume we've got an acl */
2202 		if (found_key.type == BTRFS_XATTR_ITEM_KEY)
2203 			return 1;
2204 
2205 		/*
2206 		 * we found a key greater than an xattr key, there can't
2207 		 * be any acls later on
2208 		 */
2209 		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
2210 			return 0;
2211 
2212 		slot++;
2213 		scanned++;
2214 
2215 		/*
2216 		 * it goes inode, inode backrefs, xattrs, extents,
2217 		 * so if there are a ton of hard links to an inode there can
2218 		 * be a lot of backrefs.  Don't waste time searching too hard,
2219 		 * this is just an optimization
2220 		 */
2221 		if (scanned >= 8)
2222 			break;
2223 	}
2224 	/* we hit the end of the leaf before we found an xattr or
2225 	 * something larger than an xattr.
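An error in that direction only costs one extra xattr lookup; claiming there is no acl when one exists would be a real bug.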
We have to assume the inode 2226 * has acls 2227 */ 2228 return 1; 2229 } 2230 2231 /* 2232 * read an inode from the btree into the in-memory inode 2233 */ 2234 static void btrfs_read_locked_inode(struct inode *inode) 2235 { 2236 struct btrfs_path *path; 2237 struct extent_buffer *leaf; 2238 struct btrfs_inode_item *inode_item; 2239 struct btrfs_timespec *tspec; 2240 struct btrfs_root *root = BTRFS_I(inode)->root; 2241 struct btrfs_key location; 2242 int maybe_acls; 2243 u64 alloc_group_block; 2244 u32 rdev; 2245 int ret; 2246 2247 path = btrfs_alloc_path(); 2248 BUG_ON(!path); 2249 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2250 2251 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 2252 if (ret) 2253 goto make_bad; 2254 2255 leaf = path->nodes[0]; 2256 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2257 struct btrfs_inode_item); 2258 2259 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2260 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); 2261 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2262 inode->i_gid = btrfs_inode_gid(leaf, inode_item); 2263 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 2264 2265 tspec = btrfs_inode_atime(inode_item); 2266 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2267 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2268 2269 tspec = btrfs_inode_mtime(inode_item); 2270 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2271 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2272 2273 tspec = btrfs_inode_ctime(inode_item); 2274 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2275 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2276 2277 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2278 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2279 BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); 2280 inode->i_generation = BTRFS_I(inode)->generation; 2281 inode->i_rdev = 0; 2282 rdev = btrfs_inode_rdev(leaf, inode_item); 2283 2284 BTRFS_I(inode)->index_cnt = (u64)-1; 2285 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2286 2287 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2288 2289 /* 2290 * try to precache a NULL acl entry for files that don't have 2291 * any xattrs or acls 2292 */ 2293 maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); 2294 if (!maybe_acls) 2295 cache_no_acl(inode); 2296 2297 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2298 alloc_group_block, 0); 2299 btrfs_free_path(path); 2300 inode_item = NULL; 2301 2302 switch (inode->i_mode & S_IFMT) { 2303 case S_IFREG: 2304 inode->i_mapping->a_ops = &btrfs_aops; 2305 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2306 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 2307 inode->i_fop = &btrfs_file_operations; 2308 inode->i_op = &btrfs_file_inode_operations; 2309 break; 2310 case S_IFDIR: 2311 inode->i_fop = &btrfs_dir_file_operations; 2312 if (root == root->fs_info->tree_root) 2313 inode->i_op = &btrfs_dir_ro_inode_operations; 2314 else 2315 inode->i_op = &btrfs_dir_inode_operations; 2316 break; 2317 case S_IFLNK: 2318 inode->i_op = &btrfs_symlink_inode_operations; 2319 inode->i_mapping->a_ops = &btrfs_symlink_aops; 2320 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2321 break; 2322 default: 2323 inode->i_op = &btrfs_special_inode_operations; 2324 init_special_inode(inode, inode->i_mode, rdev); 2325 break; 2326 } 2327 2328 
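	/* sync the btrfs-specific inode flags into the generic VFS i_flags */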
btrfs_update_iflags(inode); 2329 return; 2330 2331 make_bad: 2332 btrfs_free_path(path); 2333 make_bad_inode(inode); 2334 } 2335 2336 /* 2337 * given a leaf and an inode, copy the inode fields into the leaf 2338 */ 2339 static void fill_inode_item(struct btrfs_trans_handle *trans, 2340 struct extent_buffer *leaf, 2341 struct btrfs_inode_item *item, 2342 struct inode *inode) 2343 { 2344 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2345 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2346 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2347 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2348 btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 2349 2350 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), 2351 inode->i_atime.tv_sec); 2352 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), 2353 inode->i_atime.tv_nsec); 2354 2355 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), 2356 inode->i_mtime.tv_sec); 2357 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), 2358 inode->i_mtime.tv_nsec); 2359 2360 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), 2361 inode->i_ctime.tv_sec); 2362 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), 2363 inode->i_ctime.tv_nsec); 2364 2365 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2366 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 2367 btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); 2368 btrfs_set_inode_transid(leaf, item, trans->transid); 2369 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2370 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2371 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); 2372 } 2373 2374 /* 2375 * copy everything in the in-memory inode into the btree. 2376 */ 2377 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2378 struct btrfs_root *root, struct inode *inode) 2379 { 2380 struct btrfs_inode_item *inode_item; 2381 struct btrfs_path *path; 2382 struct extent_buffer *leaf; 2383 int ret; 2384 2385 path = btrfs_alloc_path(); 2386 BUG_ON(!path); 2387 path->leave_spinning = 1; 2388 ret = btrfs_lookup_inode(trans, root, path, 2389 &BTRFS_I(inode)->location, 1); 2390 if (ret) { 2391 if (ret > 0) 2392 ret = -ENOENT; 2393 goto failed; 2394 } 2395 2396 btrfs_unlock_up_safe(path, 1); 2397 leaf = path->nodes[0]; 2398 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2399 struct btrfs_inode_item); 2400 2401 fill_inode_item(trans, leaf, inode_item, inode); 2402 btrfs_mark_buffer_dirty(leaf); 2403 btrfs_set_inode_last_trans(trans, inode); 2404 ret = 0; 2405 failed: 2406 btrfs_free_path(path); 2407 return ret; 2408 } 2409 2410 2411 /* 2412 * unlink helper that gets used here in inode.c and in the tree logging 2413 * recovery code. 
It removes a link in a directory with a given name, and
2414  * also drops the back refs from the inode to the directory
2415  */
2416 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2417 		       struct btrfs_root *root,
2418 		       struct inode *dir, struct inode *inode,
2419 		       const char *name, int name_len)
2420 {
2421 	struct btrfs_path *path;
2422 	int ret = 0;
2423 	struct extent_buffer *leaf;
2424 	struct btrfs_dir_item *di;
2425 	struct btrfs_key key;
2426 	u64 index;
2427 
2428 	path = btrfs_alloc_path();
2429 	if (!path) {
2430 		ret = -ENOMEM;
2431 		goto err;
2432 	}
2433 
2434 	path->leave_spinning = 1;
2435 	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2436 				   name, name_len, -1);
2437 	if (IS_ERR(di)) {
2438 		ret = PTR_ERR(di);
2439 		goto err;
2440 	}
2441 	if (!di) {
2442 		ret = -ENOENT;
2443 		goto err;
2444 	}
2445 	leaf = path->nodes[0];
2446 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
2447 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
2448 	if (ret)
2449 		goto err;
2450 	btrfs_release_path(root, path);
2451 
2452 	ret = btrfs_del_inode_ref(trans, root, name, name_len,
2453 				  inode->i_ino,
2454 				  dir->i_ino, &index);
2455 	if (ret) {
2456 		printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
2457 		       "inode %lu parent %lu\n", name_len, name,
2458 		       inode->i_ino, dir->i_ino);
2459 		goto err;
2460 	}
2461 
2462 	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2463 					 index, name, name_len, -1);
2464 	if (IS_ERR(di)) {
2465 		ret = PTR_ERR(di);
2466 		goto err;
2467 	}
2468 	if (!di) {
2469 		ret = -ENOENT;
2470 		goto err;
2471 	}
2472 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
2473 	btrfs_release_path(root, path);
2474 
2475 	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2476 					 inode, dir->i_ino);
2477 	BUG_ON(ret != 0 && ret != -ENOENT);
2478 
2479 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2480 					   dir, index);
2481 	BUG_ON(ret);
2482 err:
2483 	btrfs_free_path(path);
2484 	if (ret)
2485 		goto out;
2486 
2487 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2488 	inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2489 	btrfs_update_inode(trans, root, dir);
2490 	btrfs_drop_nlink(inode);
2491 	ret = btrfs_update_inode(trans, root, inode);
2492 out:
2493 	return ret;
2494 }
2495 
2496 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2497 {
2498 	struct btrfs_root *root;
2499 	struct btrfs_trans_handle *trans;
2500 	struct inode *inode = dentry->d_inode;
2501 	int ret;
2502 	unsigned long nr = 0;
2503 
2504 	root = BTRFS_I(dir)->root;
2505 
2506 	/*
2507 	 * 5 items for unlink inode
2508 	 * 1 for orphan
2509 	 */
2510 	ret = btrfs_reserve_metadata_space(root, 6);
2511 	if (ret)
2512 		return ret;
2513 
2514 	trans = btrfs_start_transaction(root, 1);
2515 	if (IS_ERR(trans)) {
2516 		btrfs_unreserve_metadata_space(root, 6);
2517 		return PTR_ERR(trans);
2518 	}
2519 
2520 	btrfs_set_trans_block_group(trans, dir);
2521 
2522 	btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2523 
2524 	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2525 				 dentry->d_name.name, dentry->d_name.len);
2526 
2527 	if (inode->i_nlink == 0)
2528 		ret = btrfs_orphan_add(trans, inode);
2529 
2530 	nr = trans->blocks_used;
2531 
2532 	btrfs_end_transaction_throttle(trans, root);
2533 	btrfs_unreserve_metadata_space(root, 6);
2534 	btrfs_btree_balance_dirty(root, nr);
2535 	return ret;
2536 }
2537 
2538 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
2539 			struct btrfs_root *root,
2540 			struct inode *dir, u64 objectid,
2541 			const char *name, int name_len)
2542 {
2543 	struct btrfs_path *path;
2544 struct extent_buffer *leaf; 2545 struct btrfs_dir_item *di; 2546 struct btrfs_key key; 2547 u64 index; 2548 int ret; 2549 2550 path = btrfs_alloc_path(); 2551 if (!path) 2552 return -ENOMEM; 2553 2554 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2555 name, name_len, -1); 2556 BUG_ON(!di || IS_ERR(di)); 2557 2558 leaf = path->nodes[0]; 2559 btrfs_dir_item_key_to_cpu(leaf, di, &key); 2560 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 2561 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2562 BUG_ON(ret); 2563 btrfs_release_path(root, path); 2564 2565 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, 2566 objectid, root->root_key.objectid, 2567 dir->i_ino, &index, name, name_len); 2568 if (ret < 0) { 2569 BUG_ON(ret != -ENOENT); 2570 di = btrfs_search_dir_index_item(root, path, dir->i_ino, 2571 name, name_len); 2572 BUG_ON(!di || IS_ERR(di)); 2573 2574 leaf = path->nodes[0]; 2575 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2576 btrfs_release_path(root, path); 2577 index = key.offset; 2578 } 2579 2580 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 2581 index, name, name_len, -1); 2582 BUG_ON(!di || IS_ERR(di)); 2583 2584 leaf = path->nodes[0]; 2585 btrfs_dir_item_key_to_cpu(leaf, di, &key); 2586 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 2587 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2588 BUG_ON(ret); 2589 btrfs_release_path(root, path); 2590 2591 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2592 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2593 ret = btrfs_update_inode(trans, root, dir); 2594 BUG_ON(ret); 2595 dir->i_sb->s_dirt = 1; 2596 2597 btrfs_free_path(path); 2598 return 0; 2599 } 2600 2601 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 2602 { 2603 struct inode *inode = dentry->d_inode; 2604 int err = 0; 2605 int ret; 2606 struct btrfs_root *root = BTRFS_I(dir)->root; 2607 struct btrfs_trans_handle *trans; 2608 unsigned long nr = 0; 2609 2610 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 2611 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2612 return -ENOTEMPTY; 2613 2614 ret = btrfs_reserve_metadata_space(root, 5); 2615 if (ret) 2616 return ret; 2617 2618 trans = btrfs_start_transaction(root, 1); 2619 if (IS_ERR(trans)) { 2620 btrfs_unreserve_metadata_space(root, 5); 2621 return PTR_ERR(trans); 2622 } 2623 2624 btrfs_set_trans_block_group(trans, dir); 2625 2626 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 2627 err = btrfs_unlink_subvol(trans, root, dir, 2628 BTRFS_I(inode)->location.objectid, 2629 dentry->d_name.name, 2630 dentry->d_name.len); 2631 goto out; 2632 } 2633 2634 err = btrfs_orphan_add(trans, inode); 2635 if (err) 2636 goto out; 2637 2638 /* now the directory is empty */ 2639 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2640 dentry->d_name.name, dentry->d_name.len); 2641 if (!err) 2642 btrfs_i_size_write(inode, 0); 2643 out: 2644 nr = trans->blocks_used; 2645 ret = btrfs_end_transaction_throttle(trans, root); 2646 btrfs_unreserve_metadata_space(root, 5); 2647 btrfs_btree_balance_dirty(root, nr); 2648 2649 if (ret && !err) 2650 err = ret; 2651 return err; 2652 } 2653 2654 #if 0 2655 /* 2656 * when truncating bytes in a file, it is possible to avoid reading 2657 * the leaves that contain only checksum items. This can be the 2658 * majority of the IO required to delete a large file, but it must 2659 * be done carefully. 
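 * (everything from here to the matching #endif sits inside an #if 0
 * block, so this helper is currently compiled out and kept only for
 * reference)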
2660  *
2661  * The keys in the level just above the leaves are checked to make sure
2662  * the lowest key in a given leaf is a csum key, and starts at an offset
2663  * after the new size.
2664  *
2665  * Then the key for the next leaf is checked to make sure it also has
2666  * a checksum item for the same file.  If it does, we know our target leaf
2667  * contains only checksum items, and it can be safely freed without reading
2668  * it.
2669  *
2670  * This is just an optimization targeted at large files.  It may do
2671  * nothing.  It will return 0 unless things went badly.
2672  */
2673 static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
2674 				     struct btrfs_root *root,
2675 				     struct btrfs_path *path,
2676 				     struct inode *inode, u64 new_size)
2677 {
2678 	struct btrfs_key key;
2679 	int ret;
2680 	int nritems;
2681 	struct btrfs_key found_key;
2682 	struct btrfs_key other_key;
2683 	struct btrfs_leaf_ref *ref;
2684 	u64 leaf_gen;
2685 	u64 leaf_start;
2686 
2687 	path->lowest_level = 1;
2688 	key.objectid = inode->i_ino;
2689 	key.type = BTRFS_CSUM_ITEM_KEY;
2690 	key.offset = new_size;
2691 again:
2692 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2693 	if (ret < 0)
2694 		goto out;
2695 
2696 	if (path->nodes[1] == NULL) {
2697 		ret = 0;
2698 		goto out;
2699 	}
2700 	ret = 0;
2701 	btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
2702 	nritems = btrfs_header_nritems(path->nodes[1]);
2703 
2704 	if (!nritems)
2705 		goto out;
2706 
2707 	if (path->slots[1] >= nritems)
2708 		goto next_node;
2709 
2710 	/* did we find a key greater than anything we want to delete? */
2711 	if (found_key.objectid > inode->i_ino ||
2712 	    (found_key.objectid == inode->i_ino && found_key.type > key.type))
2713 		goto out;
2714 
2715 	/* we check the next key in the node to make sure the leaf contains
2716 	 * only checksum items.  This comparison doesn't work if our
2717 	 * leaf is the last one in the node
2718 	 */
2719 	if (path->slots[1] + 1 >= nritems) {
2720 next_node:
2721 		/* search forward from the last key in the node, this
2722 		 * will bring us into the next node in the tree
2723 		 */
2724 		btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
2725 
2726 		/* unlikely, but we inc below, so check to be safe */
2727 		if (found_key.offset == (u64)-1)
2728 			goto out;
2729 
2730 		/* search_forward needs a path with locks held, do the
2731 		 * search again for the original key.  It is possible
2732 		 * this will race with a balance and return a path that
2733 		 * we could modify, but this drop is just an optimization
2734 		 * and is allowed to miss some leaves.
2735 */ 2736 btrfs_release_path(root, path); 2737 found_key.offset++; 2738 2739 /* setup a max key for search_forward */ 2740 other_key.offset = (u64)-1; 2741 other_key.type = key.type; 2742 other_key.objectid = key.objectid; 2743 2744 path->keep_locks = 1; 2745 ret = btrfs_search_forward(root, &found_key, &other_key, 2746 path, 0, 0); 2747 path->keep_locks = 0; 2748 if (ret || found_key.objectid != key.objectid || 2749 found_key.type != key.type) { 2750 ret = 0; 2751 goto out; 2752 } 2753 2754 key.offset = found_key.offset; 2755 btrfs_release_path(root, path); 2756 cond_resched(); 2757 goto again; 2758 } 2759 2760 /* we know there's one more slot after us in the tree, 2761 * read that key so we can verify it is also a checksum item 2762 */ 2763 btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1); 2764 2765 if (found_key.objectid < inode->i_ino) 2766 goto next_key; 2767 2768 if (found_key.type != key.type || found_key.offset < new_size) 2769 goto next_key; 2770 2771 /* 2772 * if the key for the next leaf isn't a csum key from this objectid, 2773 * we can't be sure there aren't good items inside this leaf. 2774 * Bail out 2775 */ 2776 if (other_key.objectid != inode->i_ino || other_key.type != key.type) 2777 goto out; 2778 2779 leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]); 2780 leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]); 2781 /* 2782 * it is safe to delete this leaf, it contains only 2783 * csum items from this inode at an offset >= new_size 2784 */ 2785 ret = btrfs_del_leaf(trans, root, path, leaf_start); 2786 BUG_ON(ret); 2787 2788 if (root->ref_cows && leaf_gen < trans->transid) { 2789 ref = btrfs_alloc_leaf_ref(root, 0); 2790 if (ref) { 2791 ref->root_gen = root->root_key.offset; 2792 ref->bytenr = leaf_start; 2793 ref->owner = 0; 2794 ref->generation = leaf_gen; 2795 ref->nritems = 0; 2796 2797 btrfs_sort_leaf_ref(ref); 2798 2799 ret = btrfs_add_leaf_ref(root, ref, 0); 2800 WARN_ON(ret); 2801 btrfs_free_leaf_ref(root, ref); 2802 } else { 2803 WARN_ON(1); 2804 } 2805 } 2806 next_key: 2807 btrfs_release_path(root, path); 2808 2809 if (other_key.objectid == inode->i_ino && 2810 other_key.type == key.type && other_key.offset > key.offset) { 2811 key.offset = other_key.offset; 2812 cond_resched(); 2813 goto again; 2814 } 2815 ret = 0; 2816 out: 2817 /* fixup any changes we've made to the path */ 2818 path->lowest_level = 0; 2819 path->keep_locks = 0; 2820 btrfs_release_path(root, path); 2821 return ret; 2822 } 2823 2824 #endif 2825 2826 /* 2827 * this can truncate away extent items, csum items and directory items. 2828 * It starts at a high offset and removes keys until it can't find 2829 * any higher than new_size 2830 * 2831 * csum items that cross the new i_size are truncated to the new size 2832 * as well. 2833 * 2834 * min_type is the minimum key type to truncate down to. If set to 0, this 2835 * will kill all the items on this inode, including the INODE_ITEM_KEY. 
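 * Passing BTRFS_EXTENT_DATA_KEY, for example, drops the file extent
 * items but keeps the inode item, inode refs and xattrs, since those
 * key types compare lower.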
2836 */ 2837 noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 2838 struct btrfs_root *root, 2839 struct inode *inode, 2840 u64 new_size, u32 min_type) 2841 { 2842 int ret; 2843 struct btrfs_path *path; 2844 struct btrfs_key key; 2845 struct btrfs_key found_key; 2846 u32 found_type = (u8)-1; 2847 struct extent_buffer *leaf; 2848 struct btrfs_file_extent_item *fi; 2849 u64 extent_start = 0; 2850 u64 extent_num_bytes = 0; 2851 u64 extent_offset = 0; 2852 u64 item_end = 0; 2853 int found_extent; 2854 int del_item; 2855 int pending_del_nr = 0; 2856 int pending_del_slot = 0; 2857 int extent_type = -1; 2858 int encoding; 2859 u64 mask = root->sectorsize - 1; 2860 2861 if (root->ref_cows) 2862 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 2863 path = btrfs_alloc_path(); 2864 BUG_ON(!path); 2865 path->reada = -1; 2866 2867 /* FIXME, add redo link to tree so we don't leak on crash */ 2868 key.objectid = inode->i_ino; 2869 key.offset = (u64)-1; 2870 key.type = (u8)-1; 2871 2872 search_again: 2873 path->leave_spinning = 1; 2874 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2875 if (ret < 0) 2876 goto error; 2877 2878 if (ret > 0) { 2879 /* there are no items in the tree for us to truncate, we're 2880 * done 2881 */ 2882 if (path->slots[0] == 0) { 2883 ret = 0; 2884 goto error; 2885 } 2886 path->slots[0]--; 2887 } 2888 2889 while (1) { 2890 fi = NULL; 2891 leaf = path->nodes[0]; 2892 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2893 found_type = btrfs_key_type(&found_key); 2894 encoding = 0; 2895 2896 if (found_key.objectid != inode->i_ino) 2897 break; 2898 2899 if (found_type < min_type) 2900 break; 2901 2902 item_end = found_key.offset; 2903 if (found_type == BTRFS_EXTENT_DATA_KEY) { 2904 fi = btrfs_item_ptr(leaf, path->slots[0], 2905 struct btrfs_file_extent_item); 2906 extent_type = btrfs_file_extent_type(leaf, fi); 2907 encoding = btrfs_file_extent_compression(leaf, fi); 2908 encoding |= btrfs_file_extent_encryption(leaf, fi); 2909 encoding |= btrfs_file_extent_other_encoding(leaf, fi); 2910 2911 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 2912 item_end += 2913 btrfs_file_extent_num_bytes(leaf, fi); 2914 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2915 item_end += btrfs_file_extent_inline_len(leaf, 2916 fi); 2917 } 2918 item_end--; 2919 } 2920 if (item_end < new_size) { 2921 if (found_type == BTRFS_DIR_ITEM_KEY) 2922 found_type = BTRFS_INODE_ITEM_KEY; 2923 else if (found_type == BTRFS_EXTENT_ITEM_KEY) 2924 found_type = BTRFS_EXTENT_DATA_KEY; 2925 else if (found_type == BTRFS_EXTENT_DATA_KEY) 2926 found_type = BTRFS_XATTR_ITEM_KEY; 2927 else if (found_type == BTRFS_XATTR_ITEM_KEY) 2928 found_type = BTRFS_INODE_REF_KEY; 2929 else if (found_type) 2930 found_type--; 2931 else 2932 break; 2933 btrfs_set_key_type(&key, found_type); 2934 goto next; 2935 } 2936 if (found_key.offset >= new_size) 2937 del_item = 1; 2938 else 2939 del_item = 0; 2940 found_extent = 0; 2941 2942 /* FIXME, shrink the extent if the ref count is only 1 */ 2943 if (found_type != BTRFS_EXTENT_DATA_KEY) 2944 goto delete; 2945 2946 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 2947 u64 num_dec; 2948 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 2949 if (!del_item && !encoding) { 2950 u64 orig_num_bytes = 2951 btrfs_file_extent_num_bytes(leaf, fi); 2952 extent_num_bytes = new_size - 2953 found_key.offset + root->sectorsize - 1; 2954 extent_num_bytes = extent_num_bytes & 2955 ~((u64)root->sectorsize - 1); 2956 btrfs_set_file_extent_num_bytes(leaf, 
fi, 2957 extent_num_bytes); 2958 num_dec = (orig_num_bytes - 2959 extent_num_bytes); 2960 if (root->ref_cows && extent_start != 0) 2961 inode_sub_bytes(inode, num_dec); 2962 btrfs_mark_buffer_dirty(leaf); 2963 } else { 2964 extent_num_bytes = 2965 btrfs_file_extent_disk_num_bytes(leaf, 2966 fi); 2967 extent_offset = found_key.offset - 2968 btrfs_file_extent_offset(leaf, fi); 2969 2970 /* FIXME blocksize != 4096 */ 2971 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 2972 if (extent_start != 0) { 2973 found_extent = 1; 2974 if (root->ref_cows) 2975 inode_sub_bytes(inode, num_dec); 2976 } 2977 } 2978 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2979 /* 2980 * we can't truncate inline items that have had 2981 * special encodings 2982 */ 2983 if (!del_item && 2984 btrfs_file_extent_compression(leaf, fi) == 0 && 2985 btrfs_file_extent_encryption(leaf, fi) == 0 && 2986 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 2987 u32 size = new_size - found_key.offset; 2988 2989 if (root->ref_cows) { 2990 inode_sub_bytes(inode, item_end + 1 - 2991 new_size); 2992 } 2993 size = 2994 btrfs_file_extent_calc_inline_size(size); 2995 ret = btrfs_truncate_item(trans, root, path, 2996 size, 1); 2997 BUG_ON(ret); 2998 } else if (root->ref_cows) { 2999 inode_sub_bytes(inode, item_end + 1 - 3000 found_key.offset); 3001 } 3002 } 3003 delete: 3004 if (del_item) { 3005 if (!pending_del_nr) { 3006 /* no pending yet, add ourselves */ 3007 pending_del_slot = path->slots[0]; 3008 pending_del_nr = 1; 3009 } else if (pending_del_nr && 3010 path->slots[0] + 1 == pending_del_slot) { 3011 /* hop on the pending chunk */ 3012 pending_del_nr++; 3013 pending_del_slot = path->slots[0]; 3014 } else { 3015 BUG(); 3016 } 3017 } else { 3018 break; 3019 } 3020 if (found_extent && root->ref_cows) { 3021 btrfs_set_path_blocking(path); 3022 ret = btrfs_free_extent(trans, root, extent_start, 3023 extent_num_bytes, 0, 3024 btrfs_header_owner(leaf), 3025 inode->i_ino, extent_offset); 3026 BUG_ON(ret); 3027 } 3028 next: 3029 if (path->slots[0] == 0) { 3030 if (pending_del_nr) 3031 goto del_pending; 3032 btrfs_release_path(root, path); 3033 if (found_type == BTRFS_INODE_ITEM_KEY) 3034 break; 3035 goto search_again; 3036 } 3037 3038 path->slots[0]--; 3039 if (pending_del_nr && 3040 path->slots[0] + 1 != pending_del_slot) { 3041 struct btrfs_key debug; 3042 del_pending: 3043 btrfs_item_key_to_cpu(path->nodes[0], &debug, 3044 pending_del_slot); 3045 ret = btrfs_del_items(trans, root, path, 3046 pending_del_slot, 3047 pending_del_nr); 3048 BUG_ON(ret); 3049 pending_del_nr = 0; 3050 btrfs_release_path(root, path); 3051 if (found_type == BTRFS_INODE_ITEM_KEY) 3052 break; 3053 goto search_again; 3054 } 3055 } 3056 ret = 0; 3057 error: 3058 if (pending_del_nr) { 3059 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3060 pending_del_nr); 3061 } 3062 btrfs_free_path(path); 3063 return ret; 3064 } 3065 3066 /* 3067 * taken from block_truncate_page, but does cow as it zeros out 3068 * any bytes left in the last page in the file. 
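 * Routing the zeroing through the page cache and the delalloc bits
 * (instead of writing the block in place the way block_truncate_page
 * does) is what keeps this safe on a copy-on-write filesystem.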
3069 */ 3070 static int btrfs_truncate_page(struct address_space *mapping, loff_t from) 3071 { 3072 struct inode *inode = mapping->host; 3073 struct btrfs_root *root = BTRFS_I(inode)->root; 3074 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3075 struct btrfs_ordered_extent *ordered; 3076 char *kaddr; 3077 u32 blocksize = root->sectorsize; 3078 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3079 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3080 struct page *page; 3081 int ret = 0; 3082 u64 page_start; 3083 u64 page_end; 3084 3085 if ((offset & (blocksize - 1)) == 0) 3086 goto out; 3087 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 3088 if (ret) 3089 goto out; 3090 3091 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); 3092 if (ret) 3093 goto out; 3094 3095 ret = -ENOMEM; 3096 again: 3097 page = grab_cache_page(mapping, index); 3098 if (!page) { 3099 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3100 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 3101 goto out; 3102 } 3103 3104 page_start = page_offset(page); 3105 page_end = page_start + PAGE_CACHE_SIZE - 1; 3106 3107 if (!PageUptodate(page)) { 3108 ret = btrfs_readpage(NULL, page); 3109 lock_page(page); 3110 if (page->mapping != mapping) { 3111 unlock_page(page); 3112 page_cache_release(page); 3113 goto again; 3114 } 3115 if (!PageUptodate(page)) { 3116 ret = -EIO; 3117 goto out_unlock; 3118 } 3119 } 3120 wait_on_page_writeback(page); 3121 3122 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 3123 set_page_extent_mapped(page); 3124 3125 ordered = btrfs_lookup_ordered_extent(inode, page_start); 3126 if (ordered) { 3127 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3128 unlock_page(page); 3129 page_cache_release(page); 3130 btrfs_start_ordered_extent(inode, ordered, 1); 3131 btrfs_put_ordered_extent(ordered); 3132 goto again; 3133 } 3134 3135 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 3136 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3137 GFP_NOFS); 3138 3139 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 3140 if (ret) { 3141 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3142 goto out_unlock; 3143 } 3144 3145 ret = 0; 3146 if (offset != PAGE_CACHE_SIZE) { 3147 kaddr = kmap(page); 3148 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 3149 flush_dcache_page(page); 3150 kunmap(page); 3151 } 3152 ClearPageChecked(page); 3153 set_page_dirty(page); 3154 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3155 3156 out_unlock: 3157 if (ret) 3158 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3159 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 3160 unlock_page(page); 3161 page_cache_release(page); 3162 out: 3163 return ret; 3164 } 3165 3166 int btrfs_cont_expand(struct inode *inode, loff_t size) 3167 { 3168 struct btrfs_trans_handle *trans; 3169 struct btrfs_root *root = BTRFS_I(inode)->root; 3170 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3171 struct extent_map *em; 3172 u64 mask = root->sectorsize - 1; 3173 u64 hole_start = (inode->i_size + mask) & ~mask; 3174 u64 block_end = (size + mask) & ~mask; 3175 u64 last_byte; 3176 u64 cur_offset; 3177 u64 hole_size; 3178 int err = 0; 3179 3180 if (size <= hole_start) 3181 return 0; 3182 3183 err = btrfs_truncate_page(inode->i_mapping, inode->i_size); 3184 if (err) 3185 return err; 3186 3187 while (1) { 3188 struct btrfs_ordered_extent *ordered; 3189 btrfs_wait_ordered_range(inode, hole_start, 3190 block_end - hole_start); 3191 
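		/*
		 * a new ordered extent may have been started after the wait
		 * above, so check again under the extent lock and loop until
		 * the range is really idle
		 */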
lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3192 ordered = btrfs_lookup_ordered_extent(inode, hole_start); 3193 if (!ordered) 3194 break; 3195 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3196 btrfs_put_ordered_extent(ordered); 3197 } 3198 3199 trans = btrfs_start_transaction(root, 1); 3200 btrfs_set_trans_block_group(trans, inode); 3201 3202 cur_offset = hole_start; 3203 while (1) { 3204 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 3205 block_end - cur_offset, 0); 3206 BUG_ON(IS_ERR(em) || !em); 3207 last_byte = min(extent_map_end(em), block_end); 3208 last_byte = (last_byte + mask) & ~mask; 3209 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { 3210 u64 hint_byte = 0; 3211 hole_size = last_byte - cur_offset; 3212 err = btrfs_drop_extents(trans, root, inode, 3213 cur_offset, 3214 cur_offset + hole_size, 3215 block_end, 3216 cur_offset, &hint_byte, 1); 3217 if (err) 3218 break; 3219 3220 err = btrfs_reserve_metadata_space(root, 1); 3221 if (err) 3222 break; 3223 3224 err = btrfs_insert_file_extent(trans, root, 3225 inode->i_ino, cur_offset, 0, 3226 0, hole_size, 0, hole_size, 3227 0, 0, 0); 3228 btrfs_drop_extent_cache(inode, hole_start, 3229 last_byte - 1, 0); 3230 btrfs_unreserve_metadata_space(root, 1); 3231 } 3232 free_extent_map(em); 3233 cur_offset = last_byte; 3234 if (err || cur_offset >= block_end) 3235 break; 3236 } 3237 3238 btrfs_end_transaction(trans, root); 3239 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3240 return err; 3241 } 3242 3243 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3244 { 3245 struct inode *inode = dentry->d_inode; 3246 int err; 3247 3248 err = inode_change_ok(inode, attr); 3249 if (err) 3250 return err; 3251 3252 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3253 if (attr->ia_size > inode->i_size) { 3254 err = btrfs_cont_expand(inode, attr->ia_size); 3255 if (err) 3256 return err; 3257 } else if (inode->i_size > 0 && 3258 attr->ia_size == 0) { 3259 3260 /* we're truncating a file that used to have good 3261 * data down to zero. Make sure it gets into 3262 * the ordered flush list so that any new writes 3263 * get down to disk quickly. 
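	 * (the ordered_data_close flag set below is consumed when the
	 * file is released, which is where the flush actually starts)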
3264 */ 3265 BTRFS_I(inode)->ordered_data_close = 1; 3266 } 3267 } 3268 3269 err = inode_setattr(inode, attr); 3270 3271 if (!err && ((attr->ia_valid & ATTR_MODE))) 3272 err = btrfs_acl_chmod(inode); 3273 return err; 3274 } 3275 3276 void btrfs_delete_inode(struct inode *inode) 3277 { 3278 struct btrfs_trans_handle *trans; 3279 struct btrfs_root *root = BTRFS_I(inode)->root; 3280 unsigned long nr; 3281 int ret; 3282 3283 truncate_inode_pages(&inode->i_data, 0); 3284 if (is_bad_inode(inode)) { 3285 btrfs_orphan_del(NULL, inode); 3286 goto no_delete; 3287 } 3288 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3289 3290 if (inode->i_nlink > 0) { 3291 BUG_ON(btrfs_root_refs(&root->root_item) != 0); 3292 goto no_delete; 3293 } 3294 3295 btrfs_i_size_write(inode, 0); 3296 trans = btrfs_join_transaction(root, 1); 3297 3298 btrfs_set_trans_block_group(trans, inode); 3299 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0); 3300 if (ret) { 3301 btrfs_orphan_del(NULL, inode); 3302 goto no_delete_lock; 3303 } 3304 3305 btrfs_orphan_del(trans, inode); 3306 3307 nr = trans->blocks_used; 3308 clear_inode(inode); 3309 3310 btrfs_end_transaction(trans, root); 3311 btrfs_btree_balance_dirty(root, nr); 3312 return; 3313 3314 no_delete_lock: 3315 nr = trans->blocks_used; 3316 btrfs_end_transaction(trans, root); 3317 btrfs_btree_balance_dirty(root, nr); 3318 no_delete: 3319 clear_inode(inode); 3320 } 3321 3322 /* 3323 * this returns the key found in the dir entry in the location pointer. 3324 * If no dir entries were found, location->objectid is 0. 3325 */ 3326 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 3327 struct btrfs_key *location) 3328 { 3329 const char *name = dentry->d_name.name; 3330 int namelen = dentry->d_name.len; 3331 struct btrfs_dir_item *di; 3332 struct btrfs_path *path; 3333 struct btrfs_root *root = BTRFS_I(dir)->root; 3334 int ret = 0; 3335 3336 path = btrfs_alloc_path(); 3337 BUG_ON(!path); 3338 3339 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, 3340 namelen, 0); 3341 if (IS_ERR(di)) 3342 ret = PTR_ERR(di); 3343 3344 if (!di || IS_ERR(di)) 3345 goto out_err; 3346 3347 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 3348 out: 3349 btrfs_free_path(path); 3350 return ret; 3351 out_err: 3352 location->objectid = 0; 3353 goto out; 3354 } 3355 3356 /* 3357 * when we hit a tree root in a directory, the btrfs part of the inode 3358 * needs to be changed to reflect the root directory of the tree root. This 3359 * is kind of like crossing a mount point. 
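 * We look up the root ref item in the tree of tree roots, verify that
 * the name recorded there matches the dentry, and then return the
 * dirid of the subvolume root as the new location to search.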
3360 */ 3361 static int fixup_tree_root_location(struct btrfs_root *root, 3362 struct inode *dir, 3363 struct dentry *dentry, 3364 struct btrfs_key *location, 3365 struct btrfs_root **sub_root) 3366 { 3367 struct btrfs_path *path; 3368 struct btrfs_root *new_root; 3369 struct btrfs_root_ref *ref; 3370 struct extent_buffer *leaf; 3371 int ret; 3372 int err = 0; 3373 3374 path = btrfs_alloc_path(); 3375 if (!path) { 3376 err = -ENOMEM; 3377 goto out; 3378 } 3379 3380 err = -ENOENT; 3381 ret = btrfs_find_root_ref(root->fs_info->tree_root, path, 3382 BTRFS_I(dir)->root->root_key.objectid, 3383 location->objectid); 3384 if (ret) { 3385 if (ret < 0) 3386 err = ret; 3387 goto out; 3388 } 3389 3390 leaf = path->nodes[0]; 3391 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 3392 if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || 3393 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 3394 goto out; 3395 3396 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 3397 (unsigned long)(ref + 1), 3398 dentry->d_name.len); 3399 if (ret) 3400 goto out; 3401 3402 btrfs_release_path(root->fs_info->tree_root, path); 3403 3404 new_root = btrfs_read_fs_root_no_name(root->fs_info, location); 3405 if (IS_ERR(new_root)) { 3406 err = PTR_ERR(new_root); 3407 goto out; 3408 } 3409 3410 if (btrfs_root_refs(&new_root->root_item) == 0) { 3411 err = -ENOENT; 3412 goto out; 3413 } 3414 3415 *sub_root = new_root; 3416 location->objectid = btrfs_root_dirid(&new_root->root_item); 3417 location->type = BTRFS_INODE_ITEM_KEY; 3418 location->offset = 0; 3419 err = 0; 3420 out: 3421 btrfs_free_path(path); 3422 return err; 3423 } 3424 3425 static void inode_tree_add(struct inode *inode) 3426 { 3427 struct btrfs_root *root = BTRFS_I(inode)->root; 3428 struct btrfs_inode *entry; 3429 struct rb_node **p; 3430 struct rb_node *parent; 3431 again: 3432 p = &root->inode_tree.rb_node; 3433 parent = NULL; 3434 3435 if (hlist_unhashed(&inode->i_hash)) 3436 return; 3437 3438 spin_lock(&root->inode_lock); 3439 while (*p) { 3440 parent = *p; 3441 entry = rb_entry(parent, struct btrfs_inode, rb_node); 3442 3443 if (inode->i_ino < entry->vfs_inode.i_ino) 3444 p = &parent->rb_left; 3445 else if (inode->i_ino > entry->vfs_inode.i_ino) 3446 p = &parent->rb_right; 3447 else { 3448 WARN_ON(!(entry->vfs_inode.i_state & 3449 (I_WILL_FREE | I_FREEING | I_CLEAR))); 3450 rb_erase(parent, &root->inode_tree); 3451 RB_CLEAR_NODE(parent); 3452 spin_unlock(&root->inode_lock); 3453 goto again; 3454 } 3455 } 3456 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); 3457 rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3458 spin_unlock(&root->inode_lock); 3459 } 3460 3461 static void inode_tree_del(struct inode *inode) 3462 { 3463 struct btrfs_root *root = BTRFS_I(inode)->root; 3464 int empty = 0; 3465 3466 spin_lock(&root->inode_lock); 3467 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 3468 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3469 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 3470 empty = RB_EMPTY_ROOT(&root->inode_tree); 3471 } 3472 spin_unlock(&root->inode_lock); 3473 3474 if (empty && btrfs_root_refs(&root->root_item) == 0) { 3475 synchronize_srcu(&root->fs_info->subvol_srcu); 3476 spin_lock(&root->inode_lock); 3477 empty = RB_EMPTY_ROOT(&root->inode_tree); 3478 spin_unlock(&root->inode_lock); 3479 if (empty) 3480 btrfs_add_dead_root(root); 3481 } 3482 } 3483 3484 int btrfs_invalidate_inodes(struct btrfs_root *root) 3485 { 3486 struct rb_node *node; 3487 struct rb_node *prev; 3488 struct btrfs_inode 
*entry; 3489 struct inode *inode; 3490 u64 objectid = 0; 3491 3492 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 3493 3494 spin_lock(&root->inode_lock); 3495 again: 3496 node = root->inode_tree.rb_node; 3497 prev = NULL; 3498 while (node) { 3499 prev = node; 3500 entry = rb_entry(node, struct btrfs_inode, rb_node); 3501 3502 if (objectid < entry->vfs_inode.i_ino) 3503 node = node->rb_left; 3504 else if (objectid > entry->vfs_inode.i_ino) 3505 node = node->rb_right; 3506 else 3507 break; 3508 } 3509 if (!node) { 3510 while (prev) { 3511 entry = rb_entry(prev, struct btrfs_inode, rb_node); 3512 if (objectid <= entry->vfs_inode.i_ino) { 3513 node = prev; 3514 break; 3515 } 3516 prev = rb_next(prev); 3517 } 3518 } 3519 while (node) { 3520 entry = rb_entry(node, struct btrfs_inode, rb_node); 3521 objectid = entry->vfs_inode.i_ino + 1; 3522 inode = igrab(&entry->vfs_inode); 3523 if (inode) { 3524 spin_unlock(&root->inode_lock); 3525 if (atomic_read(&inode->i_count) > 1) 3526 d_prune_aliases(inode); 3527 /* 3528 * btrfs_drop_inode will remove it from 3529 * the inode cache when its usage count 3530 * hits zero. 3531 */ 3532 iput(inode); 3533 cond_resched(); 3534 spin_lock(&root->inode_lock); 3535 goto again; 3536 } 3537 3538 if (cond_resched_lock(&root->inode_lock)) 3539 goto again; 3540 3541 node = rb_next(node); 3542 } 3543 spin_unlock(&root->inode_lock); 3544 return 0; 3545 } 3546 3547 static noinline void init_btrfs_i(struct inode *inode) 3548 { 3549 struct btrfs_inode *bi = BTRFS_I(inode); 3550 3551 bi->generation = 0; 3552 bi->sequence = 0; 3553 bi->last_trans = 0; 3554 bi->last_sub_trans = 0; 3555 bi->logged_trans = 0; 3556 bi->delalloc_bytes = 0; 3557 bi->reserved_bytes = 0; 3558 bi->disk_i_size = 0; 3559 bi->flags = 0; 3560 bi->index_cnt = (u64)-1; 3561 bi->last_unlink_trans = 0; 3562 bi->ordered_data_close = 0; 3563 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3564 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3565 inode->i_mapping, GFP_NOFS); 3566 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, 3567 inode->i_mapping, GFP_NOFS); 3568 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3569 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); 3570 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 3571 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3572 mutex_init(&BTRFS_I(inode)->extent_mutex); 3573 mutex_init(&BTRFS_I(inode)->log_mutex); 3574 } 3575 3576 static int btrfs_init_locked_inode(struct inode *inode, void *p) 3577 { 3578 struct btrfs_iget_args *args = p; 3579 inode->i_ino = args->ino; 3580 init_btrfs_i(inode); 3581 BTRFS_I(inode)->root = args->root; 3582 btrfs_set_inode_space_info(args->root, inode); 3583 return 0; 3584 } 3585 3586 static int btrfs_find_actor(struct inode *inode, void *opaque) 3587 { 3588 struct btrfs_iget_args *args = opaque; 3589 return args->ino == inode->i_ino && 3590 args->root == BTRFS_I(inode)->root; 3591 } 3592 3593 static struct inode *btrfs_iget_locked(struct super_block *s, 3594 u64 objectid, 3595 struct btrfs_root *root) 3596 { 3597 struct inode *inode; 3598 struct btrfs_iget_args args; 3599 args.ino = objectid; 3600 args.root = root; 3601 3602 inode = iget5_locked(s, objectid, btrfs_find_actor, 3603 btrfs_init_locked_inode, 3604 (void *)&args); 3605 return inode; 3606 } 3607 3608 /* Get an inode object given its location and corresponding root. 
3609 * Returns in *is_new if the inode was read from disk 3610 */ 3611 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 3612 struct btrfs_root *root) 3613 { 3614 struct inode *inode; 3615 3616 inode = btrfs_iget_locked(s, location->objectid, root); 3617 if (!inode) 3618 return ERR_PTR(-ENOMEM); 3619 3620 if (inode->i_state & I_NEW) { 3621 BTRFS_I(inode)->root = root; 3622 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 3623 btrfs_read_locked_inode(inode); 3624 3625 inode_tree_add(inode); 3626 unlock_new_inode(inode); 3627 } 3628 3629 return inode; 3630 } 3631 3632 static struct inode *new_simple_dir(struct super_block *s, 3633 struct btrfs_key *key, 3634 struct btrfs_root *root) 3635 { 3636 struct inode *inode = new_inode(s); 3637 3638 if (!inode) 3639 return ERR_PTR(-ENOMEM); 3640 3641 init_btrfs_i(inode); 3642 3643 BTRFS_I(inode)->root = root; 3644 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 3645 BTRFS_I(inode)->dummy_inode = 1; 3646 3647 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 3648 inode->i_op = &simple_dir_inode_operations; 3649 inode->i_fop = &simple_dir_operations; 3650 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 3651 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 3652 3653 return inode; 3654 } 3655 3656 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 3657 { 3658 struct inode *inode; 3659 struct btrfs_root *root = BTRFS_I(dir)->root; 3660 struct btrfs_root *sub_root = root; 3661 struct btrfs_key location; 3662 int index; 3663 int ret; 3664 3665 dentry->d_op = &btrfs_dentry_operations; 3666 3667 if (dentry->d_name.len > BTRFS_NAME_LEN) 3668 return ERR_PTR(-ENAMETOOLONG); 3669 3670 ret = btrfs_inode_by_name(dir, dentry, &location); 3671 3672 if (ret < 0) 3673 return ERR_PTR(ret); 3674 3675 if (location.objectid == 0) 3676 return NULL; 3677 3678 if (location.type == BTRFS_INODE_ITEM_KEY) { 3679 inode = btrfs_iget(dir->i_sb, &location, root); 3680 return inode; 3681 } 3682 3683 BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); 3684 3685 index = srcu_read_lock(&root->fs_info->subvol_srcu); 3686 ret = fixup_tree_root_location(root, dir, dentry, 3687 &location, &sub_root); 3688 if (ret < 0) { 3689 if (ret != -ENOENT) 3690 inode = ERR_PTR(ret); 3691 else 3692 inode = new_simple_dir(dir->i_sb, &location, sub_root); 3693 } else { 3694 inode = btrfs_iget(dir->i_sb, &location, sub_root); 3695 } 3696 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 3697 3698 return inode; 3699 } 3700 3701 static int btrfs_dentry_delete(struct dentry *dentry) 3702 { 3703 struct btrfs_root *root; 3704 3705 if (!dentry->d_inode && !IS_ROOT(dentry)) 3706 dentry = dentry->d_parent; 3707 3708 if (dentry->d_inode) { 3709 root = BTRFS_I(dentry->d_inode)->root; 3710 if (btrfs_root_refs(&root->root_item) == 0) 3711 return 1; 3712 } 3713 return 0; 3714 } 3715 3716 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 3717 struct nameidata *nd) 3718 { 3719 struct inode *inode; 3720 3721 inode = btrfs_lookup_dentry(dir, dentry); 3722 if (IS_ERR(inode)) 3723 return ERR_CAST(inode); 3724 3725 return d_splice_alias(inode, dentry); 3726 } 3727 3728 static unsigned char btrfs_filetype_table[] = { 3729 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 3730 }; 3731 3732 static int btrfs_real_readdir(struct file *filp, void *dirent, 3733 filldir_t filldir) 3734 { 3735 struct inode *inode = filp->f_dentry->d_inode; 3736 struct btrfs_root *root = BTRFS_I(inode)->root; 3737 
struct btrfs_item *item; 3738 struct btrfs_dir_item *di; 3739 struct btrfs_key key; 3740 struct btrfs_key found_key; 3741 struct btrfs_path *path; 3742 int ret; 3743 u32 nritems; 3744 struct extent_buffer *leaf; 3745 int slot; 3746 int advance; 3747 unsigned char d_type; 3748 int over = 0; 3749 u32 di_cur; 3750 u32 di_total; 3751 u32 di_len; 3752 int key_type = BTRFS_DIR_INDEX_KEY; 3753 char tmp_name[32]; 3754 char *name_ptr; 3755 int name_len; 3756 3757 /* FIXME, use a real flag for deciding about the key type */ 3758 if (root->fs_info->tree_root == root) 3759 key_type = BTRFS_DIR_ITEM_KEY; 3760 3761 /* special case for "." */ 3762 if (filp->f_pos == 0) { 3763 over = filldir(dirent, ".", 1, 3764 1, inode->i_ino, 3765 DT_DIR); 3766 if (over) 3767 return 0; 3768 filp->f_pos = 1; 3769 } 3770 /* special case for .., just use the back ref */ 3771 if (filp->f_pos == 1) { 3772 u64 pino = parent_ino(filp->f_path.dentry); 3773 over = filldir(dirent, "..", 2, 3774 2, pino, DT_DIR); 3775 if (over) 3776 return 0; 3777 filp->f_pos = 2; 3778 } 3779 path = btrfs_alloc_path(); 3780 path->reada = 2; 3781 3782 btrfs_set_key_type(&key, key_type); 3783 key.offset = filp->f_pos; 3784 key.objectid = inode->i_ino; 3785 3786 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3787 if (ret < 0) 3788 goto err; 3789 advance = 0; 3790 3791 while (1) { 3792 leaf = path->nodes[0]; 3793 nritems = btrfs_header_nritems(leaf); 3794 slot = path->slots[0]; 3795 if (advance || slot >= nritems) { 3796 if (slot >= nritems - 1) { 3797 ret = btrfs_next_leaf(root, path); 3798 if (ret) 3799 break; 3800 leaf = path->nodes[0]; 3801 nritems = btrfs_header_nritems(leaf); 3802 slot = path->slots[0]; 3803 } else { 3804 slot++; 3805 path->slots[0]++; 3806 } 3807 } 3808 3809 advance = 1; 3810 item = btrfs_item_nr(leaf, slot); 3811 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3812 3813 if (found_key.objectid != key.objectid) 3814 break; 3815 if (btrfs_key_type(&found_key) != key_type) 3816 break; 3817 if (found_key.offset < filp->f_pos) 3818 continue; 3819 3820 filp->f_pos = found_key.offset; 3821 3822 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 3823 di_cur = 0; 3824 di_total = btrfs_item_size(leaf, item); 3825 3826 while (di_cur < di_total) { 3827 struct btrfs_key location; 3828 3829 name_len = btrfs_dir_name_len(leaf, di); 3830 if (name_len <= sizeof(tmp_name)) { 3831 name_ptr = tmp_name; 3832 } else { 3833 name_ptr = kmalloc(name_len, GFP_NOFS); 3834 if (!name_ptr) { 3835 ret = -ENOMEM; 3836 goto err; 3837 } 3838 } 3839 read_extent_buffer(leaf, name_ptr, 3840 (unsigned long)(di + 1), name_len); 3841 3842 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; 3843 btrfs_dir_item_key_to_cpu(leaf, di, &location); 3844 3845 /* is this a reference to our own snapshot? If so 3846 * skip it 3847 */ 3848 if (location.type == BTRFS_ROOT_ITEM_KEY && 3849 location.objectid == root->root_key.objectid) { 3850 over = 0; 3851 goto skip; 3852 } 3853 over = filldir(dirent, name_ptr, name_len, 3854 found_key.offset, location.objectid, 3855 d_type); 3856 3857 skip: 3858 if (name_ptr != tmp_name) 3859 kfree(name_ptr); 3860 3861 if (over) 3862 goto nopos; 3863 di_len = btrfs_dir_name_len(leaf, di) + 3864 btrfs_dir_data_len(leaf, di) + sizeof(*di); 3865 di_cur += di_len; 3866 di = (struct btrfs_dir_item *)((char *)di + di_len); 3867 } 3868 } 3869 3870 /* Reached end of directory/root. Bump pos past the last item. 
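Dir index offsets are sparse u64 values with no fixed maximum, so for DIR_INDEX directories f_pos is parked at the largest off_t; the next readdir call then finds nothing and stops.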
*/ 3871 if (key_type == BTRFS_DIR_INDEX_KEY) 3872 filp->f_pos = INT_LIMIT(off_t); 3873 else 3874 filp->f_pos++; 3875 nopos: 3876 ret = 0; 3877 err: 3878 btrfs_free_path(path); 3879 return ret; 3880 } 3881 3882 int btrfs_write_inode(struct inode *inode, int wait) 3883 { 3884 struct btrfs_root *root = BTRFS_I(inode)->root; 3885 struct btrfs_trans_handle *trans; 3886 int ret = 0; 3887 3888 if (root->fs_info->btree_inode == inode) 3889 return 0; 3890 3891 if (wait) { 3892 trans = btrfs_join_transaction(root, 1); 3893 btrfs_set_trans_block_group(trans, inode); 3894 ret = btrfs_commit_transaction(trans, root); 3895 } 3896 return ret; 3897 } 3898 3899 /* 3900 * This is somewhat expensive, updating the tree every time the 3901 * inode changes. But, it is most likely to find the inode in cache. 3902 * FIXME, needs more benchmarking...there are no reasons other than performance 3903 * to keep or drop this code. 3904 */ 3905 void btrfs_dirty_inode(struct inode *inode) 3906 { 3907 struct btrfs_root *root = BTRFS_I(inode)->root; 3908 struct btrfs_trans_handle *trans; 3909 3910 trans = btrfs_join_transaction(root, 1); 3911 btrfs_set_trans_block_group(trans, inode); 3912 btrfs_update_inode(trans, root, inode); 3913 btrfs_end_transaction(trans, root); 3914 } 3915 3916 /* 3917 * find the highest existing sequence number in a directory 3918 * and then set the in-memory index_cnt variable to reflect 3919 * free sequence numbers 3920 */ 3921 static int btrfs_set_inode_index_count(struct inode *inode) 3922 { 3923 struct btrfs_root *root = BTRFS_I(inode)->root; 3924 struct btrfs_key key, found_key; 3925 struct btrfs_path *path; 3926 struct extent_buffer *leaf; 3927 int ret; 3928 3929 key.objectid = inode->i_ino; 3930 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 3931 key.offset = (u64)-1; 3932 3933 path = btrfs_alloc_path(); 3934 if (!path) 3935 return -ENOMEM; 3936 3937 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3938 if (ret < 0) 3939 goto out; 3940 /* FIXME: we should be able to handle this */ 3941 if (ret == 0) 3942 goto out; 3943 ret = 0; 3944 3945 /* 3946 * MAGIC NUMBER EXPLANATION: 3947 * since we search a directory based on f_pos we have to start at 2 3948 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody 3949 * else has to start at 2 3950 */ 3951 if (path->slots[0] == 0) { 3952 BTRFS_I(inode)->index_cnt = 2; 3953 goto out; 3954 } 3955 3956 path->slots[0]--; 3957 3958 leaf = path->nodes[0]; 3959 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3960 3961 if (found_key.objectid != inode->i_ino || 3962 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { 3963 BTRFS_I(inode)->index_cnt = 2; 3964 goto out; 3965 } 3966 3967 BTRFS_I(inode)->index_cnt = found_key.offset + 1; 3968 out: 3969 btrfs_free_path(path); 3970 return ret; 3971 } 3972 3973 /* 3974 * helper to find a free sequence number in a given directory. 
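The number found here doubles as the entry's readdir position (its DIR_INDEX key offset), which is why a monotonically increasing counter is good enough.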
This current 3975 * code is very simple, later versions will do smarter things in the btree 3976 */ 3977 int btrfs_set_inode_index(struct inode *dir, u64 *index) 3978 { 3979 int ret = 0; 3980 3981 if (BTRFS_I(dir)->index_cnt == (u64)-1) { 3982 ret = btrfs_set_inode_index_count(dir); 3983 if (ret) 3984 return ret; 3985 } 3986 3987 *index = BTRFS_I(dir)->index_cnt; 3988 BTRFS_I(dir)->index_cnt++; 3989 3990 return ret; 3991 } 3992 3993 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 3994 struct btrfs_root *root, 3995 struct inode *dir, 3996 const char *name, int name_len, 3997 u64 ref_objectid, u64 objectid, 3998 u64 alloc_hint, int mode, u64 *index) 3999 { 4000 struct inode *inode; 4001 struct btrfs_inode_item *inode_item; 4002 struct btrfs_key *location; 4003 struct btrfs_path *path; 4004 struct btrfs_inode_ref *ref; 4005 struct btrfs_key key[2]; 4006 u32 sizes[2]; 4007 unsigned long ptr; 4008 int ret; 4009 int owner; 4010 4011 path = btrfs_alloc_path(); 4012 BUG_ON(!path); 4013 4014 inode = new_inode(root->fs_info->sb); 4015 if (!inode) 4016 return ERR_PTR(-ENOMEM); 4017 4018 if (dir) { 4019 ret = btrfs_set_inode_index(dir, index); 4020 if (ret) { 4021 iput(inode); 4022 return ERR_PTR(ret); 4023 } 4024 } 4025 /* 4026 * index_cnt is ignored for everything but a dir, 4027 * btrfs_get_inode_index_count has an explanation for the magic 4028 * number 4029 */ 4030 init_btrfs_i(inode); 4031 BTRFS_I(inode)->index_cnt = 2; 4032 BTRFS_I(inode)->root = root; 4033 BTRFS_I(inode)->generation = trans->transid; 4034 btrfs_set_inode_space_info(root, inode); 4035 4036 if (mode & S_IFDIR) 4037 owner = 0; 4038 else 4039 owner = 1; 4040 BTRFS_I(inode)->block_group = 4041 btrfs_find_block_group(root, 0, alloc_hint, owner); 4042 4043 key[0].objectid = objectid; 4044 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 4045 key[0].offset = 0; 4046 4047 key[1].objectid = objectid; 4048 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 4049 key[1].offset = ref_objectid; 4050 4051 sizes[0] = sizeof(struct btrfs_inode_item); 4052 sizes[1] = name_len + sizeof(*ref); 4053 4054 path->leave_spinning = 1; 4055 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 4056 if (ret != 0) 4057 goto fail; 4058 4059 inode->i_uid = current_fsuid(); 4060 4061 if (dir && (dir->i_mode & S_ISGID)) { 4062 inode->i_gid = dir->i_gid; 4063 if (S_ISDIR(mode)) 4064 mode |= S_ISGID; 4065 } else 4066 inode->i_gid = current_fsgid(); 4067 4068 inode->i_mode = mode; 4069 inode->i_ino = objectid; 4070 inode_set_bytes(inode, 0); 4071 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4072 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4073 struct btrfs_inode_item); 4074 fill_inode_item(trans, path->nodes[0], inode_item, inode); 4075 4076 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 4077 struct btrfs_inode_ref); 4078 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 4079 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 4080 ptr = (unsigned long)(ref + 1); 4081 write_extent_buffer(path->nodes[0], name, ptr, name_len); 4082 4083 btrfs_mark_buffer_dirty(path->nodes[0]); 4084 btrfs_free_path(path); 4085 4086 location = &BTRFS_I(inode)->location; 4087 location->objectid = objectid; 4088 location->offset = 0; 4089 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 4090 4091 btrfs_inherit_iflags(inode, dir); 4092 4093 if ((mode & S_IFREG)) { 4094 if (btrfs_test_opt(root, NODATASUM)) 4095 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4096 if (btrfs_test_opt(root, 
NODATACOW)) 4097 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4098 } 4099 4100 insert_inode_hash(inode); 4101 inode_tree_add(inode); 4102 return inode; 4103 fail: 4104 if (dir) 4105 BTRFS_I(dir)->index_cnt--; 4106 btrfs_free_path(path); 4107 iput(inode); 4108 return ERR_PTR(ret); 4109 } 4110 4111 static inline u8 btrfs_inode_type(struct inode *inode) 4112 { 4113 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; 4114 } 4115 4116 /* 4117 * utility function to add 'inode' into 'parent_inode' with 4118 * a given name and a given sequence number. 4119 * if 'add_backref' is true, also insert a backref from the 4120 * inode to the parent directory. 4121 */ 4122 int btrfs_add_link(struct btrfs_trans_handle *trans, 4123 struct inode *parent_inode, struct inode *inode, 4124 const char *name, int name_len, int add_backref, u64 index) 4125 { 4126 int ret = 0; 4127 struct btrfs_key key; 4128 struct btrfs_root *root = BTRFS_I(parent_inode)->root; 4129 4130 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 4131 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); 4132 } else { 4133 key.objectid = inode->i_ino; 4134 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 4135 key.offset = 0; 4136 } 4137 4138 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 4139 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 4140 key.objectid, root->root_key.objectid, 4141 parent_inode->i_ino, 4142 index, name, name_len); 4143 } else if (add_backref) { 4144 ret = btrfs_insert_inode_ref(trans, root, 4145 name, name_len, inode->i_ino, 4146 parent_inode->i_ino, index); 4147 } 4148 4149 if (ret == 0) { 4150 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4151 parent_inode->i_ino, &key, 4152 btrfs_inode_type(inode), index); 4153 BUG_ON(ret); 4154 4155 btrfs_i_size_write(parent_inode, parent_inode->i_size + 4156 name_len * 2); 4157 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 4158 ret = btrfs_update_inode(trans, root, parent_inode); 4159 } 4160 return ret; 4161 } 4162 4163 static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 4164 struct dentry *dentry, struct inode *inode, 4165 int backref, u64 index) 4166 { 4167 int err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4168 inode, dentry->d_name.name, 4169 dentry->d_name.len, backref, index); 4170 if (!err) { 4171 d_instantiate(dentry, inode); 4172 return 0; 4173 } 4174 if (err > 0) 4175 err = -EEXIST; 4176 return err; 4177 } 4178 4179 static int btrfs_mknod(struct inode *dir, struct dentry *dentry, 4180 int mode, dev_t rdev) 4181 { 4182 struct btrfs_trans_handle *trans; 4183 struct btrfs_root *root = BTRFS_I(dir)->root; 4184 struct inode *inode = NULL; 4185 int err; 4186 int drop_inode = 0; 4187 u64 objectid; 4188 unsigned long nr = 0; 4189 u64 index = 0; 4190 4191 if (!new_valid_dev(rdev)) 4192 return -EINVAL; 4193 4194 /* 4195 * 2 for inode item and ref 4196 * 2 for dir items 4197 * 1 for xattr if selinux is on 4198 */ 4199 err = btrfs_reserve_metadata_space(root, 5); 4200 if (err) 4201 return err; 4202 4203 trans = btrfs_start_transaction(root, 1); 4204 if (!trans) 4205 goto fail; 4206 btrfs_set_trans_block_group(trans, dir); 4207 4208 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4209 if (err) { 4210 err = -ENOSPC; 4211 goto out_unlock; 4212 } 4213 4214 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4215 dentry->d_name.len, 4216 dentry->d_parent->d_inode->i_ino, objectid, 4217 BTRFS_I(dir)->block_group, mode, &index); 4218 err = PTR_ERR(inode); 4219 if
(IS_ERR(inode)) 4220 goto out_unlock; 4221 4222 err = btrfs_init_inode_security(inode, dir); 4223 if (err) { 4224 drop_inode = 1; 4225 goto out_unlock; 4226 } 4227 4228 btrfs_set_trans_block_group(trans, inode); 4229 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 4230 if (err) 4231 drop_inode = 1; 4232 else { 4233 inode->i_op = &btrfs_special_inode_operations; 4234 init_special_inode(inode, inode->i_mode, rdev); 4235 btrfs_update_inode(trans, root, inode); 4236 } 4237 btrfs_update_inode_block_group(trans, inode); 4238 btrfs_update_inode_block_group(trans, dir); 4239 out_unlock: 4240 nr = trans->blocks_used; 4241 btrfs_end_transaction_throttle(trans, root); 4242 fail: 4243 btrfs_unreserve_metadata_space(root, 5); 4244 if (drop_inode) { 4245 inode_dec_link_count(inode); 4246 iput(inode); 4247 } 4248 btrfs_btree_balance_dirty(root, nr); 4249 return err; 4250 } 4251 4252 static int btrfs_create(struct inode *dir, struct dentry *dentry, 4253 int mode, struct nameidata *nd) 4254 { 4255 struct btrfs_trans_handle *trans; 4256 struct btrfs_root *root = BTRFS_I(dir)->root; 4257 struct inode *inode = NULL; 4258 int err; 4259 int drop_inode = 0; 4260 unsigned long nr = 0; 4261 u64 objectid; 4262 u64 index = 0; 4263 4264 /* 4265 * 2 for inode item and ref 4266 * 2 for dir items 4267 * 1 for xattr if selinux is on 4268 */ 4269 err = btrfs_reserve_metadata_space(root, 5); 4270 if (err) 4271 return err; 4272 4273 trans = btrfs_start_transaction(root, 1); 4274 if (!trans) 4275 goto fail; 4276 btrfs_set_trans_block_group(trans, dir); 4277 4278 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4279 if (err) { 4280 err = -ENOSPC; 4281 goto out_unlock; 4282 } 4283 4284 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4285 dentry->d_name.len, 4286 dentry->d_parent->d_inode->i_ino, 4287 objectid, BTRFS_I(dir)->block_group, mode, 4288 &index); 4289 err = PTR_ERR(inode); 4290 if (IS_ERR(inode)) 4291 goto out_unlock; 4292 4293 err = btrfs_init_inode_security(inode, dir); 4294 if (err) { 4295 drop_inode = 1; 4296 goto out_unlock; 4297 } 4298 4299 btrfs_set_trans_block_group(trans, inode); 4300 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 4301 if (err) 4302 drop_inode = 1; 4303 else { 4304 inode->i_mapping->a_ops = &btrfs_aops; 4305 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 4306 inode->i_fop = &btrfs_file_operations; 4307 inode->i_op = &btrfs_file_inode_operations; 4308 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4309 } 4310 btrfs_update_inode_block_group(trans, inode); 4311 btrfs_update_inode_block_group(trans, dir); 4312 out_unlock: 4313 nr = trans->blocks_used; 4314 btrfs_end_transaction_throttle(trans, root); 4315 fail: 4316 btrfs_unreserve_metadata_space(root, 5); 4317 if (drop_inode) { 4318 inode_dec_link_count(inode); 4319 iput(inode); 4320 } 4321 btrfs_btree_balance_dirty(root, nr); 4322 return err; 4323 } 4324 4325 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 4326 struct dentry *dentry) 4327 { 4328 struct btrfs_trans_handle *trans; 4329 struct btrfs_root *root = BTRFS_I(dir)->root; 4330 struct inode *inode = old_dentry->d_inode; 4331 u64 index; 4332 unsigned long nr = 0; 4333 int err; 4334 int drop_inode = 0; 4335 4336 if (inode->i_nlink == 0) 4337 return -ENOENT; 4338 4339 /* 4340 * 1 item for inode ref 4341 * 2 items for dir items 4342 */ 4343 err = btrfs_reserve_metadata_space(root, 3); 4344 if (err) 4345 return err; 4346 4347 btrfs_inc_nlink(inode); 4348 4349 err = btrfs_set_inode_index(dir, &index); 4350 
if (err) 4351 goto fail; 4352 4353 trans = btrfs_start_transaction(root, 1); 4354 4355 btrfs_set_trans_block_group(trans, dir); 4356 atomic_inc(&inode->i_count); 4357 4358 err = btrfs_add_nondir(trans, dentry, inode, 1, index); 4359 4360 if (err) { 4361 drop_inode = 1; 4362 } else { 4363 btrfs_update_inode_block_group(trans, dir); 4364 err = btrfs_update_inode(trans, root, inode); 4365 BUG_ON(err); 4366 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); 4367 } 4368 4369 nr = trans->blocks_used; 4370 btrfs_end_transaction_throttle(trans, root); 4371 fail: 4372 btrfs_unreserve_metadata_space(root, 3); 4373 if (drop_inode) { 4374 inode_dec_link_count(inode); 4375 iput(inode); 4376 } 4377 btrfs_btree_balance_dirty(root, nr); 4378 return err; 4379 } 4380 4381 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 4382 { 4383 struct inode *inode = NULL; 4384 struct btrfs_trans_handle *trans; 4385 struct btrfs_root *root = BTRFS_I(dir)->root; 4386 int err = 0; 4387 int drop_on_err = 0; 4388 u64 objectid = 0; 4389 u64 index = 0; 4390 unsigned long nr = 1; 4391 4392 /* 4393 * 2 items for inode and ref 4394 * 2 items for dir items 4395 * 1 for xattr if selinux is on 4396 */ 4397 err = btrfs_reserve_metadata_space(root, 5); 4398 if (err) 4399 return err; 4400 4401 trans = btrfs_start_transaction(root, 1); 4402 if (!trans) { 4403 err = -ENOMEM; 4404 goto out_unlock; 4405 } 4406 btrfs_set_trans_block_group(trans, dir); 4407 4408 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4409 if (err) { 4410 err = -ENOSPC; 4411 goto out_unlock; 4412 } 4413 4414 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4415 dentry->d_name.len, 4416 dentry->d_parent->d_inode->i_ino, objectid, 4417 BTRFS_I(dir)->block_group, S_IFDIR | mode, 4418 &index); 4419 if (IS_ERR(inode)) { 4420 err = PTR_ERR(inode); 4421 goto out_fail; 4422 } 4423 4424 drop_on_err = 1; 4425 4426 err = btrfs_init_inode_security(inode, dir); 4427 if (err) 4428 goto out_fail; 4429 4430 inode->i_op = &btrfs_dir_inode_operations; 4431 inode->i_fop = &btrfs_dir_file_operations; 4432 btrfs_set_trans_block_group(trans, inode); 4433 4434 btrfs_i_size_write(inode, 0); 4435 err = btrfs_update_inode(trans, root, inode); 4436 if (err) 4437 goto out_fail; 4438 4439 err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4440 inode, dentry->d_name.name, 4441 dentry->d_name.len, 0, index); 4442 if (err) 4443 goto out_fail; 4444 4445 d_instantiate(dentry, inode); 4446 drop_on_err = 0; 4447 btrfs_update_inode_block_group(trans, inode); 4448 btrfs_update_inode_block_group(trans, dir); 4449 4450 out_fail: 4451 nr = trans->blocks_used; 4452 btrfs_end_transaction_throttle(trans, root); 4453 4454 out_unlock: 4455 btrfs_unreserve_metadata_space(root, 5); 4456 if (drop_on_err) 4457 iput(inode); 4458 btrfs_btree_balance_dirty(root, nr); 4459 return err; 4460 } 4461 4462 /* helper for btrfs_get_extent. Given an existing extent in the tree, 4463 * and an extent that you want to insert, deal with overlap and insert 4464 * the new extent into the tree.
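* For example: if [em] spans [0, 8k) but [existing] already owns [0, 4k), map_start of 4k and map_len of 4k trim em to [4k, 8k); start_diff is 4k and, for uncompressed extents, block_start is bumped by the same 4k so the disk mapping still lines up (compressed extents keep block_start as-is, since their disk blocks describe the whole compressed blob).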
4465 */ 4466 static int merge_extent_mapping(struct extent_map_tree *em_tree, 4467 struct extent_map *existing, 4468 struct extent_map *em, 4469 u64 map_start, u64 map_len) 4470 { 4471 u64 start_diff; 4472 4473 BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); 4474 start_diff = map_start - em->start; 4475 em->start = map_start; 4476 em->len = map_len; 4477 if (em->block_start < EXTENT_MAP_LAST_BYTE && 4478 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 4479 em->block_start += start_diff; 4480 em->block_len -= start_diff; 4481 } 4482 return add_extent_mapping(em_tree, em); 4483 } 4484 4485 static noinline int uncompress_inline(struct btrfs_path *path, 4486 struct inode *inode, struct page *page, 4487 size_t pg_offset, u64 extent_offset, 4488 struct btrfs_file_extent_item *item) 4489 { 4490 int ret; 4491 struct extent_buffer *leaf = path->nodes[0]; 4492 char *tmp; 4493 size_t max_size; 4494 unsigned long inline_size; 4495 unsigned long ptr; 4496 4497 WARN_ON(pg_offset != 0); 4498 max_size = btrfs_file_extent_ram_bytes(leaf, item); 4499 inline_size = btrfs_file_extent_inline_item_len(leaf, 4500 btrfs_item_nr(leaf, path->slots[0])); 4501 tmp = kmalloc(inline_size, GFP_NOFS); if (!tmp) return -ENOMEM; 4502 ptr = btrfs_file_extent_inline_start(item); 4503 4504 read_extent_buffer(leaf, tmp, ptr, inline_size); 4505 4506 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 4507 ret = btrfs_zlib_decompress(tmp, page, extent_offset, 4508 inline_size, max_size); 4509 if (ret) { /* decompress failed: zero the rest of the page and report success */ 4510 char *kaddr = kmap_atomic(page, KM_USER0); 4511 unsigned long copy_size = min_t(u64, 4512 PAGE_CACHE_SIZE - pg_offset, 4513 max_size - extent_offset); 4514 memset(kaddr + pg_offset, 0, copy_size); 4515 kunmap_atomic(kaddr, KM_USER0); 4516 } 4517 kfree(tmp); 4518 return 0; 4519 } 4520 4521 /* 4522 * a bit scary, this does extent mapping from logical file offset to the disk. 4523 * the ugly parts come from merging extents from the disk with the in-ram 4524 * representation. This gets more complex because of the data=ordered code, 4525 * where the in-ram extents might be locked pending data=ordered completion. 4526 * 4527 * This also copies inline extents directly into the page.
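* Inline extents are the special case here: their bytes live in the btree leaf itself, so when a page is supplied they are copied (and, for zlib, decompressed) straight into it instead of being mapped to disk blocks.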
4528 */ 4529 4530 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 4531 size_t pg_offset, u64 start, u64 len, 4532 int create) 4533 { 4534 int ret; 4535 int err = 0; 4536 u64 bytenr; 4537 u64 extent_start = 0; 4538 u64 extent_end = 0; 4539 u64 objectid = inode->i_ino; 4540 u32 found_type; 4541 struct btrfs_path *path = NULL; 4542 struct btrfs_root *root = BTRFS_I(inode)->root; 4543 struct btrfs_file_extent_item *item; 4544 struct extent_buffer *leaf; 4545 struct btrfs_key found_key; 4546 struct extent_map *em = NULL; 4547 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4548 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4549 struct btrfs_trans_handle *trans = NULL; 4550 int compressed; 4551 4552 again: 4553 read_lock(&em_tree->lock); 4554 em = lookup_extent_mapping(em_tree, start, len); 4555 if (em) 4556 em->bdev = root->fs_info->fs_devices->latest_bdev; 4557 read_unlock(&em_tree->lock); 4558 4559 if (em) { 4560 if (em->start > start || em->start + em->len <= start) 4561 free_extent_map(em); 4562 else if (em->block_start == EXTENT_MAP_INLINE && page) 4563 free_extent_map(em); 4564 else 4565 goto out; 4566 } 4567 em = alloc_extent_map(GFP_NOFS); 4568 if (!em) { 4569 err = -ENOMEM; 4570 goto out; 4571 } 4572 em->bdev = root->fs_info->fs_devices->latest_bdev; 4573 em->start = EXTENT_MAP_HOLE; 4574 em->orig_start = EXTENT_MAP_HOLE; 4575 em->len = (u64)-1; 4576 em->block_len = (u64)-1; 4577 4578 if (!path) { 4579 path = btrfs_alloc_path(); 4580 BUG_ON(!path); 4581 } 4582 4583 ret = btrfs_lookup_file_extent(trans, root, path, 4584 objectid, start, trans != NULL); 4585 if (ret < 0) { 4586 err = ret; 4587 goto out; 4588 } 4589 4590 if (ret != 0) { 4591 if (path->slots[0] == 0) 4592 goto not_found; 4593 path->slots[0]--; 4594 } 4595 4596 leaf = path->nodes[0]; 4597 item = btrfs_item_ptr(leaf, path->slots[0], 4598 struct btrfs_file_extent_item); 4599 /* are we inside the extent that was found? 
*/ 4600 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4601 found_type = btrfs_key_type(&found_key); 4602 if (found_key.objectid != objectid || 4603 found_type != BTRFS_EXTENT_DATA_KEY) { 4604 goto not_found; 4605 } 4606 4607 found_type = btrfs_file_extent_type(leaf, item); 4608 extent_start = found_key.offset; 4609 compressed = btrfs_file_extent_compression(leaf, item); 4610 if (found_type == BTRFS_FILE_EXTENT_REG || 4611 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 4612 extent_end = extent_start + 4613 btrfs_file_extent_num_bytes(leaf, item); 4614 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 4615 size_t size; 4616 size = btrfs_file_extent_inline_len(leaf, item); 4617 extent_end = (extent_start + size + root->sectorsize - 1) & 4618 ~((u64)root->sectorsize - 1); 4619 } 4620 4621 if (start >= extent_end) { 4622 path->slots[0]++; 4623 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 4624 ret = btrfs_next_leaf(root, path); 4625 if (ret < 0) { 4626 err = ret; 4627 goto out; 4628 } 4629 if (ret > 0) 4630 goto not_found; 4631 leaf = path->nodes[0]; 4632 } 4633 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4634 if (found_key.objectid != objectid || 4635 found_key.type != BTRFS_EXTENT_DATA_KEY) 4636 goto not_found; 4637 if (start + len <= found_key.offset) 4638 goto not_found; 4639 em->start = start; 4640 em->len = found_key.offset - start; 4641 goto not_found_em; 4642 } 4643 4644 if (found_type == BTRFS_FILE_EXTENT_REG || 4645 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 4646 em->start = extent_start; 4647 em->len = extent_end - extent_start; 4648 em->orig_start = extent_start - 4649 btrfs_file_extent_offset(leaf, item); 4650 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 4651 if (bytenr == 0) { 4652 em->block_start = EXTENT_MAP_HOLE; 4653 goto insert; 4654 } 4655 if (compressed) { 4656 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 4657 em->block_start = bytenr; 4658 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 4659 item); 4660 } else { 4661 bytenr += btrfs_file_extent_offset(leaf, item); 4662 em->block_start = bytenr; 4663 em->block_len = em->len; 4664 if (found_type == BTRFS_FILE_EXTENT_PREALLOC) 4665 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 4666 } 4667 goto insert; 4668 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 4669 unsigned long ptr; 4670 char *map; 4671 size_t size; 4672 size_t extent_offset; 4673 size_t copy_size; 4674 4675 em->block_start = EXTENT_MAP_INLINE; 4676 if (!page || create) { 4677 em->start = extent_start; 4678 em->len = extent_end - extent_start; 4679 goto out; 4680 } 4681 4682 size = btrfs_file_extent_inline_len(leaf, item); 4683 extent_offset = page_offset(page) + pg_offset - extent_start; 4684 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 4685 size - extent_offset); 4686 em->start = extent_start + extent_offset; 4687 em->len = (copy_size + root->sectorsize - 1) & 4688 ~((u64)root->sectorsize - 1); 4689 em->orig_start = EXTENT_MAP_INLINE; 4690 if (compressed) 4691 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 4692 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 4693 if (create == 0 && !PageUptodate(page)) { 4694 if (btrfs_file_extent_compression(leaf, item) == 4695 BTRFS_COMPRESS_ZLIB) { 4696 ret = uncompress_inline(path, inode, page, 4697 pg_offset, 4698 extent_offset, item); 4699 BUG_ON(ret); 4700 } else { 4701 map = kmap(page); 4702 read_extent_buffer(leaf, map + pg_offset, ptr, 4703 copy_size); 4704 if (pg_offset + copy_size < PAGE_CACHE_SIZE) { 4705 memset(map + pg_offset + copy_size, 0, 4706 
PAGE_CACHE_SIZE - pg_offset - 4707 copy_size); 4708 } 4709 kunmap(page); 4710 } 4711 flush_dcache_page(page); 4712 } else if (create && PageUptodate(page)) { 4713 if (!trans) { 4714 kunmap(page); 4715 free_extent_map(em); 4716 em = NULL; 4717 btrfs_release_path(root, path); 4718 trans = btrfs_join_transaction(root, 1); 4719 goto again; 4720 } 4721 map = kmap(page); 4722 write_extent_buffer(leaf, map + pg_offset, ptr, 4723 copy_size); 4724 kunmap(page); 4725 btrfs_mark_buffer_dirty(leaf); 4726 } 4727 set_extent_uptodate(io_tree, em->start, 4728 extent_map_end(em) - 1, GFP_NOFS); 4729 goto insert; 4730 } else { 4731 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 4732 WARN_ON(1); 4733 } 4734 not_found: 4735 em->start = start; 4736 em->len = len; 4737 not_found_em: 4738 em->block_start = EXTENT_MAP_HOLE; 4739 set_bit(EXTENT_FLAG_VACANCY, &em->flags); 4740 insert: 4741 btrfs_release_path(root, path); 4742 if (em->start > start || extent_map_end(em) <= start) { 4743 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " 4744 "[%llu %llu]\n", (unsigned long long)em->start, 4745 (unsigned long long)em->len, 4746 (unsigned long long)start, 4747 (unsigned long long)len); 4748 err = -EIO; 4749 goto out; 4750 } 4751 4752 err = 0; 4753 write_lock(&em_tree->lock); 4754 ret = add_extent_mapping(em_tree, em); 4755 /* it is possible that someone inserted the extent into the tree 4756 * while we had the lock dropped. It is also possible that 4757 * an overlapping map exists in the tree 4758 */ 4759 if (ret == -EEXIST) { 4760 struct extent_map *existing; 4761 4762 ret = 0; 4763 4764 existing = lookup_extent_mapping(em_tree, start, len); 4765 if (existing && (existing->start > start || 4766 existing->start + existing->len <= start)) { 4767 free_extent_map(existing); 4768 existing = NULL; 4769 } 4770 if (!existing) { 4771 existing = lookup_extent_mapping(em_tree, em->start, 4772 em->len); 4773 if (existing) { 4774 err = merge_extent_mapping(em_tree, existing, 4775 em, start, 4776 root->sectorsize); 4777 free_extent_map(existing); 4778 if (err) { 4779 free_extent_map(em); 4780 em = NULL; 4781 } 4782 } else { 4783 err = -EIO; 4784 free_extent_map(em); 4785 em = NULL; 4786 } 4787 } else { 4788 free_extent_map(em); 4789 em = existing; 4790 err = 0; 4791 } 4792 } 4793 write_unlock(&em_tree->lock); 4794 out: 4795 if (path) 4796 btrfs_free_path(path); 4797 if (trans) { 4798 ret = btrfs_end_transaction(trans, root); 4799 if (!err) 4800 err = ret; 4801 } 4802 if (err) { 4803 free_extent_map(em); 4804 return ERR_PTR(err); 4805 } 4806 return em; 4807 } 4808 4809 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 4810 const struct iovec *iov, loff_t offset, 4811 unsigned long nr_segs) 4812 { 4813 return -EINVAL; 4814 } 4815 4816 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4817 __u64 start, __u64 len) 4818 { 4819 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); 4820 } 4821 4822 int btrfs_readpage(struct file *file, struct page *page) 4823 { 4824 struct extent_io_tree *tree; 4825 tree = &BTRFS_I(page->mapping->host)->io_tree; 4826 return extent_read_full_page(tree, page, btrfs_get_extent); 4827 } 4828 4829 static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 4830 { 4831 struct extent_io_tree *tree; 4832 4833 4834 if (current->flags & PF_MEMALLOC) { 4835 redirty_page_for_writepage(wbc, page); 4836 unlock_page(page); 4837 return 0; 4838 } 4839 tree = &BTRFS_I(page->mapping->host)->io_tree; 4840 return 
extent_write_full_page(tree, page, btrfs_get_extent, wbc); 4841 } 4842 4843 int btrfs_writepages(struct address_space *mapping, 4844 struct writeback_control *wbc) 4845 { 4846 struct extent_io_tree *tree; 4847 4848 tree = &BTRFS_I(mapping->host)->io_tree; 4849 return extent_writepages(tree, mapping, btrfs_get_extent, wbc); 4850 } 4851 4852 static int 4853 btrfs_readpages(struct file *file, struct address_space *mapping, 4854 struct list_head *pages, unsigned nr_pages) 4855 { 4856 struct extent_io_tree *tree; 4857 tree = &BTRFS_I(mapping->host)->io_tree; 4858 return extent_readpages(tree, mapping, pages, nr_pages, 4859 btrfs_get_extent); 4860 } 4861 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) 4862 { 4863 struct extent_io_tree *tree; 4864 struct extent_map_tree *map; 4865 int ret; 4866 4867 tree = &BTRFS_I(page->mapping->host)->io_tree; 4868 map = &BTRFS_I(page->mapping->host)->extent_tree; 4869 ret = try_release_extent_mapping(map, tree, page, gfp_flags); 4870 if (ret == 1) { 4871 ClearPagePrivate(page); 4872 set_page_private(page, 0); 4873 page_cache_release(page); 4874 } 4875 return ret; 4876 } 4877 4878 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) 4879 { 4880 if (PageWriteback(page) || PageDirty(page)) 4881 return 0; 4882 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); 4883 } 4884 4885 static void btrfs_invalidatepage(struct page *page, unsigned long offset) 4886 { 4887 struct extent_io_tree *tree; 4888 struct btrfs_ordered_extent *ordered; 4889 u64 page_start = page_offset(page); 4890 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 4891 4892 4893 /* 4894 * we have the page locked, so new writeback can't start, 4895 * and the dirty bit won't be cleared while we are here. 4896 * 4897 * Wait for IO on this page so that we can safely clear 4898 * the PagePrivate2 bit and do ordered accounting 4899 */ 4900 wait_on_page_writeback(page); 4901 4902 tree = &BTRFS_I(page->mapping->host)->io_tree; 4903 if (offset) { 4904 btrfs_releasepage(page, GFP_NOFS); 4905 return; 4906 } 4907 lock_extent(tree, page_start, page_end, GFP_NOFS); 4908 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 4909 page_offset(page)); 4910 if (ordered) { 4911 /* 4912 * IO on this page will never be started, so we need 4913 * to account for any ordered extents now 4914 */ 4915 clear_extent_bit(tree, page_start, page_end, 4916 EXTENT_DIRTY | EXTENT_DELALLOC | 4917 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 4918 NULL, GFP_NOFS); 4919 /* 4920 * whoever cleared the private bit is responsible 4921 * for the finish_ordered_io 4922 */ 4923 if (TestClearPagePrivate2(page)) { 4924 btrfs_finish_ordered_io(page->mapping->host, 4925 page_start, page_end); 4926 } 4927 btrfs_put_ordered_extent(ordered); 4928 lock_extent(tree, page_start, page_end, GFP_NOFS); 4929 } 4930 clear_extent_bit(tree, page_start, page_end, 4931 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 4932 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); 4933 __btrfs_releasepage(page, GFP_NOFS); 4934 4935 ClearPageChecked(page); 4936 if (PagePrivate(page)) { 4937 ClearPagePrivate(page); 4938 set_page_private(page, 0); 4939 page_cache_release(page); 4940 } 4941 } 4942 4943 /* 4944 * btrfs_page_mkwrite() is not allowed to change the file size as it gets 4945 * called from a page fault handler when a page is first dirtied. Hence we must 4946 * be careful to check for EOF conditions here. 
We set the page up correctly 4947 * for a written page which means we get ENOSPC checking when writing into 4948 * holes and correct delalloc and unwritten extent mapping on filesystems that 4949 * support these features. 4950 * 4951 * We are not allowed to take the i_mutex here so we have to play games to 4952 * protect against truncate races as the page could now be beyond EOF. Because 4953 * vmtruncate() writes the inode size before removing pages, once we have the 4954 * page lock we can determine safely if the page is beyond EOF. If it is not 4955 * beyond EOF, then the page is guaranteed safe against truncation until we 4956 * unlock the page. 4957 */ 4958 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 4959 { 4960 struct page *page = vmf->page; 4961 struct inode *inode = fdentry(vma->vm_file)->d_inode; 4962 struct btrfs_root *root = BTRFS_I(inode)->root; 4963 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4964 struct btrfs_ordered_extent *ordered; 4965 char *kaddr; 4966 unsigned long zero_start; 4967 loff_t size; 4968 int ret; 4969 u64 page_start; 4970 u64 page_end; 4971 4972 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 4973 if (ret) { 4974 if (ret == -ENOMEM) 4975 ret = VM_FAULT_OOM; 4976 else /* -ENOSPC, -EIO, etc */ 4977 ret = VM_FAULT_SIGBUS; 4978 goto out; 4979 } 4980 4981 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); 4982 if (ret) { 4983 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 4984 ret = VM_FAULT_SIGBUS; 4985 goto out; 4986 } 4987 4988 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 4989 again: 4990 lock_page(page); 4991 size = i_size_read(inode); 4992 page_start = page_offset(page); 4993 page_end = page_start + PAGE_CACHE_SIZE - 1; 4994 4995 if ((page->mapping != inode->i_mapping) || 4996 (page_start >= size)) { 4997 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 4998 /* page got truncated out from underneath us */ 4999 goto out_unlock; 5000 } 5001 wait_on_page_writeback(page); 5002 5003 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 5004 set_page_extent_mapped(page); 5005 5006 /* 5007 * we can't set the delalloc bits if there are pending ordered 5008 * extents. Drop our locks and wait for them to finish 5009 */ 5010 ordered = btrfs_lookup_ordered_extent(inode, page_start); 5011 if (ordered) { 5012 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5013 unlock_page(page); 5014 btrfs_start_ordered_extent(inode, ordered, 1); 5015 btrfs_put_ordered_extent(ordered); 5016 goto again; 5017 } 5018 5019 /* 5020 * XXX - page_mkwrite gets called every time the page is dirtied, even 5021 * if it was already dirty, so for space accounting reasons we need to 5022 * clear any delalloc bits for the range we are fixing to save. There 5023 * is probably a better way to do this, but for now keep consistent with 5024 * prepare_pages in the normal write path. 
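Clearing first also keeps the outstanding-extent accounting from counting the same range twice when a page is redirtied.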
5025 */ 5026 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 5027 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 5028 GFP_NOFS); 5029 5030 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 5031 if (ret) { 5032 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5033 ret = VM_FAULT_SIGBUS; 5034 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 5035 goto out_unlock; 5036 } 5037 ret = 0; 5038 5039 /* page is wholly or partially inside EOF */ 5040 if (page_start + PAGE_CACHE_SIZE > size) 5041 zero_start = size & ~PAGE_CACHE_MASK; 5042 else 5043 zero_start = PAGE_CACHE_SIZE; 5044 5045 if (zero_start != PAGE_CACHE_SIZE) { 5046 kaddr = kmap(page); 5047 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); 5048 flush_dcache_page(page); 5049 kunmap(page); 5050 } 5051 ClearPageChecked(page); 5052 set_page_dirty(page); 5053 SetPageUptodate(page); 5054 5055 BTRFS_I(inode)->last_trans = root->fs_info->generation; 5056 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 5057 5058 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5059 5060 out_unlock: 5061 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 5062 if (!ret) 5063 return VM_FAULT_LOCKED; 5064 unlock_page(page); 5065 out: 5066 return ret; 5067 } 5068 5069 static void btrfs_truncate(struct inode *inode) 5070 { 5071 struct btrfs_root *root = BTRFS_I(inode)->root; 5072 int ret; 5073 struct btrfs_trans_handle *trans; 5074 unsigned long nr; 5075 u64 mask = root->sectorsize - 1; 5076 5077 if (!S_ISREG(inode->i_mode)) 5078 return; 5079 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 5080 return; 5081 5082 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 5083 if (ret) 5084 return; 5085 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 5086 5087 trans = btrfs_start_transaction(root, 1); 5088 5089 /* 5090 * setattr is responsible for setting the ordered_data_close flag, 5091 * but that is only tested during the last file release. That 5092 * could happen well after the next commit, leaving a great big 5093 * window where new writes may get lost if someone chooses to write 5094 * to this file after truncating to zero 5095 * 5096 * The inode doesn't have any dirty data here, and so if we commit 5097 * this is a noop. If someone immediately starts writing to the inode 5098 * it is very likely we'll catch some of their writes in this 5099 * transaction, and the commit will find this file on the ordered 5100 * data list with good things to send down. 5101 * 5102 * This is a best effort solution, there is still a window where 5103 * using truncate to replace the contents of the file will 5104 * end up with a zero length file after a crash. 
5105 */ 5106 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) 5107 btrfs_add_ordered_operation(trans, root, inode); 5108 5109 btrfs_set_trans_block_group(trans, inode); 5110 btrfs_i_size_write(inode, inode->i_size); 5111 5112 ret = btrfs_orphan_add(trans, inode); 5113 if (ret) 5114 goto out; 5115 /* FIXME, add redo link to tree so we don't leak on crash */ 5116 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 5117 BTRFS_EXTENT_DATA_KEY); 5118 btrfs_update_inode(trans, root, inode); 5119 5120 ret = btrfs_orphan_del(trans, inode); 5121 BUG_ON(ret); 5122 5123 out: 5124 nr = trans->blocks_used; 5125 ret = btrfs_end_transaction_throttle(trans, root); 5126 BUG_ON(ret); 5127 btrfs_btree_balance_dirty(root, nr); 5128 } 5129 5130 /* 5131 * create a new subvolume directory/inode (helper for the ioctl). 5132 */ 5133 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 5134 struct btrfs_root *new_root, 5135 u64 new_dirid, u64 alloc_hint) 5136 { 5137 struct inode *inode; 5138 int err; 5139 u64 index = 0; 5140 5141 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, 5142 new_dirid, alloc_hint, S_IFDIR | 0700, &index); 5143 if (IS_ERR(inode)) 5144 return PTR_ERR(inode); 5145 inode->i_op = &btrfs_dir_inode_operations; 5146 inode->i_fop = &btrfs_dir_file_operations; 5147 5148 inode->i_nlink = 1; 5149 btrfs_i_size_write(inode, 0); 5150 5151 err = btrfs_update_inode(trans, new_root, inode); 5152 BUG_ON(err); 5153 5154 iput(inode); 5155 return 0; 5156 } 5157 5158 /* helper function for file defrag and space balancing. This 5159 * forces readahead on a given range of bytes in an inode 5160 */ 5161 unsigned long btrfs_force_ra(struct address_space *mapping, 5162 struct file_ra_state *ra, struct file *file, 5163 pgoff_t offset, pgoff_t last_index) 5164 { 5165 pgoff_t req_size = last_index - offset + 1; 5166 5167 page_cache_sync_readahead(mapping, ra, file, offset, req_size); 5168 return offset + req_size; 5169 } 5170 5171 struct inode *btrfs_alloc_inode(struct super_block *sb) 5172 { 5173 struct btrfs_inode *ei; 5174 5175 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 5176 if (!ei) 5177 return NULL; 5178 ei->last_trans = 0; 5179 ei->last_sub_trans = 0; 5180 ei->logged_trans = 0; 5181 ei->outstanding_extents = 0; 5182 ei->reserved_extents = 0; 5183 ei->root = NULL; 5184 spin_lock_init(&ei->accounting_lock); 5185 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 5186 INIT_LIST_HEAD(&ei->i_orphan); 5187 INIT_LIST_HEAD(&ei->ordered_operations); 5188 return &ei->vfs_inode; 5189 } 5190 5191 void btrfs_destroy_inode(struct inode *inode) 5192 { 5193 struct btrfs_ordered_extent *ordered; 5194 struct btrfs_root *root = BTRFS_I(inode)->root; 5195 5196 WARN_ON(!list_empty(&inode->i_dentry)); 5197 WARN_ON(inode->i_data.nrpages); 5198 5199 /* 5200 * This can happen where we create an inode, but somebody else also 5201 * created the same inode and we need to destroy the one we already 5202 * created. 5203 */ 5204 if (!root) 5205 goto free; 5206 5207 /* 5208 * Make sure we're properly removed from the ordered operation 5209 * lists. 
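* The list_empty() check below is deliberately lockless (hence the barrier); the lock is only taken when there is actually an entry to drop.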
5210 */ 5211 smp_mb(); 5212 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { 5213 spin_lock(&root->fs_info->ordered_extent_lock); 5214 list_del_init(&BTRFS_I(inode)->ordered_operations); 5215 spin_unlock(&root->fs_info->ordered_extent_lock); 5216 } 5217 5218 spin_lock(&root->list_lock); 5219 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 5220 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" 5221 " list\n", inode->i_ino); 5222 dump_stack(); 5223 } 5224 spin_unlock(&root->list_lock); 5225 5226 while (1) { 5227 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 5228 if (!ordered) 5229 break; 5230 else { 5231 printk(KERN_ERR "btrfs found ordered " 5232 "extent %llu %llu on inode cleanup\n", 5233 (unsigned long long)ordered->file_offset, 5234 (unsigned long long)ordered->len); 5235 btrfs_remove_ordered_extent(inode, ordered); 5236 btrfs_put_ordered_extent(ordered); 5237 btrfs_put_ordered_extent(ordered); 5238 } 5239 } 5240 inode_tree_del(inode); 5241 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 5242 free: 5243 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 5244 } 5245 5246 void btrfs_drop_inode(struct inode *inode) 5247 { 5248 struct btrfs_root *root = BTRFS_I(inode)->root; 5249 5250 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) 5251 generic_delete_inode(inode); 5252 else 5253 generic_drop_inode(inode); 5254 } 5255 5256 static void init_once(void *foo) 5257 { 5258 struct btrfs_inode *ei = (struct btrfs_inode *) foo; 5259 5260 inode_init_once(&ei->vfs_inode); 5261 } 5262 5263 void btrfs_destroy_cachep(void) 5264 { 5265 if (btrfs_inode_cachep) 5266 kmem_cache_destroy(btrfs_inode_cachep); 5267 if (btrfs_trans_handle_cachep) 5268 kmem_cache_destroy(btrfs_trans_handle_cachep); 5269 if (btrfs_transaction_cachep) 5270 kmem_cache_destroy(btrfs_transaction_cachep); 5271 if (btrfs_path_cachep) 5272 kmem_cache_destroy(btrfs_path_cachep); 5273 } 5274 5275 int btrfs_init_cachep(void) 5276 { 5277 btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", 5278 sizeof(struct btrfs_inode), 0, 5279 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); 5280 if (!btrfs_inode_cachep) 5281 goto fail; 5282 5283 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", 5284 sizeof(struct btrfs_trans_handle), 0, 5285 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 5286 if (!btrfs_trans_handle_cachep) 5287 goto fail; 5288 5289 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", 5290 sizeof(struct btrfs_transaction), 0, 5291 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 5292 if (!btrfs_transaction_cachep) 5293 goto fail; 5294 5295 btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", 5296 sizeof(struct btrfs_path), 0, 5297 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 5298 if (!btrfs_path_cachep) 5299 goto fail; 5300 5301 return 0; 5302 fail: 5303 btrfs_destroy_cachep(); 5304 return -ENOMEM; 5305 } 5306 5307 static int btrfs_getattr(struct vfsmount *mnt, 5308 struct dentry *dentry, struct kstat *stat) 5309 { 5310 struct inode *inode = dentry->d_inode; 5311 generic_fillattr(inode, stat); 5312 stat->dev = BTRFS_I(inode)->root->anon_super.s_dev; 5313 stat->blksize = PAGE_CACHE_SIZE; 5314 stat->blocks = (inode_get_bytes(inode) + 5315 BTRFS_I(inode)->delalloc_bytes) >> 9; 5316 return 0; 5317 } 5318 5319 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 5320 struct inode *new_dir, struct dentry *new_dentry) 5321 { 5322 struct btrfs_trans_handle *trans; 5323 struct btrfs_root *root = BTRFS_I(old_dir)->root; 
5324 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 5325 struct inode *new_inode = new_dentry->d_inode; 5326 struct inode *old_inode = old_dentry->d_inode; 5327 struct timespec ctime = CURRENT_TIME; 5328 u64 index = 0; 5329 u64 root_objectid; 5330 int ret; 5331 5332 if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 5333 return -EPERM; 5334 5335 /* we only allow rename subvolume link between subvolumes */ 5336 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 5337 return -EXDEV; 5338 5339 if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || 5340 (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) 5341 return -ENOTEMPTY; 5342 5343 if (S_ISDIR(old_inode->i_mode) && new_inode && 5344 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 5345 return -ENOTEMPTY; 5346 5347 /* 5348 * We want to reserve the absolute worst case amount of items. So if 5349 * both inodes are subvols and we need to unlink them then that would 5350 * require 4 item modifications, but if they are both normal inodes it 5351 * would require 5 item modifications, so we'll assume their normal 5352 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items 5353 * should cover the worst case number of items we'll modify. 5354 */ 5355 ret = btrfs_reserve_metadata_space(root, 11); 5356 if (ret) 5357 return ret; 5358 5359 /* 5360 * we're using rename to replace one file with another. 5361 * and the replacement file is large. Start IO on it now so 5362 * we don't add too much work to the end of the transaction 5363 */ 5364 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && 5365 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 5366 filemap_flush(old_inode->i_mapping); 5367 5368 /* close the racy window with snapshot create/destroy ioctl */ 5369 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 5370 down_read(&root->fs_info->subvol_sem); 5371 5372 trans = btrfs_start_transaction(root, 1); 5373 btrfs_set_trans_block_group(trans, new_dir); 5374 5375 if (dest != root) 5376 btrfs_record_root_in_trans(trans, dest); 5377 5378 ret = btrfs_set_inode_index(new_dir, &index); 5379 if (ret) 5380 goto out_fail; 5381 5382 if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 5383 /* force full log commit if subvolume involved. */ 5384 root->fs_info->last_trans_log_full_commit = trans->transid; 5385 } else { 5386 ret = btrfs_insert_inode_ref(trans, dest, 5387 new_dentry->d_name.name, 5388 new_dentry->d_name.len, 5389 old_inode->i_ino, 5390 new_dir->i_ino, index); 5391 if (ret) 5392 goto out_fail; 5393 /* 5394 * this is an ugly little race, but the rename is required 5395 * to make sure that if we crash, the inode is either at the 5396 * old name or the new one. pinning the log transaction lets 5397 * us make sure we don't allow a log commit to come in after 5398 * we unlink the name but before we add the new name back in. 5399 */ 5400 btrfs_pin_log_trans(root); 5401 } 5402 /* 5403 * make sure the inode gets flushed if it is replacing 5404 * something. 
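* The flush itself is driven off the ordered operations list once the transaction commits.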
5405 */ 5406 if (new_inode && new_inode->i_size && 5407 old_inode && S_ISREG(old_inode->i_mode)) { 5408 btrfs_add_ordered_operation(trans, root, old_inode); 5409 } 5410 5411 old_dir->i_ctime = old_dir->i_mtime = ctime; 5412 new_dir->i_ctime = new_dir->i_mtime = ctime; 5413 old_inode->i_ctime = ctime; 5414 5415 if (old_dentry->d_parent != new_dentry->d_parent) 5416 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 5417 5418 if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 5419 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; 5420 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, 5421 old_dentry->d_name.name, 5422 old_dentry->d_name.len); 5423 } else { 5424 btrfs_inc_nlink(old_dentry->d_inode); 5425 ret = btrfs_unlink_inode(trans, root, old_dir, 5426 old_dentry->d_inode, 5427 old_dentry->d_name.name, 5428 old_dentry->d_name.len); 5429 } 5430 BUG_ON(ret); 5431 5432 if (new_inode) { 5433 new_inode->i_ctime = CURRENT_TIME; 5434 if (unlikely(new_inode->i_ino == 5435 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 5436 root_objectid = BTRFS_I(new_inode)->location.objectid; 5437 ret = btrfs_unlink_subvol(trans, dest, new_dir, 5438 root_objectid, 5439 new_dentry->d_name.name, 5440 new_dentry->d_name.len); 5441 BUG_ON(new_inode->i_nlink == 0); 5442 } else { 5443 ret = btrfs_unlink_inode(trans, dest, new_dir, 5444 new_dentry->d_inode, 5445 new_dentry->d_name.name, 5446 new_dentry->d_name.len); 5447 } 5448 BUG_ON(ret); 5449 if (new_inode->i_nlink == 0) { 5450 ret = btrfs_orphan_add(trans, new_dentry->d_inode); 5451 BUG_ON(ret); 5452 } 5453 } 5454 5455 ret = btrfs_add_link(trans, new_dir, old_inode, 5456 new_dentry->d_name.name, 5457 new_dentry->d_name.len, 0, index); 5458 BUG_ON(ret); 5459 5460 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 5461 btrfs_log_new_name(trans, old_inode, old_dir, 5462 new_dentry->d_parent); 5463 btrfs_end_log_trans(root); 5464 } 5465 out_fail: 5466 btrfs_end_transaction_throttle(trans, root); 5467 5468 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 5469 up_read(&root->fs_info->subvol_sem); 5470 5471 btrfs_unreserve_metadata_space(root, 11); 5472 return ret; 5473 } 5474 5475 /* 5476 * some fairly slow code that needs optimization. This walks the list 5477 * of all the inodes with pending delalloc and forces them to disk. 
5478 */ 5479 int btrfs_start_delalloc_inodes(struct btrfs_root *root) 5480 { 5481 struct list_head *head = &root->fs_info->delalloc_inodes; 5482 struct btrfs_inode *binode; 5483 struct inode *inode; 5484 5485 if (root->fs_info->sb->s_flags & MS_RDONLY) 5486 return -EROFS; 5487 5488 spin_lock(&root->fs_info->delalloc_lock); 5489 while (!list_empty(head)) { 5490 binode = list_entry(head->next, struct btrfs_inode, 5491 delalloc_inodes); 5492 inode = igrab(&binode->vfs_inode); 5493 if (!inode) 5494 list_del_init(&binode->delalloc_inodes); 5495 spin_unlock(&root->fs_info->delalloc_lock); 5496 if (inode) { 5497 filemap_flush(inode->i_mapping); 5498 iput(inode); 5499 } 5500 cond_resched(); 5501 spin_lock(&root->fs_info->delalloc_lock); 5502 } 5503 spin_unlock(&root->fs_info->delalloc_lock); 5504 5505 /* the filemap_flush will queue IO into the worker threads, but 5506 * we have to make sure the IO is actually started and that 5507 * ordered extents get created before we return 5508 */ 5509 atomic_inc(&root->fs_info->async_submit_draining); 5510 while (atomic_read(&root->fs_info->nr_async_submits) || 5511 atomic_read(&root->fs_info->async_delalloc_pages)) { 5512 wait_event(root->fs_info->async_submit_wait, 5513 (atomic_read(&root->fs_info->nr_async_submits) == 0 && 5514 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 5515 } 5516 atomic_dec(&root->fs_info->async_submit_draining); 5517 return 0; 5518 } 5519 5520 static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 5521 const char *symname) 5522 { 5523 struct btrfs_trans_handle *trans; 5524 struct btrfs_root *root = BTRFS_I(dir)->root; 5525 struct btrfs_path *path; 5526 struct btrfs_key key; 5527 struct inode *inode = NULL; 5528 int err; 5529 int drop_inode = 0; 5530 u64 objectid; 5531 u64 index = 0 ; 5532 int name_len; 5533 int datasize; 5534 unsigned long ptr; 5535 struct btrfs_file_extent_item *ei; 5536 struct extent_buffer *leaf; 5537 unsigned long nr = 0; 5538 5539 name_len = strlen(symname) + 1; 5540 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 5541 return -ENAMETOOLONG; 5542 5543 /* 5544 * 2 items for inode item and ref 5545 * 2 items for dir items 5546 * 1 item for xattr if selinux is on 5547 */ 5548 err = btrfs_reserve_metadata_space(root, 5); 5549 if (err) 5550 return err; 5551 5552 trans = btrfs_start_transaction(root, 1); 5553 if (!trans) 5554 goto out_fail; 5555 btrfs_set_trans_block_group(trans, dir); 5556 5557 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 5558 if (err) { 5559 err = -ENOSPC; 5560 goto out_unlock; 5561 } 5562 5563 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 5564 dentry->d_name.len, 5565 dentry->d_parent->d_inode->i_ino, objectid, 5566 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, 5567 &index); 5568 err = PTR_ERR(inode); 5569 if (IS_ERR(inode)) 5570 goto out_unlock; 5571 5572 err = btrfs_init_inode_security(inode, dir); 5573 if (err) { 5574 drop_inode = 1; 5575 goto out_unlock; 5576 } 5577 5578 btrfs_set_trans_block_group(trans, inode); 5579 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 5580 if (err) 5581 drop_inode = 1; 5582 else { 5583 inode->i_mapping->a_ops = &btrfs_aops; 5584 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5585 inode->i_fop = &btrfs_file_operations; 5586 inode->i_op = &btrfs_file_inode_operations; 5587 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 5588 } 5589 btrfs_update_inode_block_group(trans, inode); 5590 btrfs_update_inode_block_group(trans, dir); 5591 if (drop_inode) 5592 goto out_unlock; 5593 5594 
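/* the target string (including its trailing NUL) is stored as an inline file extent built below; anything longer than BTRFS_MAX_INLINE_DATA_SIZE() was rejected up front */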
path = btrfs_alloc_path(); 5595 BUG_ON(!path); 5596 key.objectid = inode->i_ino; 5597 key.offset = 0; 5598 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 5599 datasize = btrfs_file_extent_calc_inline_size(name_len); 5600 err = btrfs_insert_empty_item(trans, root, path, &key, 5601 datasize); 5602 if (err) { 5603 drop_inode = 1; 5604 goto out_unlock; 5605 } 5606 leaf = path->nodes[0]; 5607 ei = btrfs_item_ptr(leaf, path->slots[0], 5608 struct btrfs_file_extent_item); 5609 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 5610 btrfs_set_file_extent_type(leaf, ei, 5611 BTRFS_FILE_EXTENT_INLINE); 5612 btrfs_set_file_extent_encryption(leaf, ei, 0); 5613 btrfs_set_file_extent_compression(leaf, ei, 0); 5614 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 5615 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); 5616 5617 ptr = btrfs_file_extent_inline_start(ei); 5618 write_extent_buffer(leaf, symname, ptr, name_len); 5619 btrfs_mark_buffer_dirty(leaf); 5620 btrfs_free_path(path); 5621 5622 inode->i_op = &btrfs_symlink_inode_operations; 5623 inode->i_mapping->a_ops = &btrfs_symlink_aops; 5624 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5625 inode_set_bytes(inode, name_len); 5626 btrfs_i_size_write(inode, name_len - 1); 5627 err = btrfs_update_inode(trans, root, inode); 5628 if (err) 5629 drop_inode = 1; 5630 5631 out_unlock: 5632 nr = trans->blocks_used; 5633 btrfs_end_transaction_throttle(trans, root); 5634 out_fail: 5635 btrfs_unreserve_metadata_space(root, 5); 5636 if (drop_inode) { 5637 inode_dec_link_count(inode); 5638 iput(inode); 5639 } 5640 btrfs_btree_balance_dirty(root, nr); 5641 return err; 5642 } 5643 5644 static int prealloc_file_range(struct btrfs_trans_handle *trans, 5645 struct inode *inode, u64 start, u64 end, 5646 u64 locked_end, u64 alloc_hint, int mode) 5647 { 5648 struct btrfs_root *root = BTRFS_I(inode)->root; 5649 struct btrfs_key ins; 5650 u64 alloc_size; 5651 u64 cur_offset = start; 5652 u64 num_bytes = end - start; 5653 int ret = 0; 5654 5655 while (num_bytes > 0) { 5656 alloc_size = min(num_bytes, root->fs_info->max_extent); 5657 5658 ret = btrfs_reserve_metadata_space(root, 1); 5659 if (ret) 5660 goto out; 5661 5662 ret = btrfs_reserve_extent(trans, root, alloc_size, 5663 root->sectorsize, 0, alloc_hint, 5664 (u64)-1, &ins, 1); 5665 if (ret) { 5666 WARN_ON(1); 5667 goto out; 5668 } 5669 ret = insert_reserved_file_extent(trans, inode, 5670 cur_offset, ins.objectid, 5671 ins.offset, ins.offset, 5672 ins.offset, locked_end, 5673 0, 0, 0, 5674 BTRFS_FILE_EXTENT_PREALLOC); 5675 BUG_ON(ret); 5676 btrfs_drop_extent_cache(inode, cur_offset, 5677 cur_offset + ins.offset -1, 0); 5678 num_bytes -= ins.offset; 5679 cur_offset += ins.offset; 5680 alloc_hint = ins.objectid + ins.offset; 5681 btrfs_unreserve_metadata_space(root, 1); 5682 } 5683 out: 5684 if (cur_offset > start) { 5685 inode->i_ctime = CURRENT_TIME; 5686 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 5687 if (!(mode & FALLOC_FL_KEEP_SIZE) && 5688 cur_offset > i_size_read(inode)) 5689 btrfs_i_size_write(inode, cur_offset); 5690 ret = btrfs_update_inode(trans, root, inode); 5691 BUG_ON(ret); 5692 } 5693 5694 return ret; 5695 } 5696 5697 static long btrfs_fallocate(struct inode *inode, int mode, 5698 loff_t offset, loff_t len) 5699 { 5700 u64 cur_offset; 5701 u64 last_byte; 5702 u64 alloc_start; 5703 u64 alloc_end; 5704 u64 alloc_hint = 0; 5705 u64 locked_end; 5706 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 5707 struct extent_map *em; 5708 struct btrfs_trans_handle *trans; 5709 struct 
btrfs_root *root; 5710 int ret; 5711 5712 alloc_start = offset & ~mask; 5713 alloc_end = (offset + len + mask) & ~mask; 5714 5715 /* 5716 * wait for ordered IO before we have any locks. We'll loop again 5717 * below with the locks held. 5718 */ 5719 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); 5720 5721 mutex_lock(&inode->i_mutex); 5722 if (alloc_start > inode->i_size) { 5723 ret = btrfs_cont_expand(inode, alloc_start); 5724 if (ret) 5725 goto out; 5726 } 5727 5728 root = BTRFS_I(inode)->root; 5729 5730 ret = btrfs_check_data_free_space(root, inode, 5731 alloc_end - alloc_start); 5732 if (ret) 5733 goto out; 5734 5735 locked_end = alloc_end - 1; 5736 while (1) { 5737 struct btrfs_ordered_extent *ordered; 5738 5739 trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); 5740 if (!trans) { 5741 ret = -EIO; 5742 goto out_free; 5743 } 5744 5745 /* the extent lock is ordered inside the running 5746 * transaction 5747 */ 5748 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5749 GFP_NOFS); 5750 ordered = btrfs_lookup_first_ordered_extent(inode, 5751 alloc_end - 1); 5752 if (ordered && 5753 ordered->file_offset + ordered->len > alloc_start && 5754 ordered->file_offset < alloc_end) { 5755 btrfs_put_ordered_extent(ordered); 5756 unlock_extent(&BTRFS_I(inode)->io_tree, 5757 alloc_start, locked_end, GFP_NOFS); 5758 btrfs_end_transaction(trans, BTRFS_I(inode)->root); 5759 5760 /* 5761 * we can't wait on the range with the transaction 5762 * running or with the extent lock held 5763 */ 5764 btrfs_wait_ordered_range(inode, alloc_start, 5765 alloc_end - alloc_start); 5766 } else { 5767 if (ordered) 5768 btrfs_put_ordered_extent(ordered); 5769 break; 5770 } 5771 } 5772 5773 cur_offset = alloc_start; 5774 while (1) { 5775 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 5776 alloc_end - cur_offset, 0); 5777 BUG_ON(IS_ERR(em) || !em); 5778 last_byte = min(extent_map_end(em), alloc_end); 5779 last_byte = (last_byte + mask) & ~mask; 5780 if (em->block_start == EXTENT_MAP_HOLE) { 5781 ret = prealloc_file_range(trans, inode, cur_offset, 5782 last_byte, locked_end + 1, 5783 alloc_hint, mode); 5784 if (ret < 0) { 5785 free_extent_map(em); 5786 break; 5787 } 5788 } 5789 if (em->block_start <= EXTENT_MAP_LAST_BYTE) 5790 alloc_hint = em->block_start; 5791 free_extent_map(em); 5792 5793 cur_offset = last_byte; 5794 if (cur_offset >= alloc_end) { 5795 ret = 0; 5796 break; 5797 } 5798 } 5799 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5800 GFP_NOFS); 5801 5802 btrfs_end_transaction(trans, BTRFS_I(inode)->root); 5803 out_free: 5804 btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start); 5805 out: 5806 mutex_unlock(&inode->i_mutex); 5807 return ret; 5808 } 5809 5810 static int btrfs_set_page_dirty(struct page *page) 5811 { 5812 return __set_page_dirty_nobuffers(page); 5813 } 5814 5815 static int btrfs_permission(struct inode *inode, int mask) 5816 { 5817 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 5818 return -EACCES; 5819 return generic_permission(inode, mask, btrfs_check_acl); 5820 } 5821 5822 static const struct inode_operations btrfs_dir_inode_operations = { 5823 .getattr = btrfs_getattr, 5824 .lookup = btrfs_lookup, 5825 .create = btrfs_create, 5826 .unlink = btrfs_unlink, 5827 .link = btrfs_link, 5828 .mkdir = btrfs_mkdir, 5829 .rmdir = btrfs_rmdir, 5830 .rename = btrfs_rename, 5831 .symlink = btrfs_symlink, 5832 .setattr = btrfs_setattr, 5833 .mknod = btrfs_mknod, 5834 .setxattr = btrfs_setxattr, 5835 
static const struct inode_operations btrfs_dir_inode_operations = {
	.getattr = btrfs_getattr,
	.lookup = btrfs_lookup,
	.create = btrfs_create,
	.unlink = btrfs_unlink,
	.link = btrfs_link,
	.mkdir = btrfs_mkdir,
	.rmdir = btrfs_rmdir,
	.rename = btrfs_rename,
	.symlink = btrfs_symlink,
	.setattr = btrfs_setattr,
	.mknod = btrfs_mknod,
	.setxattr = btrfs_setxattr,
	.getxattr = btrfs_getxattr,
	.listxattr = btrfs_listxattr,
	.removexattr = btrfs_removexattr,
	.permission = btrfs_permission,
};

static const struct inode_operations btrfs_dir_ro_inode_operations = {
	.lookup = btrfs_lookup,
	.permission = btrfs_permission,
};

static const struct file_operations btrfs_dir_file_operations = {
	.llseek = generic_file_llseek,
	.read = generic_read_dir,
	.readdir = btrfs_real_readdir,
	.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = btrfs_ioctl,
#endif
	.release = btrfs_release_file,
	.fsync = btrfs_sync_file,
};

static struct extent_io_ops btrfs_extent_io_ops = {
	.fill_delalloc = run_delalloc_range,
	.submit_bio_hook = btrfs_submit_bio_hook,
	.merge_bio_hook = btrfs_merge_bio_hook,
	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
	.writepage_start_hook = btrfs_writepage_start_hook,
	.readpage_io_failed_hook = btrfs_io_failed_hook,
	.set_bit_hook = btrfs_set_bit_hook,
	.clear_bit_hook = btrfs_clear_bit_hook,
	.merge_extent_hook = btrfs_merge_extent_hook,
	.split_extent_hook = btrfs_split_extent_hook,
};

/*
 * btrfs doesn't support the bmap operation because swapfiles
 * use bmap to make a mapping of extents in the file.  They assume
 * these extents won't change over the life of the file and they
 * use the bmap result to do IO directly to the drive.
 *
 * the btrfs bmap call would return logical addresses that aren't
 * suitable for IO and they also will change frequently as COW
 * operations happen.  So, swapfile + btrfs == corruption.
 *
 * For now we're avoiding this by dropping bmap.
 */
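/*
 * Illustration (editor's addition, not part of the original source):
 * with ->bmap left out of btrfs_aops below, the generic bmap() helper
 * in fs/inode.c returns 0 for every block, so sys_swapon() cannot
 * build a swap extent map for the file and rejects it outright,
 * rather than letting the swap code write through block numbers that
 * COW would immediately make stale.
 */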
static const struct address_space_operations btrfs_aops = {
	.readpage = btrfs_readpage,
	.writepage = btrfs_writepage,
	.writepages = btrfs_writepages,
	.readpages = btrfs_readpages,
	.sync_page = block_sync_page,
	.direct_IO = btrfs_direct_IO,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage = btrfs_releasepage,
	.set_page_dirty = btrfs_set_page_dirty,
	.error_remove_page = generic_error_remove_page,
};

static const struct address_space_operations btrfs_symlink_aops = {
	.readpage = btrfs_readpage,
	.writepage = btrfs_writepage,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage = btrfs_releasepage,
};

static const struct inode_operations btrfs_file_inode_operations = {
	.truncate = btrfs_truncate,
	.getattr = btrfs_getattr,
	.setattr = btrfs_setattr,
	.setxattr = btrfs_setxattr,
	.getxattr = btrfs_getxattr,
	.listxattr = btrfs_listxattr,
	.removexattr = btrfs_removexattr,
	.permission = btrfs_permission,
	.fallocate = btrfs_fallocate,
	.fiemap = btrfs_fiemap,
};

static const struct inode_operations btrfs_special_inode_operations = {
	.getattr = btrfs_getattr,
	.setattr = btrfs_setattr,
	.permission = btrfs_permission,
	.setxattr = btrfs_setxattr,
	.getxattr = btrfs_getxattr,
	.listxattr = btrfs_listxattr,
	.removexattr = btrfs_removexattr,
};

static const struct inode_operations btrfs_symlink_inode_operations = {
	.readlink = generic_readlink,
	.follow_link = page_follow_link_light,
	.put_link = page_put_link,
	.permission = btrfs_permission,
	.setxattr = btrfs_setxattr,
	.getxattr = btrfs_getxattr,
	.listxattr = btrfs_listxattr,
	.removexattr = btrfs_removexattr,
};

const struct dentry_operations btrfs_dentry_operations = {
	.d_delete = btrfs_dentry_delete,
};
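/*
 * Editor's note (illustration, not part of the original source): the
 * symlink tables above work together.  generic_readlink() and
 * page_follow_link_light() use the generic page cache helpers, which
 * read page 0 of the link through btrfs_symlink_aops->readpage; that
 * page is filled from the inline extent btrfs_symlink() wrote, so a
 * symlink target never needs a dedicated data extent.
 */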