/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "volumes.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "compression.h"
#include "locking.h"

struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;

#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};

static void btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir);
	return err;
}
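/*
 * worked example (illustrative note, not from the original source): the
 * btrfs_type_by_mode table above is indexed by the S_IFMT bits of i_mode
 * shifted down by S_SHIFT:
 *
 *	u8 ftype = btrfs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
 *
 * e.g. S_IFDIR is 0040000, and 0040000 >> 12 == 4, so slot 4 of the
 * table holds BTRFS_FT_DIR.
 */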
/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				struct page **compressed_pages)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int err = 0;
	int ret;
	size_t cur_size = size;
	size_t datasize;
	unsigned long offset;
	int compress_type = BTRFS_COMPRESS_NONE;

	if (compressed_size && compressed_pages) {
		compress_type = root->fs_info->compress_type;
		cur_size = compressed_size;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;
	btrfs_set_trans_block_group(trans, inode);

	key.objectid = inode->i_ino;
	key.offset = start;
	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
	datasize = btrfs_file_extent_calc_inline_size(cur_size);

	inode_add_bytes(inode, size);
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (ret) {
		err = ret;
		goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
					 PAGE_CACHE_SIZE);

			kaddr = kmap_atomic(cpage, KM_USER0);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr, KM_USER0);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_CACHE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page, KM_USER0);
		offset = start & (PAGE_CACHE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr, KM_USER0);
		page_cache_release(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	btrfs_update_inode(trans, root, inode);

	return 0;
fail:
	btrfs_free_path(path);
	return err;
}
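/*
 * worked example (illustrative note): the copy loop above moves the
 * compressed stream into the leaf one page at a time, so with 4k pages
 * a 10000 byte stream is written as three chunks of 4096 + 4096 + 1808
 * bytes taken from compressed_pages[0..2].
 */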
/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct inode *inode, u64 start, u64 end,
				 size_t compressed_size,
				 struct page **compressed_pages)
{
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = (end + root->sectorsize - 1) &
			~((u64)root->sectorsize - 1);
	u64 hint_byte;
	u64 data_len = inline_len;
	int ret;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end >= PAGE_CACHE_SIZE ||
	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
	    (!compressed_size &&
	    (actual_end & (root->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > root->fs_info->max_inline) {
		return 1;
	}

	ret = btrfs_drop_extents(trans, inode, start, aligned_end,
				 &hint_byte, 1);
	BUG_ON(ret);

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, root, inode, start,
				   inline_len, compressed_size,
				   compressed_pages);
	BUG_ON(ret);
	btrfs_delalloc_release_metadata(inode, end + 1 - start);
	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
	return 0;
}

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	if (!async_extent)
		return -ENOMEM;
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}
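/*
 * worked example (illustrative note): a 1MB dirty range handed to the
 * async path is first chopped into 512k async_cow units (see
 * cow_file_range_async below), and when compression succeeds
 * compress_file_range emits at most 128k of uncompressed data per
 * async_extent, so each unit queues up to four async_extents on the
 * cow->extents list.
 */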
/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that pdflush sent them down.
 */
static noinline int compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 num_bytes;
	u64 blocksize = root->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long nr_pages_ret = 0;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned long max_compressed = 128 * 1024;
	unsigned long max_uncompressed = 128 * 1024;
	int i;
	int will_compress;
	int compress_type = root->fs_info->compress_type;

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/* we want to make sure that amount of ram required to uncompress
	 * an extent is reasonable, so we limit the total size in ram
	 * of a compressed extent to 128k.  This is a crucial number
	 * because it also controls how easily we can spread reads across
	 * cpus for decompression.
	 *
	 * We also want to make sure the amount of IO required to do
	 * a random read is reasonably small, so we limit the size of
	 * a compressed extent to 128k.
	 */
	total_compressed = min(total_compressed, max_uncompressed);
	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
	num_bytes = max(blocksize, num_bytes);
	total_in = 0;
	ret = 0;
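	/*
	 * worked example (illustrative note): the rounding above turns
	 * (end - start + 1) into a multiple of the block size, e.g. with
	 * a 4k blocksize a 5000 byte range becomes
	 * (4999 + 4096) & ~4095 == 8192, i.e. two blocks.
	 */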
	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
	    (btrfs_test_opt(root, COMPRESS) ||
	     (BTRFS_I(inode)->force_compress))) {
		WARN_ON(pages);
		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);

		if (BTRFS_I(inode)->force_compress)
			compress_type = BTRFS_I(inode)->force_compress;

		ret = btrfs_compress_pages(compress_type,
					   inode->i_mapping, start,
					   total_compressed, pages,
					   nr_pages, &nr_pages_ret,
					   &total_in,
					   &total_compressed,
					   max_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_CACHE_SIZE - 1);
			struct page *page = pages[nr_pages_ret - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page, KM_USER0);
				memset(kaddr + offset, 0,
				       PAGE_CACHE_SIZE - offset);
				kunmap_atomic(kaddr, KM_USER0);
			}
			will_compress = 1;
		}
	}
	if (start == 0) {
		trans = btrfs_join_transaction(root, 1);
		BUG_ON(!trans);
		btrfs_set_trans_block_group(trans, inode);
		trans->block_rsv = &root->fs_info->delalloc_block_rsv;

		/* lets try to make an inline extent */
		if (ret || total_in < (actual_end - start)) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(trans, root, inode,
						    start, end, 0, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(trans, root, inode,
						    start, end,
						    total_compressed, pages);
		}
		if (ret == 0) {
			/*
			 * inline extent creation worked, we don't need
			 * to create any more async work items.  Unlock
			 * and free up our temp pages.
			 */
			extent_clear_unlock_delalloc(inode,
			     &BTRFS_I(inode)->io_tree,
			     start, end, NULL,
			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
			     EXTENT_CLEAR_DELALLOC |
			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);

			btrfs_end_transaction(trans, root);
			goto free_pages_out;
		}
		btrfs_end_transaction(trans, root);
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, round the compressed
		 * size up to a block size boundary so the allocator does
		 * sane things
		 */
		total_compressed = (total_compressed + blocksize - 1) &
			~(blocksize - 1);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk
		 */
		total_in = (total_in + PAGE_CACHE_SIZE - 1) &
			~(PAGE_CACHE_SIZE - 1);
		if (total_compressed >= total_in) {
			will_compress = 0;
		} else {
			num_bytes = total_in;
		}
	}
	if (!will_compress && pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages_ret; i++) {
			WARN_ON(pages[i]->mapping);
			page_cache_release(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages_ret = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->force_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
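	/*
	 * worked example (illustrative note): the "is compression a win"
	 * check above compares both sizes after rounding.  With 4k blocks
	 * and 4k pages, total_in = 8192 and total_compressed = 5000 gives
	 * total_compressed rounded to 8192, which is not smaller than
	 * total_in, so will_compress is dropped and the range goes down
	 * uncompressed.
	 */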
	if (will_compress) {
		*num_added += 1;

		/* the async work queues will take care of doing actual
		 * allocation on disk for these compressed pages,
		 * and will submit them to the elevator.
		 */
		add_async_extent(async_cow, start, num_bytes,
				 total_compressed, pages, nr_pages_ret,
				 compress_type);

		if (start + num_bytes < end) {
			start += num_bytes;
			pages = NULL;
			cond_resched();
			goto again;
		}
	} else {
cleanup_and_bail_uncompressed:
		/*
		 * No compression, but we still need to write the pages in
		 * the file we've been given so far.  redirty the locked
		 * page if it corresponds to our extent and set things up
		 * for the async work queue to run cow_file_range to do
		 * the normal delalloc dance
		 */
		if (page_offset(locked_page) >= start &&
		    page_offset(locked_page) <= end) {
			__set_page_dirty_nobuffers(locked_page);
			/* unlocked later on in the async handlers */
		}
		add_async_extent(async_cow, start, end - start + 1,
				 0, NULL, 0, BTRFS_COMPRESS_NONE);
		*num_added += 1;
	}

out:
	return 0;

free_pages_out:
	for (i = 0; i < nr_pages_ret; i++) {
		WARN_ON(pages[i]->mapping);
		page_cache_release(pages[i]);
	}
	kfree(pages);

	goto out;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline int submit_compressed_extents(struct inode *inode,
					      struct async_cow *async_cow)
{
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_trans_handle *trans;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree;
	int ret = 0;

	if (list_empty(&async_cow->extents))
		return 0;

	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
				    async_extent->start +
				    async_extent->ram_size - 1, GFP_NOFS);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0);
			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(io_tree,
						  inode, async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  btrfs_get_extent,
						  WB_SYNC_ALL);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1,
			    GFP_NOFS);

		trans = btrfs_join_transaction(root, 1);
		ret = btrfs_reserve_extent(trans, root,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint,
					   (u64)-1, &ins, 1);
		btrfs_end_transaction(trans, root);

		if (ret) {
			int i;
			for (i = 0; i < async_extent->nr_pages; i++) {
				WARN_ON(async_extent->pages[i]->mapping);
				page_cache_release(async_extent->pages[i]);
			}
			kfree(async_extent->pages);
			async_extent->nr_pages = 0;
			async_extent->pages = NULL;
			unlock_extent(io_tree, async_extent->start,
				      async_extent->start +
				      async_extent->ram_size - 1, GFP_NOFS);
			goto retry;
		}

		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		btrfs_drop_extent_cache(inode, async_extent->start,
					async_extent->start +
					async_extent->ram_size - 1, 0);

		em = alloc_extent_map(GFP_NOFS);
		em->start = async_extent->start;
		em->len = async_extent->ram_size;
		em->orig_start = em->start;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		em->compress_type = async_extent->compress_type;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
		}

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		BUG_ON(ret);

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode,
				&BTRFS_I(inode)->io_tree,
				async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_CLEAR_UNLOCK_PAGE |
				EXTENT_CLEAR_UNLOCK |
				EXTENT_CLEAR_DELALLOC |
				EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);

		ret = btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages);

		BUG_ON(ret);
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}

	return 0;
}
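/*
 * illustrative note: btrfs_reserve_extent fills the ins key with the
 * start of the reservation in ins.objectid and its length in ins.offset,
 * which is why those two fields feed em->block_start/em->block_len and
 * the ordered extent record above.
 */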
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written,
				   int unlock)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 disk_num_bytes;
	u64 cur_alloc_size;
	u64 blocksize = root->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	int ret = 0;

	BUG_ON(root == root->fs_info->tree_root);
	trans = btrfs_join_transaction(root, 1);
	BUG_ON(!trans);
	btrfs_set_trans_block_group(trans, inode);
	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
	num_bytes = max(blocksize, num_bytes);
	disk_num_bytes = num_bytes;
	ret = 0;

	if (start == 0) {
		/* lets try to make an inline extent */
		ret = cow_file_range_inline(trans, root, inode,
					    start, end, 0, NULL);
		if (ret == 0) {
			extent_clear_unlock_delalloc(inode,
				     &BTRFS_I(inode)->io_tree,
				     start, end, NULL,
				     EXTENT_CLEAR_UNLOCK_PAGE |
				     EXTENT_CLEAR_UNLOCK |
				     EXTENT_CLEAR_DELALLOC |
				     EXTENT_CLEAR_DIRTY |
				     EXTENT_SET_WRITEBACK |
				     EXTENT_END_WRITEBACK);

			*nr_written = *nr_written +
			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
			*page_started = 1;
			ret = 0;
			goto out;
		}
	}

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(&root->fs_info->super_copy));

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

	while (disk_num_bytes > 0) {
		unsigned long op;

		cur_alloc_size = disk_num_bytes;
		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
					   root->sectorsize, 0, alloc_hint,
					   (u64)-1, &ins, 1);
		BUG_ON(ret);

		em = alloc_extent_map(GFP_NOFS);
		em->start = start;
		em->orig_start = em->start;
		ram_size = ins.offset;
		em->len = ins.offset;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}
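		/*
		 * illustrative note: add_extent_mapping only returns
		 * -EEXIST when a cached mapping overlaps
		 * [start, start + ram_size - 1], so dropping that cached
		 * range and retrying is guaranteed to terminate with the
		 * new mapping inserted.
		 */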
		cur_alloc_size = ins.offset;
		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		BUG_ON(ret);

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			BUG_ON(ret);
		}

		if (disk_num_bytes < cur_alloc_size)
			break;

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
		 */
		op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
		op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
			EXTENT_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
					     start, start + ram_size - 1,
					     locked_page, op);
		disk_num_bytes -= cur_alloc_size;
		num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
	}
out:
	ret = 0;
	btrfs_end_transaction(trans, root);

	return ret;
}

/*
 * work queue call back to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	int num_added = 0;
	async_cow = container_of(work, struct async_cow, work);

	compress_file_range(async_cow->inode, async_cow->locked_page,
			    async_cow->start, async_cow->end, async_cow,
			    &num_added);
	if (num_added == 0)
		async_cow->inode = NULL;
}

/*
 * work queue call back to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
		PAGE_CACHE_SHIFT;

	atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);

	if (atomic_read(&root->fs_info->async_delalloc_pages) <
	    5 * 1024 * 1024 &&
	    waitqueue_active(&root->fs_info->async_submit_wait))
		wake_up(&root->fs_info->async_submit_wait);

	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	async_cow = container_of(work, struct async_cow, work);
	kfree(async_cow);
}

static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;
	int limit = 10 * 1024 * 1024;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL, GFP_NOFS);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		async_cow->inode = inode;
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;

		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
			cur_end = end;
		else
			cur_end = min(end, start + 512 * 1024 - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);
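		/*
		 * illustrative note: btrfs_work runs in two stages here.
		 * work.func (async_cow_start) does the cpu heavy
		 * compression on any worker thread, while
		 * work.ordered_func (async_cow_submit) is called strictly
		 * in queueing order, which is what keeps the on-disk
		 * submission ordered even though compression is parallel.
		 */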
		async_cow->work.func = async_cow_start;
		async_cow->work.ordered_func = async_cow_submit;
		async_cow->work.ordered_free = async_cow_free;
		async_cow->work.flags = 0;

		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
			PAGE_CACHE_SHIFT;
		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

		btrfs_queue_worker(&root->fs_info->delalloc_workers,
				   &async_cow->work);

		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
			wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->async_delalloc_pages) <
			    limit));
		}

		while (atomic_read(&root->fs_info->async_submit_draining) &&
		      atomic_read(&root->fs_info->async_delalloc_pages)) {
			wait_event(root->fs_info->async_submit_wait,
			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
			   0));
		}

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}

static noinline int csum_exist_in_range(struct btrfs_root *root,
					u64 bytenr, u64 num_bytes)
{
	int ret;
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	return 1;
}

/*
 * called when nocow writeback kicks in.  This checks for snapshots or
 * COW copies of the extents that exist in the file, and COWs the file
 * as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct inode *inode,
				       struct page *locked_page,
			      u64 start, u64 end, int *page_started, int force,
			      unsigned long *nr_written)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key found_key;
	u64 cow_start;
	u64 cur_offset;
	u64 extent_end;
	u64 extent_offset;
	u64 disk_bytenr;
	u64 num_bytes;
	int extent_type;
	int ret;
	int type;
	int nocow;
	int check_prev = 1;
	bool nolock = false;

	path = btrfs_alloc_path();
	BUG_ON(!path);
	if (root == root->fs_info->tree_root) {
		nolock = true;
		trans = btrfs_join_transaction_nolock(root, 1);
	} else {
		trans = btrfs_join_transaction(root, 1);
	}
	BUG_ON(!trans);

	cow_start = (u64)-1;
	cur_offset = start;
	while (1) {
		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
					       cur_offset, 0);
		BUG_ON(ret < 0);
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == inode->i_ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			BUG_ON(ret < 0);
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		nocow = 0;
		disk_bytenr = 0;
		num_bytes = 0;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid > inode->i_ino ||
		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto out_check;
		}

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
			if (extent_end <= start) {
				path->slots[0]++;
				goto next_slot;
			}
			if (disk_bytenr == 0)
				goto out_check;
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;
			if (btrfs_extent_readonly(root, disk_bytenr))
				goto out_check;
			if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
						  found_key.offset -
						  extent_offset, disk_bytenr))
				goto out_check;
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * force cow if csum exists in the range.
			 * this ensures that csums for a given extent are
			 * either valid or do not exist.
			 */
			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
				goto out_check;
			nocow = 1;
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = found_key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
			extent_end = ALIGN(extent_end, root->sectorsize);
		} else {
			BUG_ON(1);
		}
out_check:
		if (extent_end <= start) {
			path->slots[0]++;
			goto next_slot;
		}
		if (!nocow) {
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			path->slots[0]++;
			goto next_slot;
		}

		btrfs_release_path(root, path);
		if (cow_start != (u64)-1) {
			ret = cow_file_range(inode, locked_page, cow_start,
					     found_key.offset - 1, page_started,
					     nr_written, 1);
			BUG_ON(ret);
			cow_start = (u64)-1;
		}

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct extent_map *em;
			struct extent_map_tree *em_tree;
			em_tree = &BTRFS_I(inode)->extent_tree;
			em = alloc_extent_map(GFP_NOFS);
			em->start = cur_offset;
			em->orig_start = em->start;
			em->len = num_bytes;
			em->block_len = num_bytes;
			em->block_start = disk_bytenr;
			em->bdev = root->fs_info->fs_devices->latest_bdev;
			set_bit(EXTENT_FLAG_PINNED, &em->flags);
			while (1) {
				write_lock(&em_tree->lock);
				ret = add_extent_mapping(em_tree, em);
				write_unlock(&em_tree->lock);
				if (ret != -EEXIST) {
					free_extent_map(em);
					break;
				}
				btrfs_drop_extent_cache(inode, em->start,
						em->start + em->len - 1, 0);
			}
			type = BTRFS_ORDERED_PREALLOC;
		} else {
			type = BTRFS_ORDERED_NOCOW;
		}

		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
					       num_bytes, num_bytes, type);
		BUG_ON(ret);

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, cur_offset,
						      num_bytes);
			BUG_ON(ret);
		}
		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
				cur_offset, cur_offset + num_bytes - 1,
				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
				EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
				EXTENT_SET_PRIVATE2);
		cur_offset = extent_end;
		if (cur_offset > end)
			break;
	}
	btrfs_release_path(root, path);

	if (cur_offset <= end && cow_start == (u64)-1)
		cow_start = cur_offset;
	if (cow_start != (u64)-1) {
		ret = cow_file_range(inode, locked_page, cow_start, end,
				     page_started, nr_written, 1);
		BUG_ON(ret);
	}

	if (nolock) {
		ret = btrfs_end_transaction_nolock(trans, root);
		BUG_ON(ret);
	} else {
		ret = btrfs_end_transaction(trans, root);
		BUG_ON(ret);
	}
	btrfs_free_path(path);
	return 0;
}

/*
 * extent_io.c call back to do delayed allocation processing
 */
static int run_delalloc_range(struct inode *inode, struct page *locked_page,
			      u64 start, u64 end, int *page_started,
			      unsigned long *nr_written)
{
	int ret;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 1, nr_written);
	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
	else if (!btrfs_test_opt(root, COMPRESS) &&
		 !(BTRFS_I(inode)->force_compress))
		ret = cow_file_range(inode, locked_page, start, end,
				     page_started, nr_written, 1);
	else
		ret = cow_file_range_async(inode, locked_page, start, end,
					   page_started, nr_written);
	return ret;
}

static int btrfs_split_extent_hook(struct inode *inode,
				   struct extent_state *orig, u64 split)
{
	/* not delalloc, ignore it */
	if (!(orig->state & EXTENT_DELALLOC))
		return 0;

	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
	return 0;
}

/*
 * extent_io.c merge_extent_hook, used to track merged delayed allocation
 * extents so we can keep track of new extents that are just merged onto old
 * extents, such as when we are doing sequential writes, so we can properly
 * account for the metadata space we'll need.
 */
static int btrfs_merge_extent_hook(struct inode *inode,
				   struct extent_state *new,
				   struct extent_state *other)
{
	/* not delalloc, ignore it */
	if (!(other->state & EXTENT_DELALLOC))
		return 0;

	atomic_dec(&BTRFS_I(inode)->outstanding_extents);
	return 0;
}
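/*
 * illustrative note: the two hooks above keep outstanding_extents in
 * step with the extent state tree.  Splitting one delalloc state into
 * two means one more extent to reserve metadata for, and merging two
 * back into one means one fewer, so sequential writes don't
 * over-reserve.
 */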
/*
 * extent_io.c set_bit_hook, used to track delayed allocation
 * bytes in this file, and to maintain the list of inodes that
 * have pending delalloc work to be done.
 */
static int btrfs_set_bit_hook(struct inode *inode,
			      struct extent_state *state, int *bits)
{

	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		int do_list = (root->root_key.objectid !=
			       BTRFS_ROOT_TREE_OBJECTID);

		if (*bits & EXTENT_FIRST_DELALLOC)
			*bits &= ~EXTENT_FIRST_DELALLOC;
		else
			atomic_inc(&BTRFS_I(inode)->outstanding_extents);

		spin_lock(&root->fs_info->delalloc_lock);
		BTRFS_I(inode)->delalloc_bytes += len;
		root->fs_info->delalloc_bytes += len;
		if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
				      &root->fs_info->delalloc_inodes);
		}
		spin_unlock(&root->fs_info->delalloc_lock);
	}
	return 0;
}

/*
 * extent_io.c clear_bit_hook, see set_bit_hook for why
 */
static int btrfs_clear_bit_hook(struct inode *inode,
				struct extent_state *state, int *bits)
{
	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		int do_list = (root->root_key.objectid !=
			       BTRFS_ROOT_TREE_OBJECTID);

		if (*bits & EXTENT_FIRST_DELALLOC)
			*bits &= ~EXTENT_FIRST_DELALLOC;
		else if (!(*bits & EXTENT_DO_ACCOUNTING))
			atomic_dec(&BTRFS_I(inode)->outstanding_extents);

		if (*bits & EXTENT_DO_ACCOUNTING)
			btrfs_delalloc_release_metadata(inode, len);

		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
		    && do_list)
			btrfs_free_reserved_data_space(inode, len);

		spin_lock(&root->fs_info->delalloc_lock);
		root->fs_info->delalloc_bytes -= len;
		BTRFS_I(inode)->delalloc_bytes -= len;

		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
		}
		spin_unlock(&root->fs_info->delalloc_lock);
	}
	return 0;
}

/*
 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
 * we don't create bios that span stripes or chunks
 */
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
			 size_t size, struct bio *bio,
			 unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	struct btrfs_mapping_tree *map_tree;
	u64 logical = (u64)bio->bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;

	if (bio_flags & EXTENT_BIO_COMPRESSED)
		return 0;

	length = bio->bi_size;
	map_tree = &root->fs_info->mapping_tree;
	map_length = length;
	ret = btrfs_map_block(map_tree, READ, logical,
			      &map_length, NULL, 0);

	if (map_length < length + size)
		return 1;
	return ret;
}
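/*
 * worked example (illustrative note): btrfs_map_block returns in
 * map_length how far the mapping at 'logical' stays contiguous.  If a
 * bio already holds 64k there but the chunk is only contiguous for 64k,
 * adding one more 4k page makes length + size == 68k > map_length, so
 * we return 1 and the caller starts a new bio instead of spanning the
 * stripe.
 */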
/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.  All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags,
				    u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;

	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
	BUG_ON(ret);
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.  All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
			  int mirror_num, unsigned long bio_flags,
			  u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
}

/*
 * extent_io.c submission hook. This does the right thing for csum calculation
 * on write, or reading the csums from the tree before a read
 */
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
			  int mirror_num, unsigned long bio_flags,
			  u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	int skip_sum;

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	if (root == root->fs_info->tree_root)
		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
	else
		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
	BUG_ON(ret);

	if (!(rw & REQ_WRITE)) {
		if (bio_flags & EXTENT_BIO_COMPRESSED) {
			return btrfs_submit_compressed_read(inode, bio,
						    mirror_num, bio_flags);
		} else if (!skip_sum)
			btrfs_lookup_bio_sums(root, inode, bio, NULL);
		goto mapit;
	} else if (!skip_sum) {
		/* csum items have already been cloned */
		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
			goto mapit;
		/* we're doing a write, do the async checksumming */
		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
				   inode, rw, bio, mirror_num,
				   bio_flags, bio_offset,
				   __btrfs_submit_bio_start,
				   __btrfs_submit_bio_done);
	}

mapit:
	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
}
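/*
 * illustrative note: for async checksumming the two helpers above split
 * one submission in half.  btrfs_wq_submit_bio calls
 * __btrfs_submit_bio_start to compute the data csums off the
 * submitter's back, then __btrfs_submit_bio_done actually maps and
 * issues the bio once the csums are attached.
 */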
/*
 * given a list of ordered sums record them in the inode.  This happens
 * at IO completion time based on sums calculated at bio submission time.
 */
static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
			     struct inode *inode, u64 file_offset,
			     struct list_head *list)
{
	struct btrfs_ordered_sum *sum;

	btrfs_set_trans_block_group(trans, inode);

	list_for_each_entry(sum, list, list) {
		btrfs_csum_file_blocks(trans,
		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
	}
	return 0;
}

int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
			      struct extent_state **cached_state)
{
	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
		WARN_ON(1);
	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
				   cached_state, GFP_NOFS);
}

/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
	struct page *page;
	struct btrfs_work work;
};

static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct page *page;
	struct inode *inode;
	u64 page_start;
	u64 page_end;

	fixup = container_of(work, struct btrfs_writepage_fixup, work);
	page = fixup->page;
again:
	lock_page(page);
	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
		ClearPageChecked(page);
		goto out_page;
	}

	inode = page->mapping->host;
	page_start = page_offset(page);
	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
			 &cached_state, GFP_NOFS);

	/* already ordered? We're done */
	if (PagePrivate2(page))
		goto out;

	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered) {
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
				     page_end, &cached_state, GFP_NOFS);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		goto again;
	}

	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
	ClearPageChecked(page);
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
			     &cached_state, GFP_NOFS);
out_page:
	unlock_page(page);
	page_cache_release(page);
}
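/*
 * illustrative note: PagePrivate2 is the marker this path relies on.
 * cow_file_range and friends set it (EXTENT_SET_PRIVATE2) once a page
 * is covered by an ordered extent, so finding it set above means the
 * page was properly set up for writepage and no fixup is needed.
 */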
/*
 * There are a few paths in the higher layers of the kernel that directly
 * set the page dirty bit without asking the filesystem if it is a
 * good idea.  This causes problems because we want to make sure COW
 * properly happens and the data=ordered rules are followed.
 *
 * In our case any range that doesn't have the ORDERED bit set
 * hasn't been properly setup for IO.  We kick off an async process
 * to fix it up.  The async helper will wait for ordered extents, set
 * the delalloc bit and make it safe to write the page.
 */
static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* this page is properly in the ordered list */
	if (TestClearPagePrivate2(page))
		return 0;

	if (PageChecked(page))
		return -EAGAIN;

	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
	if (!fixup)
		return -EAGAIN;

	SetPageChecked(page);
	page_cache_get(page);
	fixup->work.func = btrfs_writepage_fixup_worker;
	fixup->page = page;
	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
	return -EAGAIN;
}

static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
				       struct inode *inode, u64 file_pos,
				       u64 disk_bytenr, u64 disk_num_bytes,
				       u64 num_bytes, u64 ram_bytes,
				       u8 compression, u8 encryption,
				       u16 other_encoding, int extent_type)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key ins;
	u64 hint;
	int ret;

	path = btrfs_alloc_path();
	BUG_ON(!path);

	path->leave_spinning = 1;

	/*
	 * we may be replacing one extent in the tree with another.
	 * The new extent is pinned in the extent map, and we don't want
	 * to drop it from the cache until it is completely in the btree.
	 *
	 * So, tell btrfs_drop_extents to leave this extent in the cache.
	 * the caller is expected to unpin it and allow it to be merged
	 * with the others.
	 */
	ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
				 &hint, 0);
	BUG_ON(ret);

	ins.objectid = inode->i_ino;
	ins.offset = file_pos;
	ins.type = BTRFS_EXTENT_DATA_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
	BUG_ON(ret);
	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_set_file_extent_type(leaf, fi, extent_type);
	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
	btrfs_set_file_extent_offset(leaf, fi, 0);
	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
	btrfs_set_file_extent_compression(leaf, fi, compression);
	btrfs_set_file_extent_encryption(leaf, fi, encryption);
	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);

	btrfs_unlock_up_safe(path, 1);
	btrfs_set_lock_blocking(leaf);

	btrfs_mark_buffer_dirty(leaf);

	inode_add_bytes(inode, num_bytes);

	ins.objectid = disk_bytenr;
	ins.offset = disk_num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_alloc_reserved_file_extent(trans, root,
					       root->root_key.objectid,
					       inode->i_ino, file_pos, &ins);
	BUG_ON(ret);
	btrfs_free_path(path);

	return 0;
}
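/*
 * illustrative note: insert_reserved_file_extent reuses the same
 * btrfs_key twice on purpose, first keyed as (inode, EXTENT_DATA,
 * file_pos) for the file extent item in the fs tree, then rewritten as
 * (disk_bytenr, EXTENT_ITEM, disk_num_bytes) to record the allocation
 * in the extent tree via btrfs_alloc_reserved_file_extent.
 */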
/* as ordered data IO finishes, this gets called so we can finish
 * an ordered extent if the range of bytes in the file it covers are
 * fully written.
 */
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_ordered_extent *ordered_extent = NULL;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_state *cached_state = NULL;
	int compress_type = 0;
	int ret;
	bool nolock = false;

	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
					     end - start + 1);
	if (!ret)
		return 0;
	BUG_ON(!ordered_extent);

	nolock = (root == root->fs_info->tree_root);

	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
		BUG_ON(!list_empty(&ordered_extent->list));
		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
		if (!ret) {
			if (nolock)
				trans = btrfs_join_transaction_nolock(root, 1);
			else
				trans = btrfs_join_transaction(root, 1);
			BUG_ON(!trans);
			btrfs_set_trans_block_group(trans, inode);
			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
			ret = btrfs_update_inode(trans, root, inode);
			BUG_ON(ret);
		}
		goto out;
	}

	lock_extent_bits(io_tree, ordered_extent->file_offset,
			 ordered_extent->file_offset + ordered_extent->len - 1,
			 0, &cached_state, GFP_NOFS);

	if (nolock)
		trans = btrfs_join_transaction_nolock(root, 1);
	else
		trans = btrfs_join_transaction(root, 1);
	btrfs_set_trans_block_group(trans, inode);
	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
		compress_type = ordered_extent->compress_type;
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
		BUG_ON(compress_type);
		ret = btrfs_mark_extent_written(trans, inode,
						ordered_extent->file_offset,
						ordered_extent->file_offset +
						ordered_extent->len);
		BUG_ON(ret);
	} else {
		BUG_ON(root == root->fs_info->tree_root);
		ret = insert_reserved_file_extent(trans, inode,
						ordered_extent->file_offset,
						ordered_extent->start,
						ordered_extent->disk_len,
						ordered_extent->len,
						ordered_extent->len,
						compress_type, 0, 0,
						BTRFS_FILE_EXTENT_REG);
		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
				   ordered_extent->file_offset,
				   ordered_extent->len);
		BUG_ON(ret);
	}
	unlock_extent_cached(io_tree, ordered_extent->file_offset,
			     ordered_extent->file_offset +
			     ordered_extent->len - 1, &cached_state, GFP_NOFS);

	add_pending_csums(trans, inode, ordered_extent->file_offset,
			  &ordered_extent->list);

	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
	ret = btrfs_update_inode(trans, root, inode);
	BUG_ON(ret);
out:
	if (nolock) {
		if (trans)
			btrfs_end_transaction_nolock(trans, root);
	} else {
		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
		if (trans)
			btrfs_end_transaction(trans, root);
	}

	/* once for us */
	btrfs_put_ordered_extent(ordered_extent);
	/* once for the tree */
	btrfs_put_ordered_extent(ordered_extent);

	return 0;
}
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
				struct extent_state *state, int uptodate)
{
	ClearPagePrivate2(page);
	return btrfs_finish_ordered_io(page->mapping->host, start, end);
}

/*
 * When IO fails, either with EIO or csum verification fails, we
 * try other mirrors that might have a good copy of the data.  This
 * io_failure_record is used to record state as we go through all the
 * mirrors.  If another mirror has good data, the page is set up to date
 * and things continue.  If a good mirror can't be found, the original
 * bio end_io callback is called to indicate things have failed.
 */
struct io_failure_record {
	struct page *page;
	u64 start;
	u64 len;
	u64 logical;
	unsigned long bio_flags;
	int last_mirror;
};

static int btrfs_io_failed_hook(struct bio *failed_bio,
			 struct page *page, u64 start, u64 end,
			 struct extent_state *state)
{
	struct io_failure_record *failrec = NULL;
	u64 private;
	struct extent_map *em;
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct bio *bio;
	int num_copies;
	int ret;
	int rw;
	u64 logical;

	ret = get_state_private(failure_tree, start, &private);
	if (ret) {
		failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
		if (!failrec)
			return -ENOMEM;
		failrec->start = start;
		failrec->len = end - start + 1;
		failrec->last_mirror = 0;
		failrec->bio_flags = 0;

		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, failrec->len);
		if (em && (em->start > start || em->start + em->len < start)) {
			free_extent_map(em);
			em = NULL;
		}
		read_unlock(&em_tree->lock);

		if (!em || IS_ERR(em)) {
			kfree(failrec);
			return -EIO;
		}
		logical = start - em->start;
		logical = em->block_start + logical;
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			logical = em->block_start;
			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
			extent_set_compress_type(&failrec->bio_flags,
						 em->compress_type);
		}
		failrec->logical = logical;
		free_extent_map(em);
		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
				EXTENT_DIRTY, GFP_NOFS);
		set_state_private(failure_tree, start,
				  (u64)(unsigned long)failrec);
	} else {
		failrec = (struct io_failure_record *)(unsigned long)private;
	}
	num_copies = btrfs_num_copies(
			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
			      failrec->logical, failrec->len);
	failrec->last_mirror++;
	if (!state) {
		spin_lock(&BTRFS_I(inode)->io_tree.lock);
		state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
						    failrec->start,
						    EXTENT_LOCKED);
		if (state && state->start != failrec->start)
			state = NULL;
		spin_unlock(&BTRFS_I(inode)->io_tree.lock);
	}
	if (!state || failrec->last_mirror > num_copies) {
		set_state_private(failure_tree, failrec->start, 0);
		clear_extent_bits(failure_tree, failrec->start,
				  failrec->start + failrec->len - 1,
				  EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
		kfree(failrec);
		return -EIO;
	}
	bio = bio_alloc(GFP_NOFS, 1);
	bio->bi_private = state;
	bio->bi_end_io = failed_bio->bi_end_io;
	bio->bi_sector = failrec->logical >> 9;
	bio->bi_bdev = failed_bio->bi_bdev;
	bio->bi_size = 0;

	bio_add_page(bio, page, failrec->len, start - page_offset(page));
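	/*
	 * illustrative note: each retry bumps failrec->last_mirror and
	 * resubmits to that mirror, so a two-copy (e.g. RAID1) block is
	 * tried on mirror 1, then mirror 2; once last_mirror passes
	 * num_copies the record is torn down above and -EIO is returned
	 * to the original end_io.
	 */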
	if (failed_bio->bi_rw & REQ_WRITE)
		rw = WRITE;
	else
		rw = READ;

	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
						     failrec->last_mirror,
						     failrec->bio_flags, 0);
	return 0;
}

/*
 * each time an IO finishes, we do a fast check in the IO failure tree
 * to see if we need to process or clean up an io_failure_record
 */
static int btrfs_clean_io_failures(struct inode *inode, u64 start)
{
	u64 private;
	u64 private_failure;
	struct io_failure_record *failure;
	int ret;

	private = 0;
	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
			     (u64)-1, 1, EXTENT_DIRTY)) {
		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
					start, &private_failure);
		if (ret == 0) {
			failure = (struct io_failure_record *)(unsigned long)
				   private_failure;
			set_state_private(&BTRFS_I(inode)->io_failure_tree,
					  failure->start, 0);
			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
					  failure->start,
					  failure->start + failure->len - 1,
					  EXTENT_DIRTY | EXTENT_LOCKED,
					  GFP_NOFS);
			kfree(failure);
		}
	}
	return 0;
}

/*
 * when reads are done, we need to check csums to verify the data is correct.
 * If there's a match, we allow the bio to finish. If not, we go through
 * the io_failure_record routines to find good copies
 */
static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
			       struct extent_state *state)
{
	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	char *kaddr;
	u64 private = ~(u32)0;
	int ret;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u32 csum = ~(u32)0;

	if (PageChecked(page)) {
		ClearPageChecked(page);
		goto good;
	}

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
		return 0;

	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
				  GFP_NOFS);
		return 0;
	}

	if (state && state->start == start) {
		private = state->private;
		ret = 0;
	} else {
		ret = get_state_private(io_tree, start, &private);
	}
	kaddr = kmap_atomic(page, KM_USER0);
	if (ret)
		goto zeroit;

	csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
	btrfs_csum_final(csum, (char *)&csum);
	if (csum != private)
		goto zeroit;

	kunmap_atomic(kaddr, KM_USER0);
good:
	/* if the io failure tree for this inode is non-empty,
	 * check to see if we've recovered from a failed IO
	 */
	btrfs_clean_io_failures(inode, start);
	return 0;

zeroit:
	if (printk_ratelimit()) {
		printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
		       "private %llu\n", page->mapping->host->i_ino,
		       (unsigned long long)start, csum,
		       (unsigned long long)private);
	}
	memset(kaddr + offset, 1, end - start + 1);
	flush_dcache_page(page);
	kunmap_atomic(kaddr, KM_USER0);
	if (private == 0)
		return 0;
	return -EIO;
}

struct delayed_iput {
	struct list_head list;
	struct inode *inode;
};

void btrfs_add_delayed_iput(struct inode *inode)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct delayed_iput *delayed;

	if (atomic_add_unless(&inode->i_count, -1, 1))
		return;

	delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
	delayed->inode = inode;

	spin_lock(&fs_info->delayed_iput_lock);
	list_add_tail(&delayed->list, &fs_info->delayed_iputs);
	spin_unlock(&fs_info->delayed_iput_lock);
}

void btrfs_run_delayed_iputs(struct btrfs_root *root)
{
	LIST_HEAD(list);
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct delayed_iput *delayed;
	int empty;

	spin_lock(&fs_info->delayed_iput_lock);
	empty = list_empty(&fs_info->delayed_iputs);
	spin_unlock(&fs_info->delayed_iput_lock);
	if (empty)
		return;

	down_read(&root->fs_info->cleanup_work_sem);
	spin_lock(&fs_info->delayed_iput_lock);
	list_splice_init(&fs_info->delayed_iputs, &list);
	spin_unlock(&fs_info->delayed_iput_lock);

	while (!list_empty(&list)) {
		delayed = list_entry(list.next, struct delayed_iput, list);
		list_del(&delayed->list);
		iput(delayed->inode);
		kfree(delayed);
	}
	up_read(&root->fs_info->cleanup_work_sem);
}

/*
 * calculate the extra metadata reservation needed when snapshotting a
 * subvolume that contains orphan files.
 */
void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
				struct btrfs_pending_snapshot *pending,
				u64 *bytes_to_reserve)
{
	struct btrfs_root *root;
	struct btrfs_block_rsv *block_rsv;
	u64 num_bytes;
	int index;

	root = pending->root;
	if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
		return;

	block_rsv = root->orphan_block_rsv;

	/* orphan block reservation for the snapshot */
	num_bytes = block_rsv->size;

	/*
	 * after the snapshot is created, COWing tree blocks may use more
	 * space than it frees. So we should make sure there is enough
	 * reserved space.
	 */
	index = trans->transid & 0x1;
	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
		num_bytes += block_rsv->size -
			     (block_rsv->reserved + block_rsv->freed[index]);
	}

	*bytes_to_reserve += num_bytes;
}

void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
				struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *root = pending->root;
	struct btrfs_root *snap = pending->snap;
	struct btrfs_block_rsv *block_rsv;
	u64 num_bytes;
	int index;
	int ret;

	if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
		return;

	/* refill source subvolume's orphan block reservation */
	block_rsv = root->orphan_block_rsv;
	index = trans->transid & 0x1;
	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
		num_bytes = block_rsv->size -
			    (block_rsv->reserved + block_rsv->freed[index]);
		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
					      root->orphan_block_rsv,
					      num_bytes);
		BUG_ON(ret);
	}

	/* setup orphan block reservation for the snapshot */
	block_rsv = btrfs_alloc_block_rsv(snap);
	BUG_ON(!block_rsv);

	btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
	snap->orphan_block_rsv = block_rsv;

	num_bytes = root->orphan_block_rsv->size;
	ret = btrfs_block_rsv_migrate(&pending->block_rsv,
				      block_rsv, num_bytes);
	BUG_ON(ret);

#if 0
	/* insert orphan item for the snapshot */
	WARN_ON(!root->orphan_item_inserted);
	ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
				       snap->root_key.objectid);
	BUG_ON(ret);
	snap->orphan_item_inserted = 1;
#endif
}

enum btrfs_orphan_cleanup_state {
	ORPHAN_CLEANUP_STARTED	= 1,
	ORPHAN_CLEANUP_DONE	= 2,
};

/*
 * This is called at transaction commit time. If there are no orphan
 * files in the subvolume, it removes the orphan item and frees the
 * block_rsv structure.
 */
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root)
{
	int ret;

	if (!list_empty(&root->orphan_list) ||
	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
		return;

	if (root->orphan_item_inserted &&
	    btrfs_root_refs(&root->root_item) > 0) {
		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
					    root->root_key.objectid);
		BUG_ON(ret);
		root->orphan_item_inserted = 0;
	}

	if (root->orphan_block_rsv) {
		WARN_ON(root->orphan_block_rsv->size > 0);
		btrfs_free_block_rsv(root, root->orphan_block_rsv);
		root->orphan_block_rsv = NULL;
	}
}

/*
 * This creates an orphan entry for the given inode in case something goes
 * wrong in the middle of an unlink/truncate.
 *
 * NOTE: the caller of this function should reserve 5 units of metadata.
 */
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *block_rsv = NULL;
	int reserve = 0;
	int insert = 0;
	int ret;

	if (!root->orphan_block_rsv) {
		block_rsv = btrfs_alloc_block_rsv(root);
		BUG_ON(!block_rsv);
	}

	spin_lock(&root->orphan_lock);
	if (!root->orphan_block_rsv) {
		root->orphan_block_rsv = block_rsv;
	} else if (block_rsv) {
		btrfs_free_block_rsv(root, block_rsv);
		block_rsv = NULL;
	}

	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
#if 0
		/*
		 * For proper ENOSPC handling, we should do orphan
		 * cleanup when mounting. But this introduces a backward
		 * compatibility issue.
		 */
		if (!xchg(&root->orphan_item_inserted, 1))
			insert = 2;
		else
			insert = 1;
#endif
		insert = 1;
	} else {
		WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
	}

	if (!BTRFS_I(inode)->orphan_meta_reserved) {
		BTRFS_I(inode)->orphan_meta_reserved = 1;
		reserve = 1;
	}
	spin_unlock(&root->orphan_lock);

	if (block_rsv)
		btrfs_add_durable_block_rsv(root->fs_info, block_rsv);

	/* grab metadata reservation from transaction handle */
	if (reserve) {
		ret = btrfs_orphan_reserve_metadata(trans, inode);
		BUG_ON(ret);
	}

	/* insert an orphan item to track this unlinked/truncated file */
	if (insert >= 1) {
		ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
		BUG_ON(ret);
	}

	/*
	 * insert an orphan item to track that the subvolume contains
	 * orphan files
	 */
	if (insert >= 2) {
		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
					       root->root_key.objectid);
		BUG_ON(ret);
	}
	return 0;
}

/*
 * We have done the truncate/delete so we can go ahead and remove the orphan
 * item for this particular inode.
 */
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int delete_item = 0;
	int release_rsv = 0;
	int ret = 0;

	spin_lock(&root->orphan_lock);
	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
		list_del_init(&BTRFS_I(inode)->i_orphan);
		delete_item = 1;
	}

	if (BTRFS_I(inode)->orphan_meta_reserved) {
		BTRFS_I(inode)->orphan_meta_reserved = 0;
		release_rsv = 1;
	}
	spin_unlock(&root->orphan_lock);

	if (trans && delete_item) {
		ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
		BUG_ON(ret);
	}

	if (release_rsv)
		btrfs_orphan_release_metadata(inode);

	return 0;
}

/*
 * this cleans up any orphans that may be left on the list from the last use
 * of this root.
 */
void btrfs_orphan_cleanup(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key, found_key;
	struct btrfs_trans_handle *trans;
	struct inode *inode;
	int ret = 0, nr_unlink = 0, nr_truncate = 0;

	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
		return;

	path = btrfs_alloc_path();
	BUG_ON(!path);
	path->reada = -1;

	key.objectid = BTRFS_ORPHAN_OBJECTID;
	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			printk(KERN_ERR "Error searching slot for orphan: %d\n",
			       ret);
			break;
		}

		/*
		 * ret == 0 means we found what we were searching for, which
		 * is weird, but possible, so only screw with path if we didn't
		 * find the key and see if we have stuff that matches
		 */
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		/* pull out the item */
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* make sure the item matches what we want */
		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
			break;
		if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
			break;

		/* release the path since we're done with it */
		btrfs_release_path(root, path);

		/*
		 * this is where we are basically btrfs_lookup, without the
		 * crossing root thing. we store the inode number in the
		 * offset of the orphan item.
		 */
		found_key.objectid = found_key.offset;
		found_key.type = BTRFS_INODE_ITEM_KEY;
		found_key.offset = 0;
		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
		BUG_ON(IS_ERR(inode));

		/*
		 * add this inode to the orphan list so btrfs_orphan_del does
		 * the proper thing when we hit it
		 */
		spin_lock(&root->orphan_lock);
		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
		spin_unlock(&root->orphan_lock);

		/*
		 * if this is a bad inode, means we actually succeeded in
		 * removing the inode, but not the orphan record, which means
		 * we need to manually delete the orphan since iput will just
		 * do a destroy_inode
		 */
		if (is_bad_inode(inode)) {
			trans = btrfs_start_transaction(root, 0);
			btrfs_orphan_del(trans, inode);
			btrfs_end_transaction(trans, root);
			iput(inode);
			continue;
		}

		/* if we have links, this was a truncate, let's do that */
		if (inode->i_nlink) {
			nr_truncate++;
			btrfs_truncate(inode);
		} else {
			nr_unlink++;
		}

		/* this will do delete_inode and everything for us */
		iput(inode);
	}
	btrfs_free_path(path);

	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;

	if (root->orphan_block_rsv)
		btrfs_block_rsv_release(root, root->orphan_block_rsv,
					(u64)-1);

	if (root->orphan_block_rsv || root->orphan_item_inserted) {
		trans = btrfs_join_transaction(root, 1);
		btrfs_end_transaction(trans, root);
	}

	if (nr_unlink)
		printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
	if (nr_truncate)
		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
}

/*
 * very simple check to peek ahead in the leaf looking for xattrs.
If we 2395 * don't find any xattrs, we know there can't be any acls. 2396 * 2397 * slot is the slot the inode is in, objectid is the objectid of the inode 2398 */ 2399 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 2400 int slot, u64 objectid) 2401 { 2402 u32 nritems = btrfs_header_nritems(leaf); 2403 struct btrfs_key found_key; 2404 int scanned = 0; 2405 2406 slot++; 2407 while (slot < nritems) { 2408 btrfs_item_key_to_cpu(leaf, &found_key, slot); 2409 2410 /* we found a different objectid, there must not be acls */ 2411 if (found_key.objectid != objectid) 2412 return 0; 2413 2414 /* we found an xattr, assume we've got an acl */ 2415 if (found_key.type == BTRFS_XATTR_ITEM_KEY) 2416 return 1; 2417 2418 /* 2419 * we found a key greater than an xattr key, there can't 2420 * be any acls later on 2421 */ 2422 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 2423 return 0; 2424 2425 slot++; 2426 scanned++; 2427 2428 /* 2429 * it goes inode, inode backrefs, xattrs, extents, 2430 * so if there are a ton of hard links to an inode there can 2431 * be a lot of backrefs. Don't waste time searching too hard, 2432 * this is just an optimization 2433 */ 2434 if (scanned >= 8) 2435 break; 2436 } 2437 /* we hit the end of the leaf before we found an xattr or 2438 * something larger than an xattr. We have to assume the inode 2439 * has acls 2440 */ 2441 return 1; 2442 } 2443 2444 /* 2445 * read an inode from the btree into the in-memory inode 2446 */ 2447 static void btrfs_read_locked_inode(struct inode *inode) 2448 { 2449 struct btrfs_path *path; 2450 struct extent_buffer *leaf; 2451 struct btrfs_inode_item *inode_item; 2452 struct btrfs_timespec *tspec; 2453 struct btrfs_root *root = BTRFS_I(inode)->root; 2454 struct btrfs_key location; 2455 int maybe_acls; 2456 u64 alloc_group_block; 2457 u32 rdev; 2458 int ret; 2459 2460 path = btrfs_alloc_path(); 2461 BUG_ON(!path); 2462 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2463 2464 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 2465 if (ret) 2466 goto make_bad; 2467 2468 leaf = path->nodes[0]; 2469 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2470 struct btrfs_inode_item); 2471 2472 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2473 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); 2474 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2475 inode->i_gid = btrfs_inode_gid(leaf, inode_item); 2476 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 2477 2478 tspec = btrfs_inode_atime(inode_item); 2479 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2480 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2481 2482 tspec = btrfs_inode_mtime(inode_item); 2483 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2484 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2485 2486 tspec = btrfs_inode_ctime(inode_item); 2487 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2488 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2489 2490 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2491 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2492 BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); 2493 inode->i_generation = BTRFS_I(inode)->generation; 2494 inode->i_rdev = 0; 2495 rdev = btrfs_inode_rdev(leaf, inode_item); 2496 2497 BTRFS_I(inode)->index_cnt = (u64)-1; 2498 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2499 2500 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 
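	/*
	 * the inode item only stores a block group hint; it is turned
	 * into a real block group pointer by btrfs_find_block_group()
	 * below
	 */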
2501 2502 /* 2503 * try to precache a NULL acl entry for files that don't have 2504 * any xattrs or acls 2505 */ 2506 maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); 2507 if (!maybe_acls) 2508 cache_no_acl(inode); 2509 2510 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2511 alloc_group_block, 0); 2512 btrfs_free_path(path); 2513 inode_item = NULL; 2514 2515 switch (inode->i_mode & S_IFMT) { 2516 case S_IFREG: 2517 inode->i_mapping->a_ops = &btrfs_aops; 2518 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2519 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 2520 inode->i_fop = &btrfs_file_operations; 2521 inode->i_op = &btrfs_file_inode_operations; 2522 break; 2523 case S_IFDIR: 2524 inode->i_fop = &btrfs_dir_file_operations; 2525 if (root == root->fs_info->tree_root) 2526 inode->i_op = &btrfs_dir_ro_inode_operations; 2527 else 2528 inode->i_op = &btrfs_dir_inode_operations; 2529 break; 2530 case S_IFLNK: 2531 inode->i_op = &btrfs_symlink_inode_operations; 2532 inode->i_mapping->a_ops = &btrfs_symlink_aops; 2533 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2534 break; 2535 default: 2536 inode->i_op = &btrfs_special_inode_operations; 2537 init_special_inode(inode, inode->i_mode, rdev); 2538 break; 2539 } 2540 2541 btrfs_update_iflags(inode); 2542 return; 2543 2544 make_bad: 2545 btrfs_free_path(path); 2546 make_bad_inode(inode); 2547 } 2548 2549 /* 2550 * given a leaf and an inode, copy the inode fields into the leaf 2551 */ 2552 static void fill_inode_item(struct btrfs_trans_handle *trans, 2553 struct extent_buffer *leaf, 2554 struct btrfs_inode_item *item, 2555 struct inode *inode) 2556 { 2557 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2558 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2559 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2560 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2561 btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 2562 2563 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), 2564 inode->i_atime.tv_sec); 2565 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), 2566 inode->i_atime.tv_nsec); 2567 2568 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), 2569 inode->i_mtime.tv_sec); 2570 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), 2571 inode->i_mtime.tv_nsec); 2572 2573 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), 2574 inode->i_ctime.tv_sec); 2575 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), 2576 inode->i_ctime.tv_nsec); 2577 2578 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2579 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 2580 btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); 2581 btrfs_set_inode_transid(leaf, item, trans->transid); 2582 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2583 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2584 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); 2585 } 2586 2587 /* 2588 * copy everything in the in-memory inode into the btree. 
 */
noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode)
{
	struct btrfs_inode_item *inode_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	BUG_ON(!path);
	path->leave_spinning = 1;
	ret = btrfs_lookup_inode(trans, root, path,
				 &BTRFS_I(inode)->location, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto failed;
	}

	btrfs_unlock_up_safe(path, 1);
	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);

	fill_inode_item(trans, leaf, inode_item, inode);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_set_inode_last_trans(trans, inode);
	ret = 0;
failed:
	btrfs_free_path(path);
	return ret;
}


/*
 * unlink helper that gets used here in inode.c and in the tree logging
 * recovery code. It removes a link in a directory with a given name, and
 * also drops the back refs in the inode to the directory
 */
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       struct inode *dir, struct inode *inode,
		       const char *name, int name_len)
{
	struct btrfs_path *path;
	int ret = 0;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	u64 index;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto err;
	}

	path->leave_spinning = 1;
	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
				   name, name_len, -1);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto err;
	}
	if (!di) {
		ret = -ENOENT;
		goto err;
	}
	leaf = path->nodes[0];
	btrfs_dir_item_key_to_cpu(leaf, di, &key);
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
	if (ret)
		goto err;
	btrfs_release_path(root, path);

	ret = btrfs_del_inode_ref(trans, root, name, name_len,
				  inode->i_ino,
				  dir->i_ino, &index);
	if (ret) {
		printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
		       "inode %lu parent %lu\n", name_len, name,
		       inode->i_ino, dir->i_ino);
		goto err;
	}

	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
					 index, name, name_len, -1);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto err;
	}
	if (!di) {
		ret = -ENOENT;
		goto err;
	}
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
	btrfs_release_path(root, path);

	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
					 inode, dir->i_ino);
	BUG_ON(ret != 0 && ret != -ENOENT);

	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
					   dir, index);
	if (ret == -ENOENT)
		ret = 0;
err:
	btrfs_free_path(path);
	if (ret)
		goto out;

	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
	inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
	btrfs_update_inode(trans, root, dir);
	btrfs_drop_nlink(inode);
	ret = btrfs_update_inode(trans, root, inode);
out:
	return ret;
}

/* helper to check if there is any shared block in the path */
static int check_path_shared(struct btrfs_root *root,
			     struct btrfs_path *path)
{
	struct extent_buffer *eb;
	int level;
	u64 refs = 1;
	int uninitialized_var(ret);

	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		eb = path->nodes[level];
		if (!btrfs_block_can_be_shared(root, eb))
			continue;
		ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
					       &refs, NULL);
		if (refs > 1)
			return 1;
	}
	return ret; /* XXX callers? */
}

/*
 * helper to start a transaction for unlink and rmdir.
 *
 * unlink and rmdir are special in btrfs, they do not always free space.
 * so in enospc case, we should make sure they will free space before
 * allowing them to use the global metadata reservation.
 */
static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
						       struct dentry *dentry)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	struct btrfs_dir_item *di;
	struct inode *inode = dentry->d_inode;
	u64 index;
	int check_link = 1;
	int err = -ENOSPC;
	int ret;

	trans = btrfs_start_transaction(root, 10);
	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
		return trans;

	if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
		return ERR_PTR(-ENOSPC);

	/* check if someone else holds a reference */
	if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
		return ERR_PTR(-ENOSPC);

	if (atomic_read(&inode->i_count) > 2)
		return ERR_PTR(-ENOSPC);

	if (xchg(&root->fs_info->enospc_unlink, 1))
		return ERR_PTR(-ENOSPC);

	path = btrfs_alloc_path();
	if (!path) {
		root->fs_info->enospc_unlink = 0;
		return ERR_PTR(-ENOMEM);
	}

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		root->fs_info->enospc_unlink = 0;
		return trans;
	}

	path->skip_locking = 1;
	path->search_commit_root = 1;

	ret = btrfs_lookup_inode(trans, root, path,
				 &BTRFS_I(dir)->location, 0);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	if (ret == 0) {
		if (check_path_shared(root, path))
			goto out;
	} else {
		check_link = 0;
	}
	btrfs_release_path(root, path);

	ret = btrfs_lookup_inode(trans, root, path,
				 &BTRFS_I(inode)->location, 0);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	if (ret == 0) {
		if (check_path_shared(root, path))
			goto out;
	} else {
		check_link = 0;
	}
	btrfs_release_path(root, path);

	if (ret == 0 && S_ISREG(inode->i_mode)) {
		ret = btrfs_lookup_file_extent(trans, root, path,
					       inode->i_ino, (u64)-1, 0);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		BUG_ON(ret == 0);
		if (check_path_shared(root, path))
			goto out;
		btrfs_release_path(root, path);
	}

	if (!check_link) {
		err = 0;
		goto out;
	}

	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
				   dentry->d_name.name, dentry->d_name.len, 0);
	if (IS_ERR(di)) {
		err = PTR_ERR(di);
		goto out;
	}
	if (di) {
		if (check_path_shared(root, path))
			goto out;
	} else {
		err = 0;
		goto out;
	}
	btrfs_release_path(root, path);

	ref = btrfs_lookup_inode_ref(trans, root, path,
				     dentry->d_name.name, dentry->d_name.len,
				     inode->i_ino, dir->i_ino, 0);
	if (IS_ERR(ref)) {
		err = PTR_ERR(ref);
		goto out;
	}
	BUG_ON(!ref);
	if
(check_path_shared(root, path)) 2857 goto out; 2858 index = btrfs_inode_ref_index(path->nodes[0], ref); 2859 btrfs_release_path(root, path); 2860 2861 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index, 2862 dentry->d_name.name, dentry->d_name.len, 0); 2863 if (IS_ERR(di)) { 2864 err = PTR_ERR(di); 2865 goto out; 2866 } 2867 BUG_ON(ret == -ENOENT); 2868 if (check_path_shared(root, path)) 2869 goto out; 2870 2871 err = 0; 2872 out: 2873 btrfs_free_path(path); 2874 if (err) { 2875 btrfs_end_transaction(trans, root); 2876 root->fs_info->enospc_unlink = 0; 2877 return ERR_PTR(err); 2878 } 2879 2880 trans->block_rsv = &root->fs_info->global_block_rsv; 2881 return trans; 2882 } 2883 2884 static void __unlink_end_trans(struct btrfs_trans_handle *trans, 2885 struct btrfs_root *root) 2886 { 2887 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 2888 BUG_ON(!root->fs_info->enospc_unlink); 2889 root->fs_info->enospc_unlink = 0; 2890 } 2891 btrfs_end_transaction_throttle(trans, root); 2892 } 2893 2894 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2895 { 2896 struct btrfs_root *root = BTRFS_I(dir)->root; 2897 struct btrfs_trans_handle *trans; 2898 struct inode *inode = dentry->d_inode; 2899 int ret; 2900 unsigned long nr = 0; 2901 2902 trans = __unlink_start_trans(dir, dentry); 2903 if (IS_ERR(trans)) 2904 return PTR_ERR(trans); 2905 2906 btrfs_set_trans_block_group(trans, dir); 2907 2908 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 2909 2910 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2911 dentry->d_name.name, dentry->d_name.len); 2912 BUG_ON(ret); 2913 2914 if (inode->i_nlink == 0) { 2915 ret = btrfs_orphan_add(trans, inode); 2916 BUG_ON(ret); 2917 } 2918 2919 nr = trans->blocks_used; 2920 __unlink_end_trans(trans, root); 2921 btrfs_btree_balance_dirty(root, nr); 2922 return ret; 2923 } 2924 2925 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 2926 struct btrfs_root *root, 2927 struct inode *dir, u64 objectid, 2928 const char *name, int name_len) 2929 { 2930 struct btrfs_path *path; 2931 struct extent_buffer *leaf; 2932 struct btrfs_dir_item *di; 2933 struct btrfs_key key; 2934 u64 index; 2935 int ret; 2936 2937 path = btrfs_alloc_path(); 2938 if (!path) 2939 return -ENOMEM; 2940 2941 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2942 name, name_len, -1); 2943 BUG_ON(!di || IS_ERR(di)); 2944 2945 leaf = path->nodes[0]; 2946 btrfs_dir_item_key_to_cpu(leaf, di, &key); 2947 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 2948 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2949 BUG_ON(ret); 2950 btrfs_release_path(root, path); 2951 2952 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, 2953 objectid, root->root_key.objectid, 2954 dir->i_ino, &index, name, name_len); 2955 if (ret < 0) { 2956 BUG_ON(ret != -ENOENT); 2957 di = btrfs_search_dir_index_item(root, path, dir->i_ino, 2958 name, name_len); 2959 BUG_ON(!di || IS_ERR(di)); 2960 2961 leaf = path->nodes[0]; 2962 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2963 btrfs_release_path(root, path); 2964 index = key.offset; 2965 } 2966 2967 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 2968 index, name, name_len, -1); 2969 BUG_ON(!di || IS_ERR(di)); 2970 2971 leaf = path->nodes[0]; 2972 btrfs_dir_item_key_to_cpu(leaf, di, &key); 2973 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 2974 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2975 BUG_ON(ret); 2976 
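	/*
	 * both the dir item and the dir index item are gone; release the
	 * path and update the parent directory's size and timestamps
	 */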
btrfs_release_path(root, path); 2977 2978 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2979 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2980 ret = btrfs_update_inode(trans, root, dir); 2981 BUG_ON(ret); 2982 2983 btrfs_free_path(path); 2984 return 0; 2985 } 2986 2987 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 2988 { 2989 struct inode *inode = dentry->d_inode; 2990 int err = 0; 2991 struct btrfs_root *root = BTRFS_I(dir)->root; 2992 struct btrfs_trans_handle *trans; 2993 unsigned long nr = 0; 2994 2995 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 2996 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2997 return -ENOTEMPTY; 2998 2999 trans = __unlink_start_trans(dir, dentry); 3000 if (IS_ERR(trans)) 3001 return PTR_ERR(trans); 3002 3003 btrfs_set_trans_block_group(trans, dir); 3004 3005 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 3006 err = btrfs_unlink_subvol(trans, root, dir, 3007 BTRFS_I(inode)->location.objectid, 3008 dentry->d_name.name, 3009 dentry->d_name.len); 3010 goto out; 3011 } 3012 3013 err = btrfs_orphan_add(trans, inode); 3014 if (err) 3015 goto out; 3016 3017 /* now the directory is empty */ 3018 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3019 dentry->d_name.name, dentry->d_name.len); 3020 if (!err) 3021 btrfs_i_size_write(inode, 0); 3022 out: 3023 nr = trans->blocks_used; 3024 __unlink_end_trans(trans, root); 3025 btrfs_btree_balance_dirty(root, nr); 3026 3027 return err; 3028 } 3029 3030 #if 0 3031 /* 3032 * when truncating bytes in a file, it is possible to avoid reading 3033 * the leaves that contain only checksum items. This can be the 3034 * majority of the IO required to delete a large file, but it must 3035 * be done carefully. 3036 * 3037 * The keys in the level just above the leaves are checked to make sure 3038 * the lowest key in a given leaf is a csum key, and starts at an offset 3039 * after the new size. 3040 * 3041 * Then the key for the next leaf is checked to make sure it also has 3042 * a checksum item for the same file. If it does, we know our target leaf 3043 * contains only checksum items, and it can be safely freed without reading 3044 * it. 3045 * 3046 * This is just an optimization targeted at large files. It may do 3047 * nothing. It will return 0 unless things went badly. 3048 */ 3049 static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans, 3050 struct btrfs_root *root, 3051 struct btrfs_path *path, 3052 struct inode *inode, u64 new_size) 3053 { 3054 struct btrfs_key key; 3055 int ret; 3056 int nritems; 3057 struct btrfs_key found_key; 3058 struct btrfs_key other_key; 3059 struct btrfs_leaf_ref *ref; 3060 u64 leaf_gen; 3061 u64 leaf_start; 3062 3063 path->lowest_level = 1; 3064 key.objectid = inode->i_ino; 3065 key.type = BTRFS_CSUM_ITEM_KEY; 3066 key.offset = new_size; 3067 again: 3068 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3069 if (ret < 0) 3070 goto out; 3071 3072 if (path->nodes[1] == NULL) { 3073 ret = 0; 3074 goto out; 3075 } 3076 ret = 0; 3077 btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]); 3078 nritems = btrfs_header_nritems(path->nodes[1]); 3079 3080 if (!nritems) 3081 goto out; 3082 3083 if (path->slots[1] >= nritems) 3084 goto next_node; 3085 3086 /* did we find a key greater than anything we want to delete? 
 */
	if (found_key.objectid > inode->i_ino ||
	    (found_key.objectid == inode->i_ino && found_key.type > key.type))
		goto out;

	/* we check the next key in the node to make sure the leaf contains
	 * only checksum items. This comparison doesn't work if our
	 * leaf is the last one in the node
	 */
	if (path->slots[1] + 1 >= nritems) {
next_node:
		/* search forward from the last key in the node, this
		 * will bring us into the next node in the tree
		 */
		btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);

		/* unlikely, but we inc below, so check to be safe */
		if (found_key.offset == (u64)-1)
			goto out;

		/* search_forward needs a path with locks held, do the
		 * search again for the original key. It is possible
		 * this will race with a balance and return a path that
		 * we could modify, but this drop is just an optimization
		 * and is allowed to miss some leaves.
		 */
		btrfs_release_path(root, path);
		found_key.offset++;

		/* setup a max key for search_forward */
		other_key.offset = (u64)-1;
		other_key.type = key.type;
		other_key.objectid = key.objectid;

		path->keep_locks = 1;
		ret = btrfs_search_forward(root, &found_key, &other_key,
					   path, 0, 0);
		path->keep_locks = 0;
		if (ret || found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			ret = 0;
			goto out;
		}

		key.offset = found_key.offset;
		btrfs_release_path(root, path);
		cond_resched();
		goto again;
	}

	/* we know there's one more slot after us in the tree,
	 * read that key so we can verify it is also a checksum item
	 */
	btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);

	if (found_key.objectid < inode->i_ino)
		goto next_key;

	if (found_key.type != key.type || found_key.offset < new_size)
		goto next_key;

	/*
	 * if the key for the next leaf isn't a csum key from this objectid,
	 * we can't be sure there aren't good items inside this leaf.
3150 * Bail out 3151 */ 3152 if (other_key.objectid != inode->i_ino || other_key.type != key.type) 3153 goto out; 3154 3155 leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]); 3156 leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]); 3157 /* 3158 * it is safe to delete this leaf, it contains only 3159 * csum items from this inode at an offset >= new_size 3160 */ 3161 ret = btrfs_del_leaf(trans, root, path, leaf_start); 3162 BUG_ON(ret); 3163 3164 if (root->ref_cows && leaf_gen < trans->transid) { 3165 ref = btrfs_alloc_leaf_ref(root, 0); 3166 if (ref) { 3167 ref->root_gen = root->root_key.offset; 3168 ref->bytenr = leaf_start; 3169 ref->owner = 0; 3170 ref->generation = leaf_gen; 3171 ref->nritems = 0; 3172 3173 btrfs_sort_leaf_ref(ref); 3174 3175 ret = btrfs_add_leaf_ref(root, ref, 0); 3176 WARN_ON(ret); 3177 btrfs_free_leaf_ref(root, ref); 3178 } else { 3179 WARN_ON(1); 3180 } 3181 } 3182 next_key: 3183 btrfs_release_path(root, path); 3184 3185 if (other_key.objectid == inode->i_ino && 3186 other_key.type == key.type && other_key.offset > key.offset) { 3187 key.offset = other_key.offset; 3188 cond_resched(); 3189 goto again; 3190 } 3191 ret = 0; 3192 out: 3193 /* fixup any changes we've made to the path */ 3194 path->lowest_level = 0; 3195 path->keep_locks = 0; 3196 btrfs_release_path(root, path); 3197 return ret; 3198 } 3199 3200 #endif 3201 3202 /* 3203 * this can truncate away extent items, csum items and directory items. 3204 * It starts at a high offset and removes keys until it can't find 3205 * any higher than new_size 3206 * 3207 * csum items that cross the new i_size are truncated to the new size 3208 * as well. 3209 * 3210 * min_type is the minimum key type to truncate down to. If set to 0, this 3211 * will kill all the items on this inode, including the INODE_ITEM_KEY. 
3212 */ 3213 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 3214 struct btrfs_root *root, 3215 struct inode *inode, 3216 u64 new_size, u32 min_type) 3217 { 3218 struct btrfs_path *path; 3219 struct extent_buffer *leaf; 3220 struct btrfs_file_extent_item *fi; 3221 struct btrfs_key key; 3222 struct btrfs_key found_key; 3223 u64 extent_start = 0; 3224 u64 extent_num_bytes = 0; 3225 u64 extent_offset = 0; 3226 u64 item_end = 0; 3227 u64 mask = root->sectorsize - 1; 3228 u32 found_type = (u8)-1; 3229 int found_extent; 3230 int del_item; 3231 int pending_del_nr = 0; 3232 int pending_del_slot = 0; 3233 int extent_type = -1; 3234 int encoding; 3235 int ret; 3236 int err = 0; 3237 3238 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3239 3240 if (root->ref_cows || root == root->fs_info->tree_root) 3241 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3242 3243 path = btrfs_alloc_path(); 3244 BUG_ON(!path); 3245 path->reada = -1; 3246 3247 key.objectid = inode->i_ino; 3248 key.offset = (u64)-1; 3249 key.type = (u8)-1; 3250 3251 search_again: 3252 path->leave_spinning = 1; 3253 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3254 if (ret < 0) { 3255 err = ret; 3256 goto out; 3257 } 3258 3259 if (ret > 0) { 3260 /* there are no items in the tree for us to truncate, we're 3261 * done 3262 */ 3263 if (path->slots[0] == 0) 3264 goto out; 3265 path->slots[0]--; 3266 } 3267 3268 while (1) { 3269 fi = NULL; 3270 leaf = path->nodes[0]; 3271 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3272 found_type = btrfs_key_type(&found_key); 3273 encoding = 0; 3274 3275 if (found_key.objectid != inode->i_ino) 3276 break; 3277 3278 if (found_type < min_type) 3279 break; 3280 3281 item_end = found_key.offset; 3282 if (found_type == BTRFS_EXTENT_DATA_KEY) { 3283 fi = btrfs_item_ptr(leaf, path->slots[0], 3284 struct btrfs_file_extent_item); 3285 extent_type = btrfs_file_extent_type(leaf, fi); 3286 encoding = btrfs_file_extent_compression(leaf, fi); 3287 encoding |= btrfs_file_extent_encryption(leaf, fi); 3288 encoding |= btrfs_file_extent_other_encoding(leaf, fi); 3289 3290 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3291 item_end += 3292 btrfs_file_extent_num_bytes(leaf, fi); 3293 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3294 item_end += btrfs_file_extent_inline_len(leaf, 3295 fi); 3296 } 3297 item_end--; 3298 } 3299 if (found_type > min_type) { 3300 del_item = 1; 3301 } else { 3302 if (item_end < new_size) 3303 break; 3304 if (found_key.offset >= new_size) 3305 del_item = 1; 3306 else 3307 del_item = 0; 3308 } 3309 found_extent = 0; 3310 /* FIXME, shrink the extent if the ref count is only 1 */ 3311 if (found_type != BTRFS_EXTENT_DATA_KEY) 3312 goto delete; 3313 3314 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3315 u64 num_dec; 3316 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 3317 if (!del_item && !encoding) { 3318 u64 orig_num_bytes = 3319 btrfs_file_extent_num_bytes(leaf, fi); 3320 extent_num_bytes = new_size - 3321 found_key.offset + root->sectorsize - 1; 3322 extent_num_bytes = extent_num_bytes & 3323 ~((u64)root->sectorsize - 1); 3324 btrfs_set_file_extent_num_bytes(leaf, fi, 3325 extent_num_bytes); 3326 num_dec = (orig_num_bytes - 3327 extent_num_bytes); 3328 if (root->ref_cows && extent_start != 0) 3329 inode_sub_bytes(inode, num_dec); 3330 btrfs_mark_buffer_dirty(leaf); 3331 } else { 3332 extent_num_bytes = 3333 btrfs_file_extent_disk_num_bytes(leaf, 3334 fi); 3335 extent_offset = found_key.offset - 3336 
btrfs_file_extent_offset(leaf, fi); 3337 3338 /* FIXME blocksize != 4096 */ 3339 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 3340 if (extent_start != 0) { 3341 found_extent = 1; 3342 if (root->ref_cows) 3343 inode_sub_bytes(inode, num_dec); 3344 } 3345 } 3346 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3347 /* 3348 * we can't truncate inline items that have had 3349 * special encodings 3350 */ 3351 if (!del_item && 3352 btrfs_file_extent_compression(leaf, fi) == 0 && 3353 btrfs_file_extent_encryption(leaf, fi) == 0 && 3354 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 3355 u32 size = new_size - found_key.offset; 3356 3357 if (root->ref_cows) { 3358 inode_sub_bytes(inode, item_end + 1 - 3359 new_size); 3360 } 3361 size = 3362 btrfs_file_extent_calc_inline_size(size); 3363 ret = btrfs_truncate_item(trans, root, path, 3364 size, 1); 3365 BUG_ON(ret); 3366 } else if (root->ref_cows) { 3367 inode_sub_bytes(inode, item_end + 1 - 3368 found_key.offset); 3369 } 3370 } 3371 delete: 3372 if (del_item) { 3373 if (!pending_del_nr) { 3374 /* no pending yet, add ourselves */ 3375 pending_del_slot = path->slots[0]; 3376 pending_del_nr = 1; 3377 } else if (pending_del_nr && 3378 path->slots[0] + 1 == pending_del_slot) { 3379 /* hop on the pending chunk */ 3380 pending_del_nr++; 3381 pending_del_slot = path->slots[0]; 3382 } else { 3383 BUG(); 3384 } 3385 } else { 3386 break; 3387 } 3388 if (found_extent && (root->ref_cows || 3389 root == root->fs_info->tree_root)) { 3390 btrfs_set_path_blocking(path); 3391 ret = btrfs_free_extent(trans, root, extent_start, 3392 extent_num_bytes, 0, 3393 btrfs_header_owner(leaf), 3394 inode->i_ino, extent_offset); 3395 BUG_ON(ret); 3396 } 3397 3398 if (found_type == BTRFS_INODE_ITEM_KEY) 3399 break; 3400 3401 if (path->slots[0] == 0 || 3402 path->slots[0] != pending_del_slot) { 3403 if (root->ref_cows) { 3404 err = -EAGAIN; 3405 goto out; 3406 } 3407 if (pending_del_nr) { 3408 ret = btrfs_del_items(trans, root, path, 3409 pending_del_slot, 3410 pending_del_nr); 3411 BUG_ON(ret); 3412 pending_del_nr = 0; 3413 } 3414 btrfs_release_path(root, path); 3415 goto search_again; 3416 } else { 3417 path->slots[0]--; 3418 } 3419 } 3420 out: 3421 if (pending_del_nr) { 3422 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3423 pending_del_nr); 3424 BUG_ON(ret); 3425 } 3426 btrfs_free_path(path); 3427 return err; 3428 } 3429 3430 /* 3431 * taken from block_truncate_page, but does cow as it zeros out 3432 * any bytes left in the last page in the file. 
3433 */ 3434 static int btrfs_truncate_page(struct address_space *mapping, loff_t from) 3435 { 3436 struct inode *inode = mapping->host; 3437 struct btrfs_root *root = BTRFS_I(inode)->root; 3438 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3439 struct btrfs_ordered_extent *ordered; 3440 struct extent_state *cached_state = NULL; 3441 char *kaddr; 3442 u32 blocksize = root->sectorsize; 3443 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3444 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3445 struct page *page; 3446 int ret = 0; 3447 u64 page_start; 3448 u64 page_end; 3449 3450 if ((offset & (blocksize - 1)) == 0) 3451 goto out; 3452 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 3453 if (ret) 3454 goto out; 3455 3456 ret = -ENOMEM; 3457 again: 3458 page = grab_cache_page(mapping, index); 3459 if (!page) { 3460 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3461 goto out; 3462 } 3463 3464 page_start = page_offset(page); 3465 page_end = page_start + PAGE_CACHE_SIZE - 1; 3466 3467 if (!PageUptodate(page)) { 3468 ret = btrfs_readpage(NULL, page); 3469 lock_page(page); 3470 if (page->mapping != mapping) { 3471 unlock_page(page); 3472 page_cache_release(page); 3473 goto again; 3474 } 3475 if (!PageUptodate(page)) { 3476 ret = -EIO; 3477 goto out_unlock; 3478 } 3479 } 3480 wait_on_page_writeback(page); 3481 3482 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, 3483 GFP_NOFS); 3484 set_page_extent_mapped(page); 3485 3486 ordered = btrfs_lookup_ordered_extent(inode, page_start); 3487 if (ordered) { 3488 unlock_extent_cached(io_tree, page_start, page_end, 3489 &cached_state, GFP_NOFS); 3490 unlock_page(page); 3491 page_cache_release(page); 3492 btrfs_start_ordered_extent(inode, ordered, 1); 3493 btrfs_put_ordered_extent(ordered); 3494 goto again; 3495 } 3496 3497 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 3498 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3499 0, 0, &cached_state, GFP_NOFS); 3500 3501 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 3502 &cached_state); 3503 if (ret) { 3504 unlock_extent_cached(io_tree, page_start, page_end, 3505 &cached_state, GFP_NOFS); 3506 goto out_unlock; 3507 } 3508 3509 ret = 0; 3510 if (offset != PAGE_CACHE_SIZE) { 3511 kaddr = kmap(page); 3512 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 3513 flush_dcache_page(page); 3514 kunmap(page); 3515 } 3516 ClearPageChecked(page); 3517 set_page_dirty(page); 3518 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, 3519 GFP_NOFS); 3520 3521 out_unlock: 3522 if (ret) 3523 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3524 unlock_page(page); 3525 page_cache_release(page); 3526 out: 3527 return ret; 3528 } 3529 3530 int btrfs_cont_expand(struct inode *inode, loff_t size) 3531 { 3532 struct btrfs_trans_handle *trans; 3533 struct btrfs_root *root = BTRFS_I(inode)->root; 3534 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3535 struct extent_map *em = NULL; 3536 struct extent_state *cached_state = NULL; 3537 u64 mask = root->sectorsize - 1; 3538 u64 hole_start = (inode->i_size + mask) & ~mask; 3539 u64 block_end = (size + mask) & ~mask; 3540 u64 last_byte; 3541 u64 cur_offset; 3542 u64 hole_size; 3543 int err = 0; 3544 3545 if (size <= hole_start) 3546 return 0; 3547 3548 while (1) { 3549 struct btrfs_ordered_extent *ordered; 3550 btrfs_wait_ordered_range(inode, hole_start, 3551 block_end - hole_start); 3552 lock_extent_bits(io_tree, hole_start, block_end - 1, 0, 3553 &cached_state, GFP_NOFS); 3554 
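		/*
		 * an ordered extent may have been started after we flushed
		 * the range above; if one snuck in before we took the lock,
		 * drop the lock and wait for it
		 */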
ordered = btrfs_lookup_ordered_extent(inode, hole_start); 3555 if (!ordered) 3556 break; 3557 unlock_extent_cached(io_tree, hole_start, block_end - 1, 3558 &cached_state, GFP_NOFS); 3559 btrfs_put_ordered_extent(ordered); 3560 } 3561 3562 cur_offset = hole_start; 3563 while (1) { 3564 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 3565 block_end - cur_offset, 0); 3566 BUG_ON(IS_ERR(em) || !em); 3567 last_byte = min(extent_map_end(em), block_end); 3568 last_byte = (last_byte + mask) & ~mask; 3569 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3570 u64 hint_byte = 0; 3571 hole_size = last_byte - cur_offset; 3572 3573 trans = btrfs_start_transaction(root, 2); 3574 if (IS_ERR(trans)) { 3575 err = PTR_ERR(trans); 3576 break; 3577 } 3578 btrfs_set_trans_block_group(trans, inode); 3579 3580 err = btrfs_drop_extents(trans, inode, cur_offset, 3581 cur_offset + hole_size, 3582 &hint_byte, 1); 3583 BUG_ON(err); 3584 3585 err = btrfs_insert_file_extent(trans, root, 3586 inode->i_ino, cur_offset, 0, 3587 0, hole_size, 0, hole_size, 3588 0, 0, 0); 3589 BUG_ON(err); 3590 3591 btrfs_drop_extent_cache(inode, hole_start, 3592 last_byte - 1, 0); 3593 3594 btrfs_end_transaction(trans, root); 3595 } 3596 free_extent_map(em); 3597 em = NULL; 3598 cur_offset = last_byte; 3599 if (cur_offset >= block_end) 3600 break; 3601 } 3602 3603 free_extent_map(em); 3604 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3605 GFP_NOFS); 3606 return err; 3607 } 3608 3609 static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) 3610 { 3611 struct btrfs_root *root = BTRFS_I(inode)->root; 3612 struct btrfs_trans_handle *trans; 3613 unsigned long nr; 3614 int ret; 3615 3616 if (attr->ia_size == inode->i_size) 3617 return 0; 3618 3619 if (attr->ia_size > inode->i_size) { 3620 unsigned long limit; 3621 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 3622 if (attr->ia_size > inode->i_sb->s_maxbytes) 3623 return -EFBIG; 3624 if (limit != RLIM_INFINITY && attr->ia_size > limit) { 3625 send_sig(SIGXFSZ, current, 0); 3626 return -EFBIG; 3627 } 3628 } 3629 3630 trans = btrfs_start_transaction(root, 5); 3631 if (IS_ERR(trans)) 3632 return PTR_ERR(trans); 3633 3634 btrfs_set_trans_block_group(trans, inode); 3635 3636 ret = btrfs_orphan_add(trans, inode); 3637 BUG_ON(ret); 3638 3639 nr = trans->blocks_used; 3640 btrfs_end_transaction(trans, root); 3641 btrfs_btree_balance_dirty(root, nr); 3642 3643 if (attr->ia_size > inode->i_size) { 3644 ret = btrfs_cont_expand(inode, attr->ia_size); 3645 if (ret) { 3646 btrfs_truncate(inode); 3647 return ret; 3648 } 3649 3650 i_size_write(inode, attr->ia_size); 3651 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3652 3653 trans = btrfs_start_transaction(root, 0); 3654 BUG_ON(IS_ERR(trans)); 3655 btrfs_set_trans_block_group(trans, inode); 3656 trans->block_rsv = root->orphan_block_rsv; 3657 BUG_ON(!trans->block_rsv); 3658 3659 ret = btrfs_update_inode(trans, root, inode); 3660 BUG_ON(ret); 3661 if (inode->i_nlink > 0) { 3662 ret = btrfs_orphan_del(trans, inode); 3663 BUG_ON(ret); 3664 } 3665 nr = trans->blocks_used; 3666 btrfs_end_transaction(trans, root); 3667 btrfs_btree_balance_dirty(root, nr); 3668 return 0; 3669 } 3670 3671 /* 3672 * We're truncating a file that used to have good data down to 3673 * zero. Make sure it gets into the ordered flush list so that 3674 * any new writes get down to disk quickly. 
3675 */ 3676 if (attr->ia_size == 0) 3677 BTRFS_I(inode)->ordered_data_close = 1; 3678 3679 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 3680 ret = vmtruncate(inode, attr->ia_size); 3681 BUG_ON(ret); 3682 3683 return 0; 3684 } 3685 3686 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3687 { 3688 struct inode *inode = dentry->d_inode; 3689 struct btrfs_root *root = BTRFS_I(inode)->root; 3690 int err; 3691 3692 if (btrfs_root_readonly(root)) 3693 return -EROFS; 3694 3695 err = inode_change_ok(inode, attr); 3696 if (err) 3697 return err; 3698 3699 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3700 err = btrfs_setattr_size(inode, attr); 3701 if (err) 3702 return err; 3703 } 3704 3705 if (attr->ia_valid) { 3706 setattr_copy(inode, attr); 3707 mark_inode_dirty(inode); 3708 3709 if (attr->ia_valid & ATTR_MODE) 3710 err = btrfs_acl_chmod(inode); 3711 } 3712 3713 return err; 3714 } 3715 3716 void btrfs_evict_inode(struct inode *inode) 3717 { 3718 struct btrfs_trans_handle *trans; 3719 struct btrfs_root *root = BTRFS_I(inode)->root; 3720 unsigned long nr; 3721 int ret; 3722 3723 truncate_inode_pages(&inode->i_data, 0); 3724 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3725 root == root->fs_info->tree_root)) 3726 goto no_delete; 3727 3728 if (is_bad_inode(inode)) { 3729 btrfs_orphan_del(NULL, inode); 3730 goto no_delete; 3731 } 3732 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ 3733 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3734 3735 if (root->fs_info->log_root_recovering) { 3736 BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); 3737 goto no_delete; 3738 } 3739 3740 if (inode->i_nlink > 0) { 3741 BUG_ON(btrfs_root_refs(&root->root_item) != 0); 3742 goto no_delete; 3743 } 3744 3745 btrfs_i_size_write(inode, 0); 3746 3747 while (1) { 3748 trans = btrfs_start_transaction(root, 0); 3749 BUG_ON(IS_ERR(trans)); 3750 btrfs_set_trans_block_group(trans, inode); 3751 trans->block_rsv = root->orphan_block_rsv; 3752 3753 ret = btrfs_block_rsv_check(trans, root, 3754 root->orphan_block_rsv, 0, 5); 3755 if (ret) { 3756 BUG_ON(ret != -EAGAIN); 3757 ret = btrfs_commit_transaction(trans, root); 3758 BUG_ON(ret); 3759 continue; 3760 } 3761 3762 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3763 if (ret != -EAGAIN) 3764 break; 3765 3766 nr = trans->blocks_used; 3767 btrfs_end_transaction(trans, root); 3768 trans = NULL; 3769 btrfs_btree_balance_dirty(root, nr); 3770 3771 } 3772 3773 if (ret == 0) { 3774 ret = btrfs_orphan_del(trans, inode); 3775 BUG_ON(ret); 3776 } 3777 3778 nr = trans->blocks_used; 3779 btrfs_end_transaction(trans, root); 3780 btrfs_btree_balance_dirty(root, nr); 3781 no_delete: 3782 end_writeback(inode); 3783 return; 3784 } 3785 3786 /* 3787 * this returns the key found in the dir entry in the location pointer. 3788 * If no dir entries were found, location->objectid is 0. 
3789 */ 3790 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 3791 struct btrfs_key *location) 3792 { 3793 const char *name = dentry->d_name.name; 3794 int namelen = dentry->d_name.len; 3795 struct btrfs_dir_item *di; 3796 struct btrfs_path *path; 3797 struct btrfs_root *root = BTRFS_I(dir)->root; 3798 int ret = 0; 3799 3800 path = btrfs_alloc_path(); 3801 BUG_ON(!path); 3802 3803 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, 3804 namelen, 0); 3805 if (IS_ERR(di)) 3806 ret = PTR_ERR(di); 3807 3808 if (!di || IS_ERR(di)) 3809 goto out_err; 3810 3811 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 3812 out: 3813 btrfs_free_path(path); 3814 return ret; 3815 out_err: 3816 location->objectid = 0; 3817 goto out; 3818 } 3819 3820 /* 3821 * when we hit a tree root in a directory, the btrfs part of the inode 3822 * needs to be changed to reflect the root directory of the tree root. This 3823 * is kind of like crossing a mount point. 3824 */ 3825 static int fixup_tree_root_location(struct btrfs_root *root, 3826 struct inode *dir, 3827 struct dentry *dentry, 3828 struct btrfs_key *location, 3829 struct btrfs_root **sub_root) 3830 { 3831 struct btrfs_path *path; 3832 struct btrfs_root *new_root; 3833 struct btrfs_root_ref *ref; 3834 struct extent_buffer *leaf; 3835 int ret; 3836 int err = 0; 3837 3838 path = btrfs_alloc_path(); 3839 if (!path) { 3840 err = -ENOMEM; 3841 goto out; 3842 } 3843 3844 err = -ENOENT; 3845 ret = btrfs_find_root_ref(root->fs_info->tree_root, path, 3846 BTRFS_I(dir)->root->root_key.objectid, 3847 location->objectid); 3848 if (ret) { 3849 if (ret < 0) 3850 err = ret; 3851 goto out; 3852 } 3853 3854 leaf = path->nodes[0]; 3855 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 3856 if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || 3857 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 3858 goto out; 3859 3860 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 3861 (unsigned long)(ref + 1), 3862 dentry->d_name.len); 3863 if (ret) 3864 goto out; 3865 3866 btrfs_release_path(root->fs_info->tree_root, path); 3867 3868 new_root = btrfs_read_fs_root_no_name(root->fs_info, location); 3869 if (IS_ERR(new_root)) { 3870 err = PTR_ERR(new_root); 3871 goto out; 3872 } 3873 3874 if (btrfs_root_refs(&new_root->root_item) == 0) { 3875 err = -ENOENT; 3876 goto out; 3877 } 3878 3879 *sub_root = new_root; 3880 location->objectid = btrfs_root_dirid(&new_root->root_item); 3881 location->type = BTRFS_INODE_ITEM_KEY; 3882 location->offset = 0; 3883 err = 0; 3884 out: 3885 btrfs_free_path(path); 3886 return err; 3887 } 3888 3889 static void inode_tree_add(struct inode *inode) 3890 { 3891 struct btrfs_root *root = BTRFS_I(inode)->root; 3892 struct btrfs_inode *entry; 3893 struct rb_node **p; 3894 struct rb_node *parent; 3895 again: 3896 p = &root->inode_tree.rb_node; 3897 parent = NULL; 3898 3899 if (inode_unhashed(inode)) 3900 return; 3901 3902 spin_lock(&root->inode_lock); 3903 while (*p) { 3904 parent = *p; 3905 entry = rb_entry(parent, struct btrfs_inode, rb_node); 3906 3907 if (inode->i_ino < entry->vfs_inode.i_ino) 3908 p = &parent->rb_left; 3909 else if (inode->i_ino > entry->vfs_inode.i_ino) 3910 p = &parent->rb_right; 3911 else { 3912 WARN_ON(!(entry->vfs_inode.i_state & 3913 (I_WILL_FREE | I_FREEING))); 3914 rb_erase(parent, &root->inode_tree); 3915 RB_CLEAR_NODE(parent); 3916 spin_unlock(&root->inode_lock); 3917 goto again; 3918 } 3919 } 3920 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); 3921 
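	/* the node is linked in place; rebalance to keep the rbtree valid */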
rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3922 spin_unlock(&root->inode_lock); 3923 } 3924 3925 static void inode_tree_del(struct inode *inode) 3926 { 3927 struct btrfs_root *root = BTRFS_I(inode)->root; 3928 int empty = 0; 3929 3930 spin_lock(&root->inode_lock); 3931 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 3932 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3933 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 3934 empty = RB_EMPTY_ROOT(&root->inode_tree); 3935 } 3936 spin_unlock(&root->inode_lock); 3937 3938 /* 3939 * Free space cache has inodes in the tree root, but the tree root has a 3940 * root_refs of 0, so this could end up dropping the tree root as a 3941 * snapshot, so we need the extra !root->fs_info->tree_root check to 3942 * make sure we don't drop it. 3943 */ 3944 if (empty && btrfs_root_refs(&root->root_item) == 0 && 3945 root != root->fs_info->tree_root) { 3946 synchronize_srcu(&root->fs_info->subvol_srcu); 3947 spin_lock(&root->inode_lock); 3948 empty = RB_EMPTY_ROOT(&root->inode_tree); 3949 spin_unlock(&root->inode_lock); 3950 if (empty) 3951 btrfs_add_dead_root(root); 3952 } 3953 } 3954 3955 int btrfs_invalidate_inodes(struct btrfs_root *root) 3956 { 3957 struct rb_node *node; 3958 struct rb_node *prev; 3959 struct btrfs_inode *entry; 3960 struct inode *inode; 3961 u64 objectid = 0; 3962 3963 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 3964 3965 spin_lock(&root->inode_lock); 3966 again: 3967 node = root->inode_tree.rb_node; 3968 prev = NULL; 3969 while (node) { 3970 prev = node; 3971 entry = rb_entry(node, struct btrfs_inode, rb_node); 3972 3973 if (objectid < entry->vfs_inode.i_ino) 3974 node = node->rb_left; 3975 else if (objectid > entry->vfs_inode.i_ino) 3976 node = node->rb_right; 3977 else 3978 break; 3979 } 3980 if (!node) { 3981 while (prev) { 3982 entry = rb_entry(prev, struct btrfs_inode, rb_node); 3983 if (objectid <= entry->vfs_inode.i_ino) { 3984 node = prev; 3985 break; 3986 } 3987 prev = rb_next(prev); 3988 } 3989 } 3990 while (node) { 3991 entry = rb_entry(node, struct btrfs_inode, rb_node); 3992 objectid = entry->vfs_inode.i_ino + 1; 3993 inode = igrab(&entry->vfs_inode); 3994 if (inode) { 3995 spin_unlock(&root->inode_lock); 3996 if (atomic_read(&inode->i_count) > 1) 3997 d_prune_aliases(inode); 3998 /* 3999 * btrfs_drop_inode will have it removed from 4000 * the inode cache when its usage count 4001 * hits zero. 
4002 */ 4003 iput(inode); 4004 cond_resched(); 4005 spin_lock(&root->inode_lock); 4006 goto again; 4007 } 4008 4009 if (cond_resched_lock(&root->inode_lock)) 4010 goto again; 4011 4012 node = rb_next(node); 4013 } 4014 spin_unlock(&root->inode_lock); 4015 return 0; 4016 } 4017 4018 static int btrfs_init_locked_inode(struct inode *inode, void *p) 4019 { 4020 struct btrfs_iget_args *args = p; 4021 inode->i_ino = args->ino; 4022 BTRFS_I(inode)->root = args->root; 4023 btrfs_set_inode_space_info(args->root, inode); 4024 return 0; 4025 } 4026 4027 static int btrfs_find_actor(struct inode *inode, void *opaque) 4028 { 4029 struct btrfs_iget_args *args = opaque; 4030 return args->ino == inode->i_ino && 4031 args->root == BTRFS_I(inode)->root; 4032 } 4033 4034 static struct inode *btrfs_iget_locked(struct super_block *s, 4035 u64 objectid, 4036 struct btrfs_root *root) 4037 { 4038 struct inode *inode; 4039 struct btrfs_iget_args args; 4040 args.ino = objectid; 4041 args.root = root; 4042 4043 inode = iget5_locked(s, objectid, btrfs_find_actor, 4044 btrfs_init_locked_inode, 4045 (void *)&args); 4046 return inode; 4047 } 4048 4049 /* Get an inode object given its location and corresponding root. 4050 * Returns in *is_new if the inode was read from disk 4051 */ 4052 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 4053 struct btrfs_root *root, int *new) 4054 { 4055 struct inode *inode; 4056 4057 inode = btrfs_iget_locked(s, location->objectid, root); 4058 if (!inode) 4059 return ERR_PTR(-ENOMEM); 4060 4061 if (inode->i_state & I_NEW) { 4062 BTRFS_I(inode)->root = root; 4063 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 4064 btrfs_read_locked_inode(inode); 4065 4066 inode_tree_add(inode); 4067 unlock_new_inode(inode); 4068 if (new) 4069 *new = 1; 4070 } 4071 4072 return inode; 4073 } 4074 4075 static struct inode *new_simple_dir(struct super_block *s, 4076 struct btrfs_key *key, 4077 struct btrfs_root *root) 4078 { 4079 struct inode *inode = new_inode(s); 4080 4081 if (!inode) 4082 return ERR_PTR(-ENOMEM); 4083 4084 BTRFS_I(inode)->root = root; 4085 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4086 BTRFS_I(inode)->dummy_inode = 1; 4087 4088 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 4089 inode->i_op = &simple_dir_inode_operations; 4090 inode->i_fop = &simple_dir_operations; 4091 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 4092 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4093 4094 return inode; 4095 } 4096 4097 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 4098 { 4099 struct inode *inode; 4100 struct btrfs_root *root = BTRFS_I(dir)->root; 4101 struct btrfs_root *sub_root = root; 4102 struct btrfs_key location; 4103 int index; 4104 int ret; 4105 4106 if (dentry->d_name.len > BTRFS_NAME_LEN) 4107 return ERR_PTR(-ENAMETOOLONG); 4108 4109 ret = btrfs_inode_by_name(dir, dentry, &location); 4110 4111 if (ret < 0) 4112 return ERR_PTR(ret); 4113 4114 if (location.objectid == 0) 4115 return NULL; 4116 4117 if (location.type == BTRFS_INODE_ITEM_KEY) { 4118 inode = btrfs_iget(dir->i_sb, &location, root, NULL); 4119 return inode; 4120 } 4121 4122 BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); 4123 4124 index = srcu_read_lock(&root->fs_info->subvol_srcu); 4125 ret = fixup_tree_root_location(root, dir, dentry, 4126 &location, &sub_root); 4127 if (ret < 0) { 4128 if (ret != -ENOENT) 4129 inode = ERR_PTR(ret); 4130 else 4131 inode = new_simple_dir(dir->i_sb, &location, sub_root); 4132 } else { 4133 
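/* the key was fixed up above to point at the root directory of the subvolume, so do the lookup in that root */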
inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); 4134 } 4135 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4136 4137 if (root != sub_root) { 4138 down_read(&root->fs_info->cleanup_work_sem); 4139 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4140 btrfs_orphan_cleanup(sub_root); 4141 up_read(&root->fs_info->cleanup_work_sem); 4142 } 4143 4144 return inode; 4145 } 4146 4147 static int btrfs_dentry_delete(const struct dentry *dentry) 4148 { 4149 struct btrfs_root *root; 4150 4151 if (!dentry->d_inode && !IS_ROOT(dentry)) 4152 dentry = dentry->d_parent; 4153 4154 if (dentry->d_inode) { 4155 root = BTRFS_I(dentry->d_inode)->root; 4156 if (btrfs_root_refs(&root->root_item) == 0) 4157 return 1; 4158 } 4159 return 0; 4160 } 4161 4162 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 4163 struct nameidata *nd) 4164 { 4165 struct inode *inode; 4166 4167 inode = btrfs_lookup_dentry(dir, dentry); 4168 if (IS_ERR(inode)) 4169 return ERR_CAST(inode); 4170 4171 return d_splice_alias(inode, dentry); 4172 } 4173 4174 static unsigned char btrfs_filetype_table[] = { 4175 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 4176 }; 4177 4178 static int btrfs_real_readdir(struct file *filp, void *dirent, 4179 filldir_t filldir) 4180 { 4181 struct inode *inode = filp->f_dentry->d_inode; 4182 struct btrfs_root *root = BTRFS_I(inode)->root; 4183 struct btrfs_item *item; 4184 struct btrfs_dir_item *di; 4185 struct btrfs_key key; 4186 struct btrfs_key found_key; 4187 struct btrfs_path *path; 4188 int ret; 4189 u32 nritems; 4190 struct extent_buffer *leaf; 4191 int slot; 4192 int advance; 4193 unsigned char d_type; 4194 int over = 0; 4195 u32 di_cur; 4196 u32 di_total; 4197 u32 di_len; 4198 int key_type = BTRFS_DIR_INDEX_KEY; 4199 char tmp_name[32]; 4200 char *name_ptr; 4201 int name_len; 4202 4203 /* FIXME, use a real flag for deciding about the key type */ 4204 if (root->fs_info->tree_root == root) 4205 key_type = BTRFS_DIR_ITEM_KEY; 4206 4207 /* special case for "." 
*/ 4208 if (filp->f_pos == 0) { 4209 over = filldir(dirent, ".", 1, 4210 1, inode->i_ino, 4211 DT_DIR); 4212 if (over) 4213 return 0; 4214 filp->f_pos = 1; 4215 } 4216 /* special case for .., just use the back ref */ 4217 if (filp->f_pos == 1) { 4218 u64 pino = parent_ino(filp->f_path.dentry); 4219 over = filldir(dirent, "..", 2, 4220 2, pino, DT_DIR); 4221 if (over) 4222 return 0; 4223 filp->f_pos = 2; 4224 } 4225 path = btrfs_alloc_path(); if (!path) return -ENOMEM; 4226 path->reada = 2; 4227 4228 btrfs_set_key_type(&key, key_type); 4229 key.offset = filp->f_pos; 4230 key.objectid = inode->i_ino; 4231 4232 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4233 if (ret < 0) 4234 goto err; 4235 advance = 0; 4236 4237 while (1) { 4238 leaf = path->nodes[0]; 4239 nritems = btrfs_header_nritems(leaf); 4240 slot = path->slots[0]; 4241 if (advance || slot >= nritems) { 4242 if (slot >= nritems - 1) { 4243 ret = btrfs_next_leaf(root, path); 4244 if (ret) 4245 break; 4246 leaf = path->nodes[0]; 4247 nritems = btrfs_header_nritems(leaf); 4248 slot = path->slots[0]; 4249 } else { 4250 slot++; 4251 path->slots[0]++; 4252 } 4253 } 4254 4255 advance = 1; 4256 item = btrfs_item_nr(leaf, slot); 4257 btrfs_item_key_to_cpu(leaf, &found_key, slot); 4258 4259 if (found_key.objectid != key.objectid) 4260 break; 4261 if (btrfs_key_type(&found_key) != key_type) 4262 break; 4263 if (found_key.offset < filp->f_pos) 4264 continue; 4265 4266 filp->f_pos = found_key.offset; 4267 4268 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 4269 di_cur = 0; 4270 di_total = btrfs_item_size(leaf, item); 4271 4272 while (di_cur < di_total) { 4273 struct btrfs_key location; 4274 4275 name_len = btrfs_dir_name_len(leaf, di); 4276 if (name_len <= sizeof(tmp_name)) { 4277 name_ptr = tmp_name; 4278 } else { 4279 name_ptr = kmalloc(name_len, GFP_NOFS); 4280 if (!name_ptr) { 4281 ret = -ENOMEM; 4282 goto err; 4283 } 4284 } 4285 read_extent_buffer(leaf, name_ptr, 4286 (unsigned long)(di + 1), name_len); 4287 4288 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; 4289 btrfs_dir_item_key_to_cpu(leaf, di, &location); 4290 4291 /* is this a reference to our own snapshot? If so 4292 * skip it 4293 */ 4294 if (location.type == BTRFS_ROOT_ITEM_KEY && 4295 location.objectid == root->root_key.objectid) { 4296 over = 0; 4297 goto skip; 4298 } 4299 over = filldir(dirent, name_ptr, name_len, 4300 found_key.offset, location.objectid, 4301 d_type); 4302 4303 skip: 4304 if (name_ptr != tmp_name) 4305 kfree(name_ptr); 4306 4307 if (over) 4308 goto nopos; 4309 di_len = btrfs_dir_name_len(leaf, di) + 4310 btrfs_dir_data_len(leaf, di) + sizeof(*di); 4311 di_cur += di_len; 4312 di = (struct btrfs_dir_item *)((char *)di + di_len); 4313 } 4314 } 4315 4316 /* Reached end of directory/root. Bump pos past the last item. */ 4317 if (key_type == BTRFS_DIR_INDEX_KEY) 4318 /* 4319 * 32-bit glibc will use getdents64, but then strtol - 4320 * so the last number we can serve is this.
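* (0x7fffffff is simply the largest positive value a 32-bit signed long can hold)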
4321 */ 4322 filp->f_pos = 0x7fffffff; 4323 else 4324 filp->f_pos++; 4325 nopos: 4326 ret = 0; 4327 err: 4328 btrfs_free_path(path); 4329 return ret; 4330 } 4331 4332 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) 4333 { 4334 struct btrfs_root *root = BTRFS_I(inode)->root; 4335 struct btrfs_trans_handle *trans; 4336 int ret = 0; 4337 bool nolock = false; 4338 4339 if (BTRFS_I(inode)->dummy_inode) 4340 return 0; 4341 4342 smp_mb(); 4343 nolock = (root->fs_info->closing && root == root->fs_info->tree_root); 4344 4345 if (wbc->sync_mode == WB_SYNC_ALL) { 4346 if (nolock) 4347 trans = btrfs_join_transaction_nolock(root, 1); 4348 else 4349 trans = btrfs_join_transaction(root, 1); 4350 btrfs_set_trans_block_group(trans, inode); 4351 if (nolock) 4352 ret = btrfs_end_transaction_nolock(trans, root); 4353 else 4354 ret = btrfs_commit_transaction(trans, root); 4355 } 4356 return ret; 4357 } 4358 4359 /* 4360 * This is somewhat expensive, updating the tree every time the 4361 * inode changes. But, it is most likely to find the inode in cache. 4362 * FIXME, needs more benchmarking...there are no reasons other than performance 4363 * to keep or drop this code. 4364 */ 4365 void btrfs_dirty_inode(struct inode *inode) 4366 { 4367 struct btrfs_root *root = BTRFS_I(inode)->root; 4368 struct btrfs_trans_handle *trans; 4369 int ret; 4370 4371 if (BTRFS_I(inode)->dummy_inode) 4372 return; 4373 4374 trans = btrfs_join_transaction(root, 1); 4375 btrfs_set_trans_block_group(trans, inode); 4376 4377 ret = btrfs_update_inode(trans, root, inode); 4378 if (ret && ret == -ENOSPC) { 4379 /* whoops, lets try again with the full transaction */ 4380 btrfs_end_transaction(trans, root); 4381 trans = btrfs_start_transaction(root, 1); 4382 if (IS_ERR(trans)) { 4383 if (printk_ratelimit()) { 4384 printk(KERN_ERR "btrfs: fail to " 4385 "dirty inode %lu error %ld\n", 4386 inode->i_ino, PTR_ERR(trans)); 4387 } 4388 return; 4389 } 4390 btrfs_set_trans_block_group(trans, inode); 4391 4392 ret = btrfs_update_inode(trans, root, inode); 4393 if (ret) { 4394 if (printk_ratelimit()) { 4395 printk(KERN_ERR "btrfs: fail to " 4396 "dirty inode %lu error %d\n", 4397 inode->i_ino, ret); 4398 } 4399 } 4400 } 4401 btrfs_end_transaction(trans, root); 4402 } 4403 4404 /* 4405 * find the highest existing sequence number in a directory 4406 * and then set the in-memory index_cnt variable to reflect 4407 * free sequence numbers 4408 */ 4409 static int btrfs_set_inode_index_count(struct inode *inode) 4410 { 4411 struct btrfs_root *root = BTRFS_I(inode)->root; 4412 struct btrfs_key key, found_key; 4413 struct btrfs_path *path; 4414 struct extent_buffer *leaf; 4415 int ret; 4416 4417 key.objectid = inode->i_ino; 4418 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 4419 key.offset = (u64)-1; 4420 4421 path = btrfs_alloc_path(); 4422 if (!path) 4423 return -ENOMEM; 4424 4425 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4426 if (ret < 0) 4427 goto out; 4428 /* FIXME: we should be able to handle this */ 4429 if (ret == 0) 4430 goto out; 4431 ret = 0; 4432 4433 /* 4434 * MAGIC NUMBER EXPLANATION: 4435 * since we search a directory based on f_pos we have to start at 2 4436 * since '.' and '..' 
have f_pos of 0 and 1 respectively, so everybody 4437 * else has to start at 2 4438 */ 4439 if (path->slots[0] == 0) { 4440 BTRFS_I(inode)->index_cnt = 2; 4441 goto out; 4442 } 4443 4444 path->slots[0]--; 4445 4446 leaf = path->nodes[0]; 4447 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4448 4449 if (found_key.objectid != inode->i_ino || 4450 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { 4451 BTRFS_I(inode)->index_cnt = 2; 4452 goto out; 4453 } 4454 4455 BTRFS_I(inode)->index_cnt = found_key.offset + 1; 4456 out: 4457 btrfs_free_path(path); 4458 return ret; 4459 } 4460 4461 /* 4462 * helper to find a free sequence number in a given directory. This current 4463 * code is very simple, later versions will do smarter things in the btree 4464 */ 4465 int btrfs_set_inode_index(struct inode *dir, u64 *index) 4466 { 4467 int ret = 0; 4468 4469 if (BTRFS_I(dir)->index_cnt == (u64)-1) { 4470 ret = btrfs_set_inode_index_count(dir); 4471 if (ret) 4472 return ret; 4473 } 4474 4475 *index = BTRFS_I(dir)->index_cnt; 4476 BTRFS_I(dir)->index_cnt++; 4477 4478 return ret; 4479 } 4480 4481 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 4482 struct btrfs_root *root, 4483 struct inode *dir, 4484 const char *name, int name_len, 4485 u64 ref_objectid, u64 objectid, 4486 u64 alloc_hint, int mode, u64 *index) 4487 { 4488 struct inode *inode; 4489 struct btrfs_inode_item *inode_item; 4490 struct btrfs_key *location; 4491 struct btrfs_path *path; 4492 struct btrfs_inode_ref *ref; 4493 struct btrfs_key key[2]; 4494 u32 sizes[2]; 4495 unsigned long ptr; 4496 int ret; 4497 int owner; 4498 4499 path = btrfs_alloc_path(); 4500 BUG_ON(!path); 4501 4502 inode = new_inode(root->fs_info->sb); 4503 if (!inode) 4504 return ERR_PTR(-ENOMEM); 4505 4506 if (dir) { 4507 ret = btrfs_set_inode_index(dir, index); 4508 if (ret) { 4509 iput(inode); 4510 return ERR_PTR(ret); 4511 } 4512 } 4513 /* 4514 * index_cnt is ignored for everything but a dir, 4515 * btrfs_set_inode_index_count has an explanation for the magic 4516 * number 4517 */ 4518 BTRFS_I(inode)->index_cnt = 2; 4519 BTRFS_I(inode)->root = root; 4520 BTRFS_I(inode)->generation = trans->transid; 4521 inode->i_generation = BTRFS_I(inode)->generation; 4522 btrfs_set_inode_space_info(root, inode); 4523 4524 if (mode & S_IFDIR) 4525 owner = 0; 4526 else 4527 owner = 1; 4528 BTRFS_I(inode)->block_group = 4529 btrfs_find_block_group(root, 0, alloc_hint, owner); 4530 4531 key[0].objectid = objectid; 4532 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 4533 key[0].offset = 0; 4534 4535 key[1].objectid = objectid; 4536 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 4537 key[1].offset = ref_objectid; 4538 4539 sizes[0] = sizeof(struct btrfs_inode_item); 4540 sizes[1] = name_len + sizeof(*ref); 4541 4542 path->leave_spinning = 1; 4543 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 4544 if (ret != 0) 4545 goto fail; 4546 4547 inode_init_owner(inode, dir, mode); 4548 inode->i_ino = objectid; 4549 inode_set_bytes(inode, 0); 4550 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4551 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4552 struct btrfs_inode_item); 4553 fill_inode_item(trans, path->nodes[0], inode_item, inode); 4554 4555 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 4556 struct btrfs_inode_ref); 4557 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 4558 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 4559 ptr = (unsigned long)(ref + 1); 4560
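/* the name bytes are stored inline, immediately after the inode ref item */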
write_extent_buffer(path->nodes[0], name, ptr, name_len); 4561 4562 btrfs_mark_buffer_dirty(path->nodes[0]); 4563 btrfs_free_path(path); 4564 4565 location = &BTRFS_I(inode)->location; 4566 location->objectid = objectid; 4567 location->offset = 0; 4568 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 4569 4570 btrfs_inherit_iflags(inode, dir); 4571 4572 if ((mode & S_IFREG)) { 4573 if (btrfs_test_opt(root, NODATASUM)) 4574 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4575 if (btrfs_test_opt(root, NODATACOW)) 4576 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4577 } 4578 4579 insert_inode_hash(inode); 4580 inode_tree_add(inode); 4581 return inode; 4582 fail: 4583 if (dir) 4584 BTRFS_I(dir)->index_cnt--; 4585 btrfs_free_path(path); 4586 iput(inode); 4587 return ERR_PTR(ret); 4588 } 4589 4590 static inline u8 btrfs_inode_type(struct inode *inode) 4591 { 4592 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; 4593 } 4594 4595 /* 4596 * utility function to add 'inode' into 'parent_inode' with 4597 * a given name and a given sequence number. 4598 * if 'add_backref' is true, also insert a backref from the 4599 * inode to the parent directory. 4600 */ 4601 int btrfs_add_link(struct btrfs_trans_handle *trans, 4602 struct inode *parent_inode, struct inode *inode, 4603 const char *name, int name_len, int add_backref, u64 index) 4604 { 4605 int ret = 0; 4606 struct btrfs_key key; 4607 struct btrfs_root *root = BTRFS_I(parent_inode)->root; 4608 4609 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 4610 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); 4611 } else { 4612 key.objectid = inode->i_ino; 4613 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 4614 key.offset = 0; 4615 } 4616 4617 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 4618 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 4619 key.objectid, root->root_key.objectid, 4620 parent_inode->i_ino, 4621 index, name, name_len); 4622 } else if (add_backref) { 4623 ret = btrfs_insert_inode_ref(trans, root, 4624 name, name_len, inode->i_ino, 4625 parent_inode->i_ino, index); 4626 } 4627 4628 if (ret == 0) { 4629 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4630 parent_inode->i_ino, &key, 4631 btrfs_inode_type(inode), index); 4632 BUG_ON(ret); 4633 4634 btrfs_i_size_write(parent_inode, parent_inode->i_size + 4635 name_len * 2); 4636 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 4637 ret = btrfs_update_inode(trans, root, parent_inode); 4638 } 4639 return ret; 4640 } 4641 4642 static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 4643 struct inode *dir, struct dentry *dentry, 4644 struct inode *inode, int backref, u64 index) 4645 { 4646 int err = btrfs_add_link(trans, dir, inode, 4647 dentry->d_name.name, dentry->d_name.len, 4648 backref, index); 4649 if (!err) { 4650 d_instantiate(dentry, inode); 4651 return 0; 4652 } 4653 if (err > 0) 4654 err = -EEXIST; 4655 return err; 4656 } 4657 4658 static int btrfs_mknod(struct inode *dir, struct dentry *dentry, 4659 int mode, dev_t rdev) 4660 { 4661 struct btrfs_trans_handle *trans; 4662 struct btrfs_root *root = BTRFS_I(dir)->root; 4663 struct inode *inode = NULL; 4664 int err; 4665 int drop_inode = 0; 4666 u64 objectid; 4667 unsigned long nr = 0; 4668 u64 index = 0; 4669 4670 if (!new_valid_dev(rdev)) 4671 return -EINVAL; 4672 4673 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); 4674 if (err) 4675 return err; 4676 4677 /* 4678 * 2 for inode item and ref 4679 * 2 for dir items 4680 * 1 for xattr
if selinux is on 4681 */ 4682 trans = btrfs_start_transaction(root, 5); 4683 if (IS_ERR(trans)) 4684 return PTR_ERR(trans); 4685 4686 btrfs_set_trans_block_group(trans, dir); 4687 4688 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4689 dentry->d_name.len, dir->i_ino, objectid, 4690 BTRFS_I(dir)->block_group, mode, &index); 4691 err = PTR_ERR(inode); 4692 if (IS_ERR(inode)) 4693 goto out_unlock; 4694 4695 err = btrfs_init_inode_security(trans, inode, dir); 4696 if (err) { 4697 drop_inode = 1; 4698 goto out_unlock; 4699 } 4700 4701 btrfs_set_trans_block_group(trans, inode); 4702 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4703 if (err) 4704 drop_inode = 1; 4705 else { 4706 inode->i_op = &btrfs_special_inode_operations; 4707 init_special_inode(inode, inode->i_mode, rdev); 4708 btrfs_update_inode(trans, root, inode); 4709 } 4710 btrfs_update_inode_block_group(trans, inode); 4711 btrfs_update_inode_block_group(trans, dir); 4712 out_unlock: 4713 nr = trans->blocks_used; 4714 btrfs_end_transaction_throttle(trans, root); 4715 btrfs_btree_balance_dirty(root, nr); 4716 if (drop_inode) { 4717 inode_dec_link_count(inode); 4718 iput(inode); 4719 } 4720 return err; 4721 } 4722 4723 static int btrfs_create(struct inode *dir, struct dentry *dentry, 4724 int mode, struct nameidata *nd) 4725 { 4726 struct btrfs_trans_handle *trans; 4727 struct btrfs_root *root = BTRFS_I(dir)->root; 4728 struct inode *inode = NULL; 4729 int drop_inode = 0; 4730 int err; 4731 unsigned long nr = 0; 4732 u64 objectid; 4733 u64 index = 0; 4734 4735 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); 4736 if (err) 4737 return err; 4738 /* 4739 * 2 for inode item and ref 4740 * 2 for dir items 4741 * 1 for xattr if selinux is on 4742 */ 4743 trans = btrfs_start_transaction(root, 5); 4744 if (IS_ERR(trans)) 4745 return PTR_ERR(trans); 4746 4747 btrfs_set_trans_block_group(trans, dir); 4748 4749 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4750 dentry->d_name.len, dir->i_ino, objectid, 4751 BTRFS_I(dir)->block_group, mode, &index); 4752 err = PTR_ERR(inode); 4753 if (IS_ERR(inode)) 4754 goto out_unlock; 4755 4756 err = btrfs_init_inode_security(trans, inode, dir); 4757 if (err) { 4758 drop_inode = 1; 4759 goto out_unlock; 4760 } 4761 4762 btrfs_set_trans_block_group(trans, inode); 4763 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4764 if (err) 4765 drop_inode = 1; 4766 else { 4767 inode->i_mapping->a_ops = &btrfs_aops; 4768 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 4769 inode->i_fop = &btrfs_file_operations; 4770 inode->i_op = &btrfs_file_inode_operations; 4771 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4772 } 4773 btrfs_update_inode_block_group(trans, inode); 4774 btrfs_update_inode_block_group(trans, dir); 4775 out_unlock: 4776 nr = trans->blocks_used; 4777 btrfs_end_transaction_throttle(trans, root); 4778 if (drop_inode) { 4779 inode_dec_link_count(inode); 4780 iput(inode); 4781 } 4782 btrfs_btree_balance_dirty(root, nr); 4783 return err; 4784 } 4785 4786 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 4787 struct dentry *dentry) 4788 { 4789 struct btrfs_trans_handle *trans; 4790 struct btrfs_root *root = BTRFS_I(dir)->root; 4791 struct inode *inode = old_dentry->d_inode; 4792 u64 index; 4793 unsigned long nr = 0; 4794 int err; 4795 int drop_inode = 0; 4796 4797 if (inode->i_nlink == 0) 4798 return -ENOENT; 4799 4800 /* do not allow sys_link's with other subvols of the same device */ 4801 if 
(root->objectid != BTRFS_I(inode)->root->objectid) 4802 return -EPERM; 4803 4804 btrfs_inc_nlink(inode); 4805 inode->i_ctime = CURRENT_TIME; 4806 4807 err = btrfs_set_inode_index(dir, &index); 4808 if (err) 4809 goto fail; 4810 4811 /* 4812 * 1 item for inode ref 4813 * 2 items for dir items 4814 */ 4815 trans = btrfs_start_transaction(root, 3); 4816 if (IS_ERR(trans)) { 4817 err = PTR_ERR(trans); 4818 goto fail; 4819 } 4820 4821 btrfs_set_trans_block_group(trans, dir); 4822 ihold(inode); 4823 4824 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 4825 4826 if (err) { 4827 drop_inode = 1; 4828 } else { 4829 struct dentry *parent = dget_parent(dentry); 4830 btrfs_update_inode_block_group(trans, dir); 4831 err = btrfs_update_inode(trans, root, inode); 4832 BUG_ON(err); 4833 btrfs_log_new_name(trans, inode, NULL, parent); 4834 dput(parent); 4835 } 4836 4837 nr = trans->blocks_used; 4838 btrfs_end_transaction_throttle(trans, root); 4839 fail: 4840 if (drop_inode) { 4841 inode_dec_link_count(inode); 4842 iput(inode); 4843 } 4844 btrfs_btree_balance_dirty(root, nr); 4845 return err; 4846 } 4847 4848 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 4849 { 4850 struct inode *inode = NULL; 4851 struct btrfs_trans_handle *trans; 4852 struct btrfs_root *root = BTRFS_I(dir)->root; 4853 int err = 0; 4854 int drop_on_err = 0; 4855 u64 objectid = 0; 4856 u64 index = 0; 4857 unsigned long nr = 1; 4858 4859 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); 4860 if (err) 4861 return err; 4862 4863 /* 4864 * 2 items for inode and ref 4865 * 2 items for dir items 4866 * 1 for xattr if selinux is on 4867 */ 4868 trans = btrfs_start_transaction(root, 5); 4869 if (IS_ERR(trans)) 4870 return PTR_ERR(trans); 4871 btrfs_set_trans_block_group(trans, dir); 4872 4873 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4874 dentry->d_name.len, dir->i_ino, objectid, 4875 BTRFS_I(dir)->block_group, S_IFDIR | mode, 4876 &index); 4877 if (IS_ERR(inode)) { 4878 err = PTR_ERR(inode); 4879 goto out_fail; 4880 } 4881 4882 drop_on_err = 1; 4883 4884 err = btrfs_init_inode_security(trans, inode, dir); 4885 if (err) 4886 goto out_fail; 4887 4888 inode->i_op = &btrfs_dir_inode_operations; 4889 inode->i_fop = &btrfs_dir_file_operations; 4890 btrfs_set_trans_block_group(trans, inode); 4891 4892 btrfs_i_size_write(inode, 0); 4893 err = btrfs_update_inode(trans, root, inode); 4894 if (err) 4895 goto out_fail; 4896 4897 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, 4898 dentry->d_name.len, 0, index); 4899 if (err) 4900 goto out_fail; 4901 4902 d_instantiate(dentry, inode); 4903 drop_on_err = 0; 4904 btrfs_update_inode_block_group(trans, inode); 4905 btrfs_update_inode_block_group(trans, dir); 4906 4907 out_fail: 4908 nr = trans->blocks_used; 4909 btrfs_end_transaction_throttle(trans, root); 4910 if (drop_on_err) 4911 iput(inode); 4912 btrfs_btree_balance_dirty(root, nr); 4913 return err; 4914 } 4915 4916 /* helper for btrfs_get_extent. Given an existing extent in the tree, 4917 * and an extent that you want to insert, deal with overlap and insert 4918 * the new extent into the tree.
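* The new extent is clipped to [map_start, map_start + map_len) so that it * no longer overlaps the extent already in the tree.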
4919 */ 4920 static int merge_extent_mapping(struct extent_map_tree *em_tree, 4921 struct extent_map *existing, 4922 struct extent_map *em, 4923 u64 map_start, u64 map_len) 4924 { 4925 u64 start_diff; 4926 4927 BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); 4928 start_diff = map_start - em->start; 4929 em->start = map_start; 4930 em->len = map_len; 4931 if (em->block_start < EXTENT_MAP_LAST_BYTE && 4932 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 4933 em->block_start += start_diff; 4934 em->block_len -= start_diff; 4935 } 4936 return add_extent_mapping(em_tree, em); 4937 } 4938 4939 static noinline int uncompress_inline(struct btrfs_path *path, 4940 struct inode *inode, struct page *page, 4941 size_t pg_offset, u64 extent_offset, 4942 struct btrfs_file_extent_item *item) 4943 { 4944 int ret; 4945 struct extent_buffer *leaf = path->nodes[0]; 4946 char *tmp; 4947 size_t max_size; 4948 unsigned long inline_size; 4949 unsigned long ptr; 4950 int compress_type; 4951 4952 WARN_ON(pg_offset != 0); 4953 compress_type = btrfs_file_extent_compression(leaf, item); 4954 max_size = btrfs_file_extent_ram_bytes(leaf, item); 4955 inline_size = btrfs_file_extent_inline_item_len(leaf, 4956 btrfs_item_nr(leaf, path->slots[0])); 4957 tmp = kmalloc(inline_size, GFP_NOFS); if (!tmp) return -ENOMEM; 4958 ptr = btrfs_file_extent_inline_start(item); 4959 4960 read_extent_buffer(leaf, tmp, ptr, inline_size); 4961 4962 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 4963 ret = btrfs_decompress(compress_type, tmp, page, 4964 extent_offset, inline_size, max_size); 4965 if (ret) { 4966 char *kaddr = kmap_atomic(page, KM_USER0); 4967 unsigned long copy_size = min_t(u64, 4968 PAGE_CACHE_SIZE - pg_offset, 4969 max_size - extent_offset); 4970 memset(kaddr + pg_offset, 0, copy_size); 4971 kunmap_atomic(kaddr, KM_USER0); 4972 } 4973 kfree(tmp); 4974 return 0; 4975 } 4976 4977 /* 4978 * a bit scary, this does extent mapping from logical file offset to the disk. 4979 * the ugly parts come from merging extents from the disk with the in-ram 4980 * representation. This gets more complex because of the data=ordered code, 4981 * where the in-ram extents might be locked pending data=ordered completion. 4982 * 4983 * This also copies inline extents directly into the page.
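* On success the returned extent_map always covers the requested start * offset; on failure an ERR_PTR is returned instead.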
4984 */ 4985 4986 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 4987 size_t pg_offset, u64 start, u64 len, 4988 int create) 4989 { 4990 int ret; 4991 int err = 0; 4992 u64 bytenr; 4993 u64 extent_start = 0; 4994 u64 extent_end = 0; 4995 u64 objectid = inode->i_ino; 4996 u32 found_type; 4997 struct btrfs_path *path = NULL; 4998 struct btrfs_root *root = BTRFS_I(inode)->root; 4999 struct btrfs_file_extent_item *item; 5000 struct extent_buffer *leaf; 5001 struct btrfs_key found_key; 5002 struct extent_map *em = NULL; 5003 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5004 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5005 struct btrfs_trans_handle *trans = NULL; 5006 int compress_type; 5007 5008 again: 5009 read_lock(&em_tree->lock); 5010 em = lookup_extent_mapping(em_tree, start, len); 5011 if (em) 5012 em->bdev = root->fs_info->fs_devices->latest_bdev; 5013 read_unlock(&em_tree->lock); 5014 5015 if (em) { 5016 if (em->start > start || em->start + em->len <= start) 5017 free_extent_map(em); 5018 else if (em->block_start == EXTENT_MAP_INLINE && page) 5019 free_extent_map(em); 5020 else 5021 goto out; 5022 } 5023 em = alloc_extent_map(GFP_NOFS); 5024 if (!em) { 5025 err = -ENOMEM; 5026 goto out; 5027 } 5028 em->bdev = root->fs_info->fs_devices->latest_bdev; 5029 em->start = EXTENT_MAP_HOLE; 5030 em->orig_start = EXTENT_MAP_HOLE; 5031 em->len = (u64)-1; 5032 em->block_len = (u64)-1; 5033 5034 if (!path) { 5035 path = btrfs_alloc_path(); 5036 BUG_ON(!path); 5037 } 5038 5039 ret = btrfs_lookup_file_extent(trans, root, path, 5040 objectid, start, trans != NULL); 5041 if (ret < 0) { 5042 err = ret; 5043 goto out; 5044 } 5045 5046 if (ret != 0) { 5047 if (path->slots[0] == 0) 5048 goto not_found; 5049 path->slots[0]--; 5050 } 5051 5052 leaf = path->nodes[0]; 5053 item = btrfs_item_ptr(leaf, path->slots[0], 5054 struct btrfs_file_extent_item); 5055 /* are we inside the extent that was found? 
*/ 5056 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5057 found_type = btrfs_key_type(&found_key); 5058 if (found_key.objectid != objectid || 5059 found_type != BTRFS_EXTENT_DATA_KEY) { 5060 goto not_found; 5061 } 5062 5063 found_type = btrfs_file_extent_type(leaf, item); 5064 extent_start = found_key.offset; 5065 compress_type = btrfs_file_extent_compression(leaf, item); 5066 if (found_type == BTRFS_FILE_EXTENT_REG || 5067 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5068 extent_end = extent_start + 5069 btrfs_file_extent_num_bytes(leaf, item); 5070 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 5071 size_t size; 5072 size = btrfs_file_extent_inline_len(leaf, item); 5073 extent_end = (extent_start + size + root->sectorsize - 1) & 5074 ~((u64)root->sectorsize - 1); 5075 } 5076 5077 if (start >= extent_end) { 5078 path->slots[0]++; 5079 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 5080 ret = btrfs_next_leaf(root, path); 5081 if (ret < 0) { 5082 err = ret; 5083 goto out; 5084 } 5085 if (ret > 0) 5086 goto not_found; 5087 leaf = path->nodes[0]; 5088 } 5089 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5090 if (found_key.objectid != objectid || 5091 found_key.type != BTRFS_EXTENT_DATA_KEY) 5092 goto not_found; 5093 if (start + len <= found_key.offset) 5094 goto not_found; 5095 em->start = start; 5096 em->len = found_key.offset - start; 5097 goto not_found_em; 5098 } 5099 5100 if (found_type == BTRFS_FILE_EXTENT_REG || 5101 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5102 em->start = extent_start; 5103 em->len = extent_end - extent_start; 5104 em->orig_start = extent_start - 5105 btrfs_file_extent_offset(leaf, item); 5106 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 5107 if (bytenr == 0) { 5108 em->block_start = EXTENT_MAP_HOLE; 5109 goto insert; 5110 } 5111 if (compress_type != BTRFS_COMPRESS_NONE) { 5112 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5113 em->compress_type = compress_type; 5114 em->block_start = bytenr; 5115 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5116 item); 5117 } else { 5118 bytenr += btrfs_file_extent_offset(leaf, item); 5119 em->block_start = bytenr; 5120 em->block_len = em->len; 5121 if (found_type == BTRFS_FILE_EXTENT_PREALLOC) 5122 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 5123 } 5124 goto insert; 5125 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 5126 unsigned long ptr; 5127 char *map; 5128 size_t size; 5129 size_t extent_offset; 5130 size_t copy_size; 5131 5132 em->block_start = EXTENT_MAP_INLINE; 5133 if (!page || create) { 5134 em->start = extent_start; 5135 em->len = extent_end - extent_start; 5136 goto out; 5137 } 5138 5139 size = btrfs_file_extent_inline_len(leaf, item); 5140 extent_offset = page_offset(page) + pg_offset - extent_start; 5141 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 5142 size - extent_offset); 5143 em->start = extent_start + extent_offset; 5144 em->len = (copy_size + root->sectorsize - 1) & 5145 ~((u64)root->sectorsize - 1); 5146 em->orig_start = EXTENT_MAP_INLINE; 5147 if (compress_type) { 5148 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5149 em->compress_type = compress_type; 5150 } 5151 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 5152 if (create == 0 && !PageUptodate(page)) { 5153 if (btrfs_file_extent_compression(leaf, item) != 5154 BTRFS_COMPRESS_NONE) { 5155 ret = uncompress_inline(path, inode, page, 5156 pg_offset, 5157 extent_offset, item); 5158 BUG_ON(ret); 5159 } else { 5160 map = kmap(page); 5161 read_extent_buffer(leaf, map + pg_offset, ptr, 
5162 copy_size); 5163 if (pg_offset + copy_size < PAGE_CACHE_SIZE) { 5164 memset(map + pg_offset + copy_size, 0, 5165 PAGE_CACHE_SIZE - pg_offset - 5166 copy_size); 5167 } 5168 kunmap(page); 5169 } 5170 flush_dcache_page(page); 5171 } else if (create && PageUptodate(page)) { 5172 WARN_ON(1); 5173 if (!trans) { 5175 free_extent_map(em); 5176 em = NULL; 5177 btrfs_release_path(root, path); 5178 trans = btrfs_join_transaction(root, 1); 5179 goto again; 5180 } 5181 map = kmap(page); 5182 write_extent_buffer(leaf, map + pg_offset, ptr, 5183 copy_size); 5184 kunmap(page); 5185 btrfs_mark_buffer_dirty(leaf); 5186 } 5187 set_extent_uptodate(io_tree, em->start, 5188 extent_map_end(em) - 1, GFP_NOFS); 5189 goto insert; 5190 } else { 5191 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5192 WARN_ON(1); 5193 } 5194 not_found: 5195 em->start = start; 5196 em->len = len; 5197 not_found_em: 5198 em->block_start = EXTENT_MAP_HOLE; 5199 set_bit(EXTENT_FLAG_VACANCY, &em->flags); 5200 insert: 5201 btrfs_release_path(root, path); 5202 if (em->start > start || extent_map_end(em) <= start) { 5203 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " 5204 "[%llu %llu]\n", (unsigned long long)em->start, 5205 (unsigned long long)em->len, 5206 (unsigned long long)start, 5207 (unsigned long long)len); 5208 err = -EIO; 5209 goto out; 5210 } 5211 5212 err = 0; 5213 write_lock(&em_tree->lock); 5214 ret = add_extent_mapping(em_tree, em); 5215 /* it is possible that someone inserted the extent into the tree 5216 * while we had the lock dropped. It is also possible that 5217 * an overlapping map exists in the tree 5218 */ 5219 if (ret == -EEXIST) { 5220 struct extent_map *existing; 5221 5222 ret = 0; 5223 5224 existing = lookup_extent_mapping(em_tree, start, len); 5225 if (existing && (existing->start > start || 5226 existing->start + existing->len <= start)) { 5227 free_extent_map(existing); 5228 existing = NULL; 5229 } 5230 if (!existing) { 5231 existing = lookup_extent_mapping(em_tree, em->start, 5232 em->len); 5233 if (existing) { 5234 err = merge_extent_mapping(em_tree, existing, 5235 em, start, 5236 root->sectorsize); 5237 free_extent_map(existing); 5238 if (err) { 5239 free_extent_map(em); 5240 em = NULL; 5241 } 5242 } else { 5243 err = -EIO; 5244 free_extent_map(em); 5245 em = NULL; 5246 } 5247 } else { 5248 free_extent_map(em); 5249 em = existing; 5250 err = 0; 5251 } 5252 } 5253 write_unlock(&em_tree->lock); 5254 out: 5255 if (path) 5256 btrfs_free_path(path); 5257 if (trans) { 5258 ret = btrfs_end_transaction(trans, root); 5259 if (!err) 5260 err = ret; 5261 } 5262 if (err) { 5263 free_extent_map(em); 5264 return ERR_PTR(err); 5265 } 5266 return em; 5267 } 5268 5269 static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5270 u64 start, u64 len) 5271 { 5272 struct btrfs_root *root = BTRFS_I(inode)->root; 5273 struct btrfs_trans_handle *trans; 5274 struct extent_map *em; 5275 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5276 struct btrfs_key ins; 5277 u64 alloc_hint; 5278 int ret; 5279 5280 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5281 5282 trans = btrfs_join_transaction(root, 0); 5283 if (!trans) 5284 return ERR_PTR(-ENOMEM); 5285 5286 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5287 5288 alloc_hint = get_extent_allocation_hint(inode, start, len); 5289 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, 5290 alloc_hint, (u64)-1, &ins, 1); 5291 if (ret) { 5292 em = ERR_PTR(ret); 5293 goto out;
5294 } 5295 5296 em = alloc_extent_map(GFP_NOFS); 5297 if (!em) { 5298 em = ERR_PTR(-ENOMEM); 5299 goto out; 5300 } 5301 5302 em->start = start; 5303 em->orig_start = em->start; 5304 em->len = ins.offset; 5305 5306 em->block_start = ins.objectid; 5307 em->block_len = ins.offset; 5308 em->bdev = root->fs_info->fs_devices->latest_bdev; 5309 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5310 5311 while (1) { 5312 write_lock(&em_tree->lock); 5313 ret = add_extent_mapping(em_tree, em); 5314 write_unlock(&em_tree->lock); 5315 if (ret != -EEXIST) 5316 break; 5317 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); 5318 } 5319 5320 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 5321 ins.offset, ins.offset, 0); 5322 if (ret) { 5323 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 5324 em = ERR_PTR(ret); 5325 } 5326 out: 5327 btrfs_end_transaction(trans, root); 5328 return em; 5329 } 5330 5331 /* 5332 * returns 1 when the nocow is safe, < 0 on error, 0 if the 5333 * block must be cow'd 5334 */ 5335 static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, 5336 struct inode *inode, u64 offset, u64 len) 5337 { 5338 struct btrfs_path *path; 5339 int ret; 5340 struct extent_buffer *leaf; 5341 struct btrfs_root *root = BTRFS_I(inode)->root; 5342 struct btrfs_file_extent_item *fi; 5343 struct btrfs_key key; 5344 u64 disk_bytenr; 5345 u64 backref_offset; 5346 u64 extent_end; 5347 u64 num_bytes; 5348 int slot; 5349 int found_type; 5350 5351 path = btrfs_alloc_path(); 5352 if (!path) 5353 return -ENOMEM; 5354 5355 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 5356 offset, 0); 5357 if (ret < 0) 5358 goto out; 5359 5360 slot = path->slots[0]; 5361 if (ret == 1) { 5362 if (slot == 0) { 5363 /* can't find the item, must cow */ 5364 ret = 0; 5365 goto out; 5366 } 5367 slot--; 5368 } 5369 ret = 0; 5370 leaf = path->nodes[0]; 5371 btrfs_item_key_to_cpu(leaf, &key, slot); 5372 if (key.objectid != inode->i_ino || 5373 key.type != BTRFS_EXTENT_DATA_KEY) { 5374 /* not our file or wrong item type, must cow */ 5375 goto out; 5376 } 5377 5378 if (key.offset > offset) { 5379 /* Wrong offset, must cow */ 5380 goto out; 5381 } 5382 5383 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 5384 found_type = btrfs_file_extent_type(leaf, fi); 5385 if (found_type != BTRFS_FILE_EXTENT_REG && 5386 found_type != BTRFS_FILE_EXTENT_PREALLOC) { 5387 /* not a regular extent, must cow */ 5388 goto out; 5389 } 5390 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 5391 backref_offset = btrfs_file_extent_offset(leaf, fi); 5392 5393 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 5394 if (extent_end < offset + len) { 5395 /* extent doesn't include our full range, must cow */ 5396 goto out; 5397 } 5398 5399 if (btrfs_extent_readonly(root, disk_bytenr)) 5400 goto out; 5401 5402 /* 5403 * look for other files referencing this extent, if we 5404 * find any we must cow 5405 */ 5406 if (btrfs_cross_ref_exist(trans, root, inode->i_ino, 5407 key.offset - backref_offset, disk_bytenr)) 5408 goto out; 5409 5410 /* 5411 * adjust disk_bytenr and num_bytes to cover just the bytes 5412 * in this extent we are about to write.
If there 5413 * are any csums in that range we have to cow in order 5414 * to keep the csums correct 5415 */ 5416 disk_bytenr += backref_offset; 5417 disk_bytenr += offset - key.offset; 5418 num_bytes = min(offset + len, extent_end) - offset; 5419 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 5420 goto out; 5421 /* 5422 * all of the above have passed, it is safe to overwrite this extent 5423 * without cow 5424 */ 5425 ret = 1; 5426 out: 5427 btrfs_free_path(path); 5428 return ret; 5429 } 5430 5431 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 5432 struct buffer_head *bh_result, int create) 5433 { 5434 struct extent_map *em; 5435 struct btrfs_root *root = BTRFS_I(inode)->root; 5436 u64 start = iblock << inode->i_blkbits; 5437 u64 len = bh_result->b_size; 5438 struct btrfs_trans_handle *trans; 5439 5440 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 5441 if (IS_ERR(em)) 5442 return PTR_ERR(em); 5443 5444 /* 5445 * Ok, for INLINE and COMPRESSED extents we need to fall back on buffered 5446 * io. INLINE is special, and we could probably kludge it in here, but 5447 * it's still buffered so for safety let's just fall back to the generic 5448 * buffered path. 5449 * 5450 * For COMPRESSED we _have_ to read the entire extent in so we can 5451 * decompress it, so there will be buffering required no matter what we 5452 * do, so go ahead and fall back to buffered. 5453 * 5454 * We return -ENOTBLK because that's what makes DIO go ahead and go back 5455 * to buffered IO. Don't blame me, this is the price we pay for using 5456 * the generic code. 5457 */ 5458 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 5459 em->block_start == EXTENT_MAP_INLINE) { 5460 free_extent_map(em); 5461 return -ENOTBLK; 5462 } 5463 5464 /* Just a good old fashioned hole, return */ 5465 if (!create && (em->block_start == EXTENT_MAP_HOLE || 5466 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 5467 free_extent_map(em); 5468 /* DIO will do one hole at a time, so just unlock a sector */ 5469 unlock_extent(&BTRFS_I(inode)->io_tree, start, 5470 start + root->sectorsize - 1, GFP_NOFS); 5471 return 0; 5472 } 5473 5474 /* 5475 * We don't allocate a new extent in the following cases 5476 * 5477 * 1) The inode is marked as NODATACOW. In this case we'll just use the 5478 * existing extent. 5479 * 2) The extent is marked as PREALLOC. We're good to go here and can 5480 * just use the extent.
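* In both cases can_nocow_odirect() below still has to verify that nothing * else references the extent and no csums cover it before the cow is * actually skipped.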
5481 * 5482 */ 5483 if (!create) { 5484 len = em->len - (start - em->start); 5485 goto map; 5486 } 5487 5488 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 5489 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 5490 em->block_start != EXTENT_MAP_HOLE)) { 5491 int type; 5492 int ret; 5493 u64 block_start; 5494 5495 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 5496 type = BTRFS_ORDERED_PREALLOC; 5497 else 5498 type = BTRFS_ORDERED_NOCOW; 5499 len = min(len, em->len - (start - em->start)); 5500 block_start = em->block_start + (start - em->start); 5501 5502 /* 5503 * we're not going to log anything, but we do need 5504 * to make sure the current transaction stays open 5505 * while we look for nocow cross refs 5506 */ 5507 trans = btrfs_join_transaction(root, 0); 5508 if (!trans) 5509 goto must_cow; 5510 5511 if (can_nocow_odirect(trans, inode, start, len) == 1) { 5512 ret = btrfs_add_ordered_extent_dio(inode, start, 5513 block_start, len, len, type); 5514 btrfs_end_transaction(trans, root); 5515 if (ret) { 5516 free_extent_map(em); 5517 return ret; 5518 } 5519 goto unlock; 5520 } 5521 btrfs_end_transaction(trans, root); 5522 } 5523 must_cow: 5524 /* 5525 * this will cow the extent, reset the len in case we changed 5526 * it above 5527 */ 5528 len = bh_result->b_size; 5529 free_extent_map(em); 5530 em = btrfs_new_extent_direct(inode, start, len); 5531 if (IS_ERR(em)) 5532 return PTR_ERR(em); 5533 len = min(len, em->len - (start - em->start)); 5534 unlock: 5535 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, 5536 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, 5537 0, NULL, GFP_NOFS); 5538 map: 5539 bh_result->b_blocknr = (em->block_start + (start - em->start)) >> 5540 inode->i_blkbits; 5541 bh_result->b_size = len; 5542 bh_result->b_bdev = em->bdev; 5543 set_buffer_mapped(bh_result); 5544 if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 5545 set_buffer_new(bh_result); 5546 5547 free_extent_map(em); 5548 5549 return 0; 5550 } 5551 5552 struct btrfs_dio_private { 5553 struct inode *inode; 5554 u64 logical_offset; 5555 u64 disk_bytenr; 5556 u64 bytes; 5557 u32 *csums; 5558 void *private; 5559 5560 /* number of bios pending for this dio */ 5561 atomic_t pending_bios; 5562 5563 /* IO errors */ 5564 int errors; 5565 5566 struct bio *orig_bio; 5567 }; 5568 5569 static void btrfs_endio_direct_read(struct bio *bio, int err) 5570 { 5571 struct btrfs_dio_private *dip = bio->bi_private; 5572 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 5573 struct bio_vec *bvec = bio->bi_io_vec; 5574 struct inode *inode = dip->inode; 5575 struct btrfs_root *root = BTRFS_I(inode)->root; 5576 u64 start; 5577 u32 *private = dip->csums; 5578 5579 start = dip->logical_offset; 5580 do { 5581 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 5582 struct page *page = bvec->bv_page; 5583 char *kaddr; 5584 u32 csum = ~(u32)0; 5585 unsigned long flags; 5586 5587 local_irq_save(flags); 5588 kaddr = kmap_atomic(page, KM_IRQ0); 5589 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, 5590 csum, bvec->bv_len); 5591 btrfs_csum_final(csum, (char *)&csum); 5592 kunmap_atomic(kaddr, KM_IRQ0); 5593 local_irq_restore(flags); 5594 5595 flush_dcache_page(bvec->bv_page); 5596 if (csum != *private) { 5597 printk(KERN_ERR "btrfs csum failed ino %lu off" 5598 " %llu csum %u private %u\n", 5599 inode->i_ino, (unsigned long long)start, 5600 csum, *private); 5601 err = -EIO; 5602 } 5603 } 5604 5605 start += bvec->bv_len; 5606 private++; 5607 bvec++; 5608 } while (bvec <= bvec_end); 5609 
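/* checksum verification is done (or err is set); unlock the file range and hand the bio back to the generic dio code */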
5610 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, 5611 dip->logical_offset + dip->bytes - 1, GFP_NOFS); 5612 bio->bi_private = dip->private; 5613 5614 kfree(dip->csums); 5615 kfree(dip); 5616 dio_end_io(bio, err); 5617 } 5618 5619 static void btrfs_endio_direct_write(struct bio *bio, int err) 5620 { 5621 struct btrfs_dio_private *dip = bio->bi_private; 5622 struct inode *inode = dip->inode; 5623 struct btrfs_root *root = BTRFS_I(inode)->root; 5624 struct btrfs_trans_handle *trans; 5625 struct btrfs_ordered_extent *ordered = NULL; 5626 struct extent_state *cached_state = NULL; 5627 u64 ordered_offset = dip->logical_offset; 5628 u64 ordered_bytes = dip->bytes; 5629 int ret; 5630 5631 if (err) 5632 goto out_done; 5633 again: 5634 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, 5635 &ordered_offset, 5636 ordered_bytes); 5637 if (!ret) 5638 goto out_test; 5639 5640 BUG_ON(!ordered); 5641 5642 trans = btrfs_join_transaction(root, 1); 5643 if (!trans) { 5644 err = -ENOMEM; 5645 goto out; 5646 } 5647 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5648 5649 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5650 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5651 if (!ret) 5652 ret = btrfs_update_inode(trans, root, inode); 5653 err = ret; 5654 goto out; 5655 } 5656 5657 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5658 ordered->file_offset + ordered->len - 1, 0, 5659 &cached_state, GFP_NOFS); 5660 5661 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { 5662 ret = btrfs_mark_extent_written(trans, inode, 5663 ordered->file_offset, 5664 ordered->file_offset + 5665 ordered->len); 5666 if (ret) { 5667 err = ret; 5668 goto out_unlock; 5669 } 5670 } else { 5671 ret = insert_reserved_file_extent(trans, inode, 5672 ordered->file_offset, 5673 ordered->start, 5674 ordered->disk_len, 5675 ordered->len, 5676 ordered->len, 5677 0, 0, 0, 5678 BTRFS_FILE_EXTENT_REG); 5679 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 5680 ordered->file_offset, ordered->len); 5681 if (ret) { 5682 err = ret; 5683 WARN_ON(1); 5684 goto out_unlock; 5685 } 5686 } 5687 5688 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5689 btrfs_ordered_update_i_size(inode, 0, ordered); 5690 btrfs_update_inode(trans, root, inode); 5691 out_unlock: 5692 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5693 ordered->file_offset + ordered->len - 1, 5694 &cached_state, GFP_NOFS); 5695 out: 5696 btrfs_delalloc_release_metadata(inode, ordered->len); 5697 btrfs_end_transaction(trans, root); 5698 ordered_offset = ordered->file_offset + ordered->len; 5699 btrfs_put_ordered_extent(ordered); 5700 btrfs_put_ordered_extent(ordered); 5701 5702 out_test: 5703 /* 5704 * our bio might span multiple ordered extents. 
If we haven't 5705 * completed the accounting for the whole dio, go back and try again 5706 */ 5707 if (ordered_offset < dip->logical_offset + dip->bytes) { 5708 ordered_bytes = dip->logical_offset + dip->bytes - 5709 ordered_offset; 5710 goto again; 5711 } 5712 out_done: 5713 bio->bi_private = dip->private; 5714 5715 kfree(dip->csums); 5716 kfree(dip); 5717 dio_end_io(bio, err); 5718 } 5719 5720 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, 5721 struct bio *bio, int mirror_num, 5722 unsigned long bio_flags, u64 offset) 5723 { 5724 int ret; 5725 struct btrfs_root *root = BTRFS_I(inode)->root; 5726 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); 5727 BUG_ON(ret); 5728 return 0; 5729 } 5730 5731 static void btrfs_end_dio_bio(struct bio *bio, int err) 5732 { 5733 struct btrfs_dio_private *dip = bio->bi_private; 5734 5735 if (err) { 5736 printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu " 5737 "sector %#Lx len %u err no %d\n", 5738 dip->inode->i_ino, bio->bi_rw, 5739 (unsigned long long)bio->bi_sector, bio->bi_size, err); 5740 dip->errors = 1; 5741 5742 /* 5743 * before the atomic variable goes to zero, we must make sure 5744 * dip->errors is perceived to be set. 5745 */ 5746 smp_mb__before_atomic_dec(); 5747 } 5748 5749 /* if there are more bios still pending for this dio, just exit */ 5750 if (!atomic_dec_and_test(&dip->pending_bios)) 5751 goto out; 5752 5753 if (dip->errors) 5754 bio_io_error(dip->orig_bio); 5755 else { 5756 set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); 5757 bio_endio(dip->orig_bio, 0); 5758 } 5759 out: 5760 bio_put(bio); 5761 } 5762 5763 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, 5764 u64 first_sector, gfp_t gfp_flags) 5765 { 5766 int nr_vecs = bio_get_nr_vecs(bdev); 5767 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); 5768 } 5769 5770 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 5771 int rw, u64 file_offset, int skip_sum, 5772 u32 *csums) 5773 { 5774 int write = rw & REQ_WRITE; 5775 struct btrfs_root *root = BTRFS_I(inode)->root; 5776 int ret; 5777 5778 bio_get(bio); 5779 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 5780 if (ret) 5781 goto err; 5782 5783 if (write && !skip_sum) { 5784 ret = btrfs_wq_submit_bio(root->fs_info, 5785 inode, rw, bio, 0, 0, 5786 file_offset, 5787 __btrfs_submit_bio_start_direct_io, 5788 __btrfs_submit_bio_done); 5789 goto err; 5790 } else if (!skip_sum) 5791 btrfs_lookup_bio_sums_dio(root, inode, bio, 5792 file_offset, csums); 5793 5794 ret = btrfs_map_bio(root, rw, bio, 0, 1); 5795 err: 5796 bio_put(bio); 5797 return ret; 5798 } 5799 5800 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, 5801 int skip_sum) 5802 { 5803 struct inode *inode = dip->inode; 5804 struct btrfs_root *root = BTRFS_I(inode)->root; 5805 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 5806 struct bio *bio; 5807 struct bio *orig_bio = dip->orig_bio; 5808 struct bio_vec *bvec = orig_bio->bi_io_vec; 5809 u64 start_sector = orig_bio->bi_sector; 5810 u64 file_offset = dip->logical_offset; 5811 u64 submit_len = 0; 5812 u64 map_length; 5813 int nr_pages = 0; 5814 u32 *csums = dip->csums; 5815 int ret = 0; 5816 5817 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 5818 if (!bio) 5819 return -ENOMEM; 5820 bio->bi_private = dip; 5821 bio->bi_end_io = btrfs_end_dio_bio; 5822 atomic_inc(&dip->pending_bios); 5823 5824 map_length = orig_bio->bi_size; 5825 ret = btrfs_map_block(map_tree, READ, start_sector <<
9, 5826 &map_length, NULL, 0); 5827 if (ret) { 5828 bio_put(bio); 5829 return -EIO; 5830 } 5831 5832 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 5833 if (unlikely(map_length < submit_len + bvec->bv_len || 5834 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 5835 bvec->bv_offset) < bvec->bv_len)) { 5836 /* 5837 * inc the count before we submit the bio so we know 5838 * the end IO handler won't run until after the inc. 5839 * Otherwise, the dip might get freed 5840 * before we're done setting it up 5841 */ 5842 atomic_inc(&dip->pending_bios); 5843 ret = __btrfs_submit_dio_bio(bio, inode, rw, 5844 file_offset, skip_sum, 5845 csums); 5846 if (ret) { 5847 bio_put(bio); 5848 atomic_dec(&dip->pending_bios); 5849 goto out_err; 5850 } 5851 5852 if (!skip_sum) 5853 csums = csums + nr_pages; 5854 start_sector += submit_len >> 9; 5855 file_offset += submit_len; 5856 5857 submit_len = 0; 5858 nr_pages = 0; 5859 5860 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, 5861 start_sector, GFP_NOFS); 5862 if (!bio) 5863 goto out_err; 5864 bio->bi_private = dip; 5865 bio->bi_end_io = btrfs_end_dio_bio; 5866 5867 map_length = orig_bio->bi_size; 5868 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 5869 &map_length, NULL, 0); 5870 if (ret) { 5871 bio_put(bio); 5872 goto out_err; 5873 } 5874 } else { 5875 submit_len += bvec->bv_len; 5876 nr_pages++; 5877 bvec++; 5878 } 5879 } 5880 5881 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, 5882 csums); 5883 if (!ret) 5884 return 0; 5885 5886 bio_put(bio); 5887 out_err: 5888 dip->errors = 1; 5889 /* 5890 * before the atomic variable goes to zero, we must 5891 * make sure dip->errors is perceived to be set. 5892 */ 5893 smp_mb__before_atomic_dec(); 5894 if (atomic_dec_and_test(&dip->pending_bios)) 5895 bio_io_error(dip->orig_bio); 5896 5897 /* bio_end_io() will handle error, so we needn't return it */ 5898 return 0; 5899 } 5900 5901 static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, 5902 loff_t file_offset) 5903 { 5904 struct btrfs_root *root = BTRFS_I(inode)->root; 5905 struct btrfs_dio_private *dip; 5906 struct bio_vec *bvec = bio->bi_io_vec; 5907 int skip_sum; 5908 int write = rw & REQ_WRITE; 5909 int ret = 0; 5910 5911 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 5912 5913 dip = kmalloc(sizeof(*dip), GFP_NOFS); 5914 if (!dip) { 5915 ret = -ENOMEM; 5916 goto free_ordered; 5917 } 5918 dip->csums = NULL; 5919 5920 if (!skip_sum) { 5921 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); 5922 if (!dip->csums) { 5923 ret = -ENOMEM; 5924 goto free_ordered; 5925 } 5926 } 5927 5928 dip->private = bio->bi_private; 5929 dip->inode = inode; 5930 dip->logical_offset = file_offset; 5931 5932 dip->bytes = 0; 5933 do { 5934 dip->bytes += bvec->bv_len; 5935 bvec++; 5936 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); 5937 5938 dip->disk_bytenr = (u64)bio->bi_sector << 9; 5939 bio->bi_private = dip; 5940 dip->errors = 0; 5941 dip->orig_bio = bio; 5942 atomic_set(&dip->pending_bios, 0); 5943 5944 if (write) 5945 bio->bi_end_io = btrfs_endio_direct_write; 5946 else 5947 bio->bi_end_io = btrfs_endio_direct_read; 5948 5949 ret = btrfs_submit_direct_hook(rw, dip, skip_sum); 5950 if (!ret) 5951 return; 5952 free_ordered: 5953 /* 5954 * If this is a write, we need to clean up the reserved space and kill 5955 * the ordered extent.
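* (the ordered extent was created back when the blocks for this dio * were mapped, so it exists even though no IO has completed)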
static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
				loff_t file_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_dio_private *dip;
	struct bio_vec *bvec = bio->bi_io_vec;
	int skip_sum;
	int write = rw & REQ_WRITE;
	int ret = 0;

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	dip = kmalloc(sizeof(*dip), GFP_NOFS);
	if (!dip) {
		ret = -ENOMEM;
		goto free_ordered;
	}
	dip->csums = NULL;

	if (!skip_sum) {
		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
		if (!dip->csums) {
			ret = -ENOMEM;
			goto free_ordered;
		}
	}

	dip->private = bio->bi_private;
	dip->inode = inode;
	dip->logical_offset = file_offset;

	dip->bytes = 0;
	do {
		dip->bytes += bvec->bv_len;
		bvec++;
	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));

	dip->disk_bytenr = (u64)bio->bi_sector << 9;
	bio->bi_private = dip;
	dip->errors = 0;
	dip->orig_bio = bio;
	atomic_set(&dip->pending_bios, 0);

	if (write)
		bio->bi_end_io = btrfs_endio_direct_write;
	else
		bio->bi_end_io = btrfs_endio_direct_read;

	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
	if (!ret)
		return;
free_ordered:
	/*
	 * If this is a write, we need to clean up the reserved space and kill
	 * the ordered extent.
	 */
	if (write) {
		struct btrfs_ordered_extent *ordered;
		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
			btrfs_free_reserved_extent(root, ordered->start,
						   ordered->disk_len);
		/*
		 * put twice: once for the ref the lookup took, once to drop
		 * the ordered extent's base ref, since its IO will never run
		 */
		btrfs_put_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
	}
	bio_endio(bio, ret);
}

static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
			const struct iovec *iov, loff_t offset,
			unsigned long nr_segs)
{
	int seg;
	size_t size;
	unsigned long addr;
	unsigned blocksize_mask = root->sectorsize - 1;
	ssize_t retval = -EINVAL;
	loff_t end = offset;

	if (offset & blocksize_mask)
		goto out;

	/* Check the memory alignment.  Blocks cannot straddle pages */
	for (seg = 0; seg < nr_segs; seg++) {
		addr = (unsigned long)iov[seg].iov_base;
		size = iov[seg].iov_len;
		end += size;
		if ((addr & blocksize_mask) || (size & blocksize_mask))
			goto out;
	}
	retval = 0;
out:
	return retval;
}
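/*
 * For reference, a hypothetical userspace caller that satisfies the
 * alignment rules check_direct_IO() enforces: file offset, buffer
 * address and length all aligned to the fs sector size.  Misaligned
 * requests make check_direct_IO() fail, and btrfs_direct_IO() below
 * then returns 0 so the VFS falls back to buffered IO.  Illustrative
 * only, not part of this file; assumes a 4096-byte sectorsize.
 */
#if 0
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

static ssize_t demo_dio_read(const char *path)
{
	void *buf;
	int fd = open(path, O_RDONLY | O_DIRECT);
	ssize_t n = -1;

	if (fd < 0)
		return -1;
	/* aligned buffer; a misaligned iovec would force buffered IO */
	if (posix_memalign(&buf, 4096, 4096) == 0) {
		n = pread(fd, buf, 4096, 0);	/* aligned offset + length */
		free(buf);
	}
	close(fd);
	return n;
}
#endif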
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
			const struct iovec *iov, loff_t offset,
			unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	u64 lockstart, lockend;
	ssize_t ret;
	int writing = rw & WRITE;
	int write_bits = 0;
	size_t count = iov_length(iov, nr_segs);

	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
			    offset, nr_segs)) {
		return 0;
	}

	lockstart = offset;
	lockend = offset + count - 1;

	if (writing) {
		ret = btrfs_delalloc_reserve_space(inode, count);
		if (ret)
			goto out;
	}

	while (1) {
		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				 0, &cached_state, GFP_NOFS);
		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there are no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(inode, lockstart,
						     lockend - lockstart + 1);
		if (!ordered)
			break;
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				     &cached_state, GFP_NOFS);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}

	/*
	 * we don't use btrfs_set_extent_delalloc because we don't want
	 * the dirty or uptodate bits
	 */
	if (writing) {
		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				     EXTENT_DELALLOC, 0, NULL, &cached_state,
				     GFP_NOFS);
		if (ret) {
			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
					 lockend, EXTENT_LOCKED | write_bits,
					 1, 0, &cached_state, GFP_NOFS);
			goto out;
		}
	}

	free_extent_state(cached_state);
	cached_state = NULL;

	ret = __blockdev_direct_IO(rw, iocb, inode,
		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
		   btrfs_submit_direct, 0);

	if (ret < 0 && ret != -EIOCBQUEUED) {
		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
				 offset + iov_length(iov, nr_segs) - 1,
				 EXTENT_LOCKED | write_bits, 1, 0,
				 &cached_state, GFP_NOFS);
	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
		/*
		 * We're falling back to buffered, unlock the section we didn't
		 * do IO on.
		 */
		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
				 offset + iov_length(iov, nr_segs) - 1,
				 EXTENT_LOCKED | write_bits, 1, 0,
				 &cached_state, GFP_NOFS);
	}
out:
	free_extent_state(cached_state);
	return ret;
}
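/*
 * The wait-and-retry loop above is a recurring pattern in this file:
 * take the range lock, look for a conflicting in-flight ordered extent,
 * and if one is found drop the lock, wait it out, and retry.  A
 * stripped-down model with hypothetical lock helpers (illustrative
 * only, not built):
 */
#if 0
extern void demo_lock_range(struct inode *inode, u64 start, u64 end);
extern void demo_unlock_range(struct inode *inode, u64 start, u64 end);

static void demo_lock_range_no_ordered(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_ordered_extent *ordered;

	for (;;) {
		demo_lock_range(inode, start, end);
		ordered = btrfs_lookup_ordered_range(inode, start,
						     end - start + 1);
		if (!ordered)
			return;		/* locked, and range is quiescent */
		demo_unlock_range(inode, start, end);
		btrfs_start_ordered_extent(inode, ordered, 1);	/* wait */
		btrfs_put_ordered_extent(ordered);
	}
}
#endif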
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		__u64 start, __u64 len)
{
	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
}

int btrfs_readpage(struct file *file, struct page *page)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(page->mapping->host)->io_tree;
	return extent_read_full_page(tree, page, btrfs_get_extent);
}

static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct extent_io_tree *tree;

	if (current->flags & PF_MEMALLOC) {
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return 0;
	}
	tree = &BTRFS_I(page->mapping->host)->io_tree;
	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
}

int btrfs_writepages(struct address_space *mapping,
		     struct writeback_control *wbc)
{
	struct extent_io_tree *tree;

	tree = &BTRFS_I(mapping->host)->io_tree;
	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
}

static int
btrfs_readpages(struct file *file, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(mapping->host)->io_tree;
	return extent_readpages(tree, mapping, pages, nr_pages,
				btrfs_get_extent);
}

static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
	struct extent_io_tree *tree;
	struct extent_map_tree *map;
	int ret;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	map = &BTRFS_I(page->mapping->host)->extent_tree;
	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
	if (ret == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		page_cache_release(page);
	}
	return ret;
}

static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
	if (PageWriteback(page) || PageDirty(page))
		return 0;
	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
}

static void btrfs_invalidatepage(struct page *page, unsigned long offset)
{
	struct extent_io_tree *tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	u64 page_start = page_offset(page);
	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;

	/*
	 * we have the page locked, so new writeback can't start,
	 * and the dirty bit won't be cleared while we are here.
	 *
	 * Wait for IO on this page so that we can safely clear
	 * the PagePrivate2 bit and do ordered accounting
	 */
	wait_on_page_writeback(page);

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	if (offset) {
		btrfs_releasepage(page, GFP_NOFS);
		return;
	}
	lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
			 GFP_NOFS);
	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
					      page_offset(page));
	if (ordered) {
		/*
		 * IO on this page will never be started, so we need
		 * to account for any ordered extents now
		 */
		clear_extent_bit(tree, page_start, page_end,
				 EXTENT_DIRTY | EXTENT_DELALLOC |
				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
				 &cached_state, GFP_NOFS);
		/*
		 * whoever cleared the private bit is responsible
		 * for the finish_ordered_io
		 */
		if (TestClearPagePrivate2(page)) {
			btrfs_finish_ordered_io(page->mapping->host,
						page_start, page_end);
		}
		btrfs_put_ordered_extent(ordered);
		cached_state = NULL;
		lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
				 GFP_NOFS);
	}
	clear_extent_bit(tree, page_start, page_end,
			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
			 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
	__btrfs_releasepage(page, GFP_NOFS);

	ClearPageChecked(page);
	if (PagePrivate(page)) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		page_cache_release(page);
	}
}
/*
 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied.  Hence we
 * must be careful to check for EOF conditions here.  We set the page up
 * correctly for a written page, which means we get ENOSPC checking when
 * writing into holes and correct delalloc and unwritten extent mapping on
 * filesystems that support these features.
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.
 * Because vmtruncate() writes the inode size before removing pages, once we
 * have the page lock we can determine safely if the page is beyond EOF.  If
 * it is not beyond EOF, then the page is guaranteed safe against truncation
 * until we unlock the page.
 */
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = fdentry(vma->vm_file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	char *kaddr;
	unsigned long zero_start;
	loff_t size;
	int ret;
	u64 page_start;
	u64 page_end;

	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
	if (ret) {
		if (ret == -ENOMEM)
			ret = VM_FAULT_OOM;
		else /* -ENOSPC, -EIO, etc */
			ret = VM_FAULT_SIGBUS;
		goto out;
	}

	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
	lock_page(page);
	size = i_size_read(inode);
	page_start = page_offset(page);
	page_end = page_start + PAGE_CACHE_SIZE - 1;

	if ((page->mapping != inode->i_mapping) ||
	    (page_start >= size)) {
		/* page got truncated out from underneath us */
		goto out_unlock;
	}
	wait_on_page_writeback(page);

	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
			 GFP_NOFS);
	set_page_extent_mapped(page);

	/*
	 * we can't set the delalloc bits if there are pending ordered
	 * extents.  Drop our locks and wait for them to finish
	 */
	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered) {
		unlock_extent_cached(io_tree, page_start, page_end,
				     &cached_state, GFP_NOFS);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	/*
	 * XXX - page_mkwrite gets called every time the page is dirtied, even
	 * if it was already dirty, so for space accounting reasons we need to
	 * clear any delalloc bits for the range we are about to save.  There
	 * is probably a better way to do this, but for now keep it consistent
	 * with prepare_pages in the normal write path.
	 */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
			 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
			 0, 0, &cached_state, GFP_NOFS);

	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
					&cached_state);
	if (ret) {
		unlock_extent_cached(io_tree, page_start, page_end,
				     &cached_state, GFP_NOFS);
		ret = VM_FAULT_SIGBUS;
		goto out_unlock;
	}
	ret = 0;

	/* page is wholly or partially inside EOF */
	if (page_start + PAGE_CACHE_SIZE > size)
		zero_start = size & ~PAGE_CACHE_MASK;
	else
		zero_start = PAGE_CACHE_SIZE;

	if (zero_start != PAGE_CACHE_SIZE) {
		kaddr = kmap(page);
		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
		flush_dcache_page(page);
		kunmap(page);
	}
	ClearPageChecked(page);
	set_page_dirty(page);
	SetPageUptodate(page);

	BTRFS_I(inode)->last_trans = root->fs_info->generation;
	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;

	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);

out_unlock:
	if (!ret)
		return VM_FAULT_LOCKED;
	unlock_page(page);
	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
out:
	return ret;
}
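/*
 * For context, a hypothetical userspace sequence that reaches
 * btrfs_page_mkwrite(): the first store into a clean MAP_SHARED page
 * faults, and a full filesystem then surfaces as SIGBUS at the fault
 * rather than as a failed write().  Illustrative only, not built.
 */
#if 0
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

static int demo_dirty_mapped_page(const char *path)
{
	int fd = open(path, O_RDWR);
	char *p;

	if (fd < 0)
		return -1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		close(fd);
		return -1;
	}
	p[0] = 'x';	/* first write fault -> ->page_mkwrite() */
	munmap(p, 4096);
	close(fd);
	return 0;
}
#endif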
static void btrfs_truncate(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;
	struct btrfs_trans_handle *trans;
	unsigned long nr;
	u64 mask = root->sectorsize - 1;

	if (!S_ISREG(inode->i_mode)) {
		WARN_ON(1);
		return;
	}

	ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
	if (ret)
		return;

	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);

	trans = btrfs_start_transaction(root, 0);
	BUG_ON(IS_ERR(trans));
	btrfs_set_trans_block_group(trans, inode);
	trans->block_rsv = root->orphan_block_rsv;

	/*
	 * setattr is responsible for setting the ordered_data_close flag,
	 * but that is only tested during the last file release.  That
	 * could happen well after the next commit, leaving a great big
	 * window where new writes may get lost if someone chooses to write
	 * to this file after truncating to zero.
	 *
	 * The inode doesn't have any dirty data here, and so if we commit
	 * this is a noop.  If someone immediately starts writing to the inode
	 * it is very likely we'll catch some of their writes in this
	 * transaction, and the commit will find this file on the ordered
	 * data list with good things to send down.
	 *
	 * This is a best effort solution, there is still a window where
	 * using truncate to replace the contents of the file will
	 * end up with a zero length file after a crash.
	 */
	if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
		btrfs_add_ordered_operation(trans, root, inode);

	while (1) {
		if (!trans) {
			trans = btrfs_start_transaction(root, 0);
			BUG_ON(IS_ERR(trans));
			btrfs_set_trans_block_group(trans, inode);
			trans->block_rsv = root->orphan_block_rsv;
		}

		ret = btrfs_block_rsv_check(trans, root,
					    root->orphan_block_rsv, 0, 5);
		if (ret) {
			BUG_ON(ret != -EAGAIN);
			ret = btrfs_commit_transaction(trans, root);
			BUG_ON(ret);
			trans = NULL;
			continue;
		}

		ret = btrfs_truncate_inode_items(trans, root, inode,
						 inode->i_size,
						 BTRFS_EXTENT_DATA_KEY);
		if (ret != -EAGAIN)
			break;

		ret = btrfs_update_inode(trans, root, inode);
		BUG_ON(ret);

		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		trans = NULL;
		btrfs_btree_balance_dirty(root, nr);
	}

	if (ret == 0 && inode->i_nlink > 0) {
		ret = btrfs_orphan_del(trans, inode);
		BUG_ON(ret);
	}

	ret = btrfs_update_inode(trans, root, inode);
	BUG_ON(ret);

	nr = trans->blocks_used;
	ret = btrfs_end_transaction_throttle(trans, root);
	BUG_ON(ret);
	btrfs_btree_balance_dirty(root, nr);
}
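/*
 * The loop above leans on btrfs_truncate_inode_items() returning -EAGAIN
 * when it wants the caller to end the current transaction and retry in a
 * fresh one.  The control flow, reduced to its skeleton with hypothetical
 * helpers (illustrative only, not built):
 */
#if 0
struct demo_ctx;
extern void demo_begin_trans(struct demo_ctx *ctx);	/* hypothetical */
extern void demo_end_trans(struct demo_ctx *ctx);	/* hypothetical */
extern int demo_drop_some_items(struct demo_ctx *ctx);	/* hypothetical */

static int demo_truncate_in_chunks(struct demo_ctx *ctx)
{
	int ret;

	for (;;) {
		demo_begin_trans(ctx);
		ret = demo_drop_some_items(ctx);
		demo_end_trans(ctx);
		if (ret != -EAGAIN)
			return ret;	/* done, or a real error */
		/* -EAGAIN: more items remain; retry in a new transaction */
	}
}
#endif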
/*
 * create a new subvolume directory/inode (helper for the ioctl).
 */
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
			     struct btrfs_root *new_root,
			     u64 new_dirid, u64 alloc_hint)
{
	struct inode *inode;
	int err;
	u64 index = 0;

	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
				new_dirid, alloc_hint, S_IFDIR | 0700, &index);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	inode->i_op = &btrfs_dir_inode_operations;
	inode->i_fop = &btrfs_dir_file_operations;

	inode->i_nlink = 1;
	btrfs_i_size_write(inode, 0);

	err = btrfs_update_inode(trans, new_root, inode);
	BUG_ON(err);

	iput(inode);
	return 0;
}
/* helper function for file defrag and space balancing.  This
 * forces readahead on a given range of bytes in an inode
 */
unsigned long btrfs_force_ra(struct address_space *mapping,
			     struct file_ra_state *ra, struct file *file,
			     pgoff_t offset, pgoff_t last_index)
{
	pgoff_t req_size = last_index - offset + 1;

	page_cache_sync_readahead(mapping, ra, file, offset, req_size);
	return offset + req_size;
}

struct inode *btrfs_alloc_inode(struct super_block *sb)
{
	struct btrfs_inode *ei;
	struct inode *inode;

	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
	if (!ei)
		return NULL;

	ei->root = NULL;
	ei->space_info = NULL;
	ei->generation = 0;
	ei->sequence = 0;
	ei->last_trans = 0;
	ei->last_sub_trans = 0;
	ei->logged_trans = 0;
	ei->delalloc_bytes = 0;
	ei->reserved_bytes = 0;
	ei->disk_i_size = 0;
	ei->flags = 0;
	ei->index_cnt = (u64)-1;
	ei->last_unlink_trans = 0;

	spin_lock_init(&ei->accounting_lock);
	atomic_set(&ei->outstanding_extents, 0);
	ei->reserved_extents = 0;

	ei->ordered_data_close = 0;
	ei->orphan_meta_reserved = 0;
	ei->dummy_inode = 0;
	ei->force_compress = BTRFS_COMPRESS_NONE;

	inode = &ei->vfs_inode;
	extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
	extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
	mutex_init(&ei->log_mutex);
	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
	INIT_LIST_HEAD(&ei->i_orphan);
	INIT_LIST_HEAD(&ei->delalloc_inodes);
	INIT_LIST_HEAD(&ei->ordered_operations);
	RB_CLEAR_NODE(&ei->rb_node);

	return inode;
}

static void btrfs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
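/*
 * btrfs_i_callback() is the second half of the usual RCU-deferred free
 * pattern: btrfs_destroy_inode() below queues the callback, and the slab
 * object is only freed once a grace period guarantees no lock-free walker
 * can still see it.  The shape of the pattern with a hypothetical object
 * type (illustrative only, not built):
 */
#if 0
struct demo_obj {
	struct rcu_head rcu;
	/* ... payload read under rcu_read_lock() ... */
};

static void demo_obj_rcu_free(struct rcu_head *head)
{
	kfree(container_of(head, struct demo_obj, rcu));
}

static void demo_obj_release(struct demo_obj *obj)
{
	/* readers that already hold a pointer finish before the free */
	call_rcu(&obj->rcu, demo_obj_rcu_free);
}
#endif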
void btrfs_destroy_inode(struct inode *inode)
{
	struct btrfs_ordered_extent *ordered;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	WARN_ON(!list_empty(&inode->i_dentry));
	WARN_ON(inode->i_data.nrpages);
	WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
	WARN_ON(BTRFS_I(inode)->reserved_extents);

	/*
	 * This can happen where we create an inode, but somebody else also
	 * created the same inode and we need to destroy the one we already
	 * created.
	 */
	if (!root)
		goto free;

	/*
	 * Make sure we're properly removed from the ordered operation
	 * lists.
	 */
	smp_mb();
	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
		spin_lock(&root->fs_info->ordered_extent_lock);
		list_del_init(&BTRFS_I(inode)->ordered_operations);
		spin_unlock(&root->fs_info->ordered_extent_lock);
	}

	if (root == root->fs_info->tree_root) {
		struct btrfs_block_group_cache *block_group;

		block_group = btrfs_lookup_block_group(root->fs_info,
						BTRFS_I(inode)->block_group);
		if (block_group && block_group->inode == inode) {
			spin_lock(&block_group->lock);
			block_group->inode = NULL;
			spin_unlock(&block_group->lock);
			btrfs_put_block_group(block_group);
		} else if (block_group) {
			btrfs_put_block_group(block_group);
		}
	}

	spin_lock(&root->orphan_lock);
	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
		printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
		       inode->i_ino);
		list_del_init(&BTRFS_I(inode)->i_orphan);
	}
	spin_unlock(&root->orphan_lock);

	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
		if (!ordered)
			break;
		else {
			printk(KERN_ERR "btrfs found ordered "
			       "extent %llu %llu on inode cleanup\n",
			       (unsigned long long)ordered->file_offset,
			       (unsigned long long)ordered->len);
			btrfs_remove_ordered_extent(inode, ordered);
			/* one put for the lookup ref, one for the base ref */
			btrfs_put_ordered_extent(ordered);
			btrfs_put_ordered_extent(ordered);
		}
	}
	inode_tree_del(inode);
	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
free:
	call_rcu(&inode->i_rcu, btrfs_i_callback);
}

int btrfs_drop_inode(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (btrfs_root_refs(&root->root_item) == 0 &&
	    root != root->fs_info->tree_root)
		return 1;
	else
		return generic_drop_inode(inode);
}

static void init_once(void *foo)
{
	struct btrfs_inode *ei = (struct btrfs_inode *) foo;

	inode_init_once(&ei->vfs_inode);
}

void btrfs_destroy_cachep(void)
{
	if (btrfs_inode_cachep)
		kmem_cache_destroy(btrfs_inode_cachep);
	if (btrfs_trans_handle_cachep)
		kmem_cache_destroy(btrfs_trans_handle_cachep);
	if (btrfs_transaction_cachep)
		kmem_cache_destroy(btrfs_transaction_cachep);
	if (btrfs_path_cachep)
		kmem_cache_destroy(btrfs_path_cachep);
}

int btrfs_init_cachep(void)
{
	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
			sizeof(struct btrfs_inode), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
	if (!btrfs_inode_cachep)
		goto fail;

	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
			sizeof(struct btrfs_trans_handle), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_trans_handle_cachep)
		goto fail;

	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
			sizeof(struct btrfs_transaction), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_transaction_cachep)
		goto fail;

	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
			sizeof(struct btrfs_path), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_path_cachep)
		goto fail;

	return 0;
fail:
	btrfs_destroy_cachep();
	return -ENOMEM;
}
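/*
 * Note the constructor passed for the inode cache above: a slab ctor runs
 * when the allocator first populates an object, not on every
 * kmem_cache_alloc(), so only state that survives free/realloc cycles
 * (list heads, locks, the embedded VFS inode) belongs in it.  The minimal
 * shape of the idiom, with a hypothetical cache (illustrative only, not
 * built):
 */
#if 0
struct demo_item {
	struct list_head list;	/* initialized once, by the ctor */
	int scratch;		/* must be (re)set after every alloc */
};

static void demo_item_ctor(void *p)
{
	INIT_LIST_HEAD(&((struct demo_item *)p)->list);
}

static struct kmem_cache *demo_cachep;	/* hypothetical cache */

static int demo_cache_setup(void)
{
	demo_cachep = kmem_cache_create("demo_item", sizeof(struct demo_item),
					0, 0, demo_item_ctor);
	return demo_cachep ? 0 : -ENOMEM;
}
#endif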
static int btrfs_getattr(struct vfsmount *mnt,
			 struct dentry *dentry, struct kstat *stat)
{
	struct inode *inode = dentry->d_inode;
	generic_fillattr(inode, stat);
	stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
	stat->blksize = PAGE_CACHE_SIZE;
	stat->blocks = (inode_get_bytes(inode) +
			BTRFS_I(inode)->delalloc_bytes) >> 9;
	return 0;
}
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
			struct inode *new_dir, struct dentry *new_dentry)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = new_dentry->d_inode;
	struct inode *old_inode = old_dentry->d_inode;
	struct timespec ctime = CURRENT_TIME;
	u64 index = 0;
	u64 root_objectid;
	int ret;

	if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
		return -EPERM;

	/* we only allow rename subvolume link between subvolumes */
	if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
		return -EXDEV;

	if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
	    (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
		return -ENOTEMPTY;

	if (S_ISDIR(old_inode->i_mode) && new_inode &&
	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
		return -ENOTEMPTY;
	/*
	 * we're using rename to replace one file with another, and the
	 * replacement file is large.  Start IO on it now so we don't add
	 * too much work to the end of the transaction
	 */
	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
		filemap_flush(old_inode->i_mapping);

	/* close the racy window with snapshot create/destroy ioctl */
	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
		down_read(&root->fs_info->subvol_sem);
	/*
	 * We want to reserve the absolute worst case amount of items.  So if
	 * both inodes are subvols and we need to unlink them then that would
	 * require 4 item modifications, but if they are both normal inodes it
	 * would require 5 item modifications, so we'll assume they're normal
	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
	 * should cover the worst case number of items we'll modify.
	 */
	trans = btrfs_start_transaction(root, 20);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	btrfs_set_trans_block_group(trans, new_dir);

	if (dest != root)
		btrfs_record_root_in_trans(trans, dest);

	ret = btrfs_set_inode_index(new_dir, &index);
	if (ret)
		goto out_fail;

	if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		/* force full log commit if subvolume involved. */
		root->fs_info->last_trans_log_full_commit = trans->transid;
	} else {
		ret = btrfs_insert_inode_ref(trans, dest,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len,
					     old_inode->i_ino,
					     new_dir->i_ino, index);
		if (ret)
			goto out_fail;
		/*
		 * this is an ugly little race, but the rename is required
		 * to make sure that if we crash, the inode is either at the
		 * old name or the new one.  pinning the log transaction lets
		 * us make sure we don't allow a log commit to come in after
		 * we unlink the name but before we add the new name back in.
		 */
		btrfs_pin_log_trans(root);
	}
	/*
	 * make sure the inode gets flushed if it is replacing
	 * something.
	 */
	if (new_inode && new_inode->i_size &&
	    old_inode && S_ISREG(old_inode->i_mode)) {
		btrfs_add_ordered_operation(trans, root, old_inode);
	}

	old_dir->i_ctime = old_dir->i_mtime = ctime;
	new_dir->i_ctime = new_dir->i_mtime = ctime;
	old_inode->i_ctime = ctime;

	if (old_dentry->d_parent != new_dentry->d_parent)
		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);

	if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
					  old_dentry->d_name.name,
					  old_dentry->d_name.len);
	} else {
		btrfs_inc_nlink(old_dentry->d_inode);
		ret = btrfs_unlink_inode(trans, root, old_dir,
					 old_dentry->d_inode,
					 old_dentry->d_name.name,
					 old_dentry->d_name.len);
	}
	BUG_ON(ret);

	if (new_inode) {
		new_inode->i_ctime = CURRENT_TIME;
		if (unlikely(new_inode->i_ino ==
			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
			root_objectid = BTRFS_I(new_inode)->location.objectid;
			ret = btrfs_unlink_subvol(trans, dest, new_dir,
						  root_objectid,
						  new_dentry->d_name.name,
						  new_dentry->d_name.len);
			BUG_ON(new_inode->i_nlink == 0);
		} else {
			ret = btrfs_unlink_inode(trans, dest, new_dir,
						 new_dentry->d_inode,
						 new_dentry->d_name.name,
						 new_dentry->d_name.len);
		}
		BUG_ON(ret);
		if (new_inode->i_nlink == 0) {
			ret = btrfs_orphan_add(trans, new_dentry->d_inode);
			BUG_ON(ret);
		}
	}

	ret = btrfs_add_link(trans, new_dir, old_inode,
			     new_dentry->d_name.name,
			     new_dentry->d_name.len, 0, index);
	BUG_ON(ret);

	if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
		struct dentry *parent = dget_parent(new_dentry);
		btrfs_log_new_name(trans, old_inode, old_dir, parent);
		dput(parent);
		btrfs_end_log_trans(root);
	}
out_fail:
	btrfs_end_transaction_throttle(trans, root);

	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&root->fs_info->subvol_sem);

	return ret;
}
/*
 * some fairly slow code that needs optimization.  This walks the list
 * of all the inodes with pending delalloc and forces them to disk.
 */
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
	struct list_head *head = &root->fs_info->delalloc_inodes;
	struct btrfs_inode *binode;
	struct inode *inode;

	if (root->fs_info->sb->s_flags & MS_RDONLY)
		return -EROFS;

	spin_lock(&root->fs_info->delalloc_lock);
	while (!list_empty(head)) {
		binode = list_entry(head->next, struct btrfs_inode,
				    delalloc_inodes);
		inode = igrab(&binode->vfs_inode);
		if (!inode)
			list_del_init(&binode->delalloc_inodes);
		spin_unlock(&root->fs_info->delalloc_lock);
		if (inode) {
			filemap_flush(inode->i_mapping);
			if (delay_iput)
				btrfs_add_delayed_iput(inode);
			else
				iput(inode);
		}
		cond_resched();
		spin_lock(&root->fs_info->delalloc_lock);
	}
	spin_unlock(&root->fs_info->delalloc_lock);

	/* the filemap_flush will queue IO into the worker threads, but
	 * we have to make sure the IO is actually started and that
	 * ordered extents get created before we return
	 */
	atomic_inc(&root->fs_info->async_submit_draining);
	while (atomic_read(&root->fs_info->nr_async_submits) ||
	       atomic_read(&root->fs_info->async_delalloc_pages)) {
		wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
	}
	atomic_dec(&root->fs_info->async_submit_draining);
	return 0;
}
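/*
 * The drain at the end of btrfs_start_delalloc_inodes() is a generic
 * "wait for in-flight async work" idiom: producers bump a counter when
 * they queue work, workers decrement it and kick a waitqueue, and the
 * drainer sleeps until the counter reads zero.  Reduced to its bones
 * with hypothetical globals (illustrative only, not built):
 */
#if 0
static atomic_t demo_inflight = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(demo_wait);

static void demo_queue_work(void)
{
	atomic_inc(&demo_inflight);
	/* ... hand the work item to a worker thread ... */
}

static void demo_work_done(void)
{
	if (atomic_dec_and_test(&demo_inflight))
		wake_up(&demo_wait);
}

static void demo_drain(void)
{
	wait_event(demo_wait, atomic_read(&demo_inflight) == 0);
}
#endif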
int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
				   int sync)
{
	struct btrfs_inode *binode;
	struct inode *inode = NULL;

	spin_lock(&root->fs_info->delalloc_lock);
	while (!list_empty(&root->fs_info->delalloc_inodes)) {
		binode = list_entry(root->fs_info->delalloc_inodes.next,
				    struct btrfs_inode, delalloc_inodes);
		inode = igrab(&binode->vfs_inode);
		if (inode) {
			list_move_tail(&binode->delalloc_inodes,
				       &root->fs_info->delalloc_inodes);
			break;
		}

		list_del_init(&binode->delalloc_inodes);
		cond_resched_lock(&root->fs_info->delalloc_lock);
	}
	spin_unlock(&root->fs_info->delalloc_lock);

	if (inode) {
		if (sync) {
			filemap_write_and_wait(inode->i_mapping);
			/*
			 * We have to do this because compression doesn't
			 * actually set PG_writeback until it submits the pages
			 * for IO, which happens in an async thread, so we
			 * could race and not actually wait for any writeback
			 * pages because they've not been submitted yet.
			 * Technically this could still be the case for the
			 * ordered stuff since the async thread may not have
			 * started to do its work yet.  If this becomes the
			 * case then we need to figure out a way to make sure
			 * that in writepage we wait for any async pages to be
			 * submitted before returning so that fdatawait does
			 * what it's supposed to do.
			 */
			btrfs_wait_ordered_range(inode, 0, (u64)-1);
		} else {
			filemap_flush(inode->i_mapping);
		}
		if (delay_iput)
			btrfs_add_delayed_iput(inode);
		else
			iput(inode);
		return 1;
	}
	return 0;
}

static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
			 const char *symname)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct inode *inode = NULL;
	int err;
	int drop_inode = 0;
	u64 objectid;
	u64 index = 0;
	int name_len;
	int datasize;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	struct extent_buffer *leaf;
	unsigned long nr = 0;

	name_len = strlen(symname) + 1;
	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
		return -ENAMETOOLONG;

	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
	if (err)
		return err;
	/*
	 * 2 items for inode item and ref
	 * 2 items for dir items
	 * 1 item for xattr if selinux is on
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	btrfs_set_trans_block_group(trans, dir);

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
				dentry->d_name.len, dir->i_ino, objectid,
				BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
				&index);
	err = PTR_ERR(inode);
	if (IS_ERR(inode))
		goto out_unlock;

	err = btrfs_init_inode_security(trans, inode, dir);
	if (err) {
		drop_inode = 1;
		goto out_unlock;
	}

	btrfs_set_trans_block_group(trans, inode);
	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
	if (err)
		drop_inode = 1;
	else {
		inode->i_mapping->a_ops = &btrfs_aops;
		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
		inode->i_fop = &btrfs_file_operations;
		inode->i_op = &btrfs_file_inode_operations;
		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
	}
	btrfs_update_inode_block_group(trans, inode);
	btrfs_update_inode_block_group(trans, dir);
	if (drop_inode)
		goto out_unlock;

	path = btrfs_alloc_path();
	BUG_ON(!path);
	key.objectid = inode->i_ino;
	key.offset = 0;
	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
	datasize = btrfs_file_extent_calc_inline_size(name_len);
	err = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (err) {
		drop_inode = 1;
		goto out_unlock;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei,
				   BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_compression(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);

	ptr = btrfs_file_extent_inline_start(ei);
	write_extent_buffer(leaf, symname, ptr, name_len);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	inode->i_op = &btrfs_symlink_inode_operations;
	inode->i_mapping->a_ops = &btrfs_symlink_aops;
	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
	inode_set_bytes(inode, name_len);
	/* i_size is the target length without the trailing NUL */
	btrfs_i_size_write(inode, name_len - 1);
	err = btrfs_update_inode(trans, root, inode);
	if (err)
		drop_inode = 1;

out_unlock:
	nr = trans->blocks_used;
	btrfs_end_transaction_throttle(trans, root);
	if (drop_inode) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_btree_balance_dirty(root, nr);
	return err;
}
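/*
 * btrfs_symlink() stores the target string (including its NUL) as an
 * inline file extent in the tree, so reading a symlink never touches a
 * data block.  A rough model of the size bookkeeping, with a made-up
 * inline limit standing in for BTRFS_MAX_INLINE_DATA_SIZE() (illustrative
 * only, not built):
 */
#if 0
#define DEMO_MAX_INLINE 3500	/* hypothetical per-leaf inline budget */

static int demo_symlink_sizes(const char *target, int *item_bytes)
{
	int name_len = strlen(target) + 1;	/* stored with the NUL */

	if (name_len > DEMO_MAX_INLINE)
		return -ENAMETOOLONG;
	/* extent item header plus the inline payload */
	*item_bytes = btrfs_file_extent_calc_inline_size(name_len);
	/* i_size reported to userspace excludes the NUL: name_len - 1 */
	return 0;
}
#endif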
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
				       u64 start, u64 num_bytes, u64 min_size,
				       loff_t actual_len, u64 *alloc_hint,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key ins;
	u64 cur_offset = start;
	u64 i_size;
	int ret = 0;
	bool own_trans = true;

	if (trans)
		own_trans = false;
	while (num_bytes > 0) {
		if (own_trans) {
			trans = btrfs_start_transaction(root, 3);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
		}

		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
					   0, *alloc_hint, (u64)-1, &ins, 1);
		if (ret) {
			if (own_trans)
				btrfs_end_transaction(trans, root);
			break;
		}

		ret = insert_reserved_file_extent(trans, inode,
						  cur_offset, ins.objectid,
						  ins.offset, ins.offset,
						  ins.offset, 0, 0, 0,
						  BTRFS_FILE_EXTENT_PREALLOC);
		BUG_ON(ret);
		btrfs_drop_extent_cache(inode, cur_offset,
					cur_offset + ins.offset - 1, 0);

		num_bytes -= ins.offset;
		cur_offset += ins.offset;
		*alloc_hint = ins.objectid + ins.offset;

		inode->i_ctime = CURRENT_TIME;
		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    (actual_len > inode->i_size) &&
		    (cur_offset > inode->i_size)) {
			if (cur_offset > actual_len)
				i_size = actual_len;
			else
				i_size = cur_offset;
			i_size_write(inode, i_size);
			btrfs_ordered_update_i_size(inode, i_size, NULL);
		}

		ret = btrfs_update_inode(trans, root, inode);
		BUG_ON(ret);

		if (own_trans)
			btrfs_end_transaction(trans, root);
	}
	return ret;
}

int btrfs_prealloc_file_range(struct inode *inode, int mode,
			      u64 start, u64 num_bytes, u64 min_size,
			      loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint,
					   NULL);
}

int btrfs_prealloc_file_range_trans(struct inode *inode,
				    struct btrfs_trans_handle *trans, int mode,
				    u64 start, u64 num_bytes, u64 min_size,
				    loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint,
					   trans);
}
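/*
 * The i_size update inside __btrfs_prealloc_file_range() encodes the
 * fallocate(2) contract: without FALLOC_FL_KEEP_SIZE the file grows, but
 * never past the requested length, even though extents are handed out in
 * whole chunks.  The same rule in isolation (illustrative only, not
 * built):
 */
#if 0
static loff_t demo_new_i_size(loff_t i_size, loff_t requested_end,
			      u64 cur_offset, int keep_size)
{
	if (keep_size || cur_offset <= i_size || requested_end <= i_size)
		return i_size;			/* unchanged */
	/* clamp: the allocation may overshoot the requested end */
	return min_t(loff_t, cur_offset, requested_end);
}
#endif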
static int btrfs_set_page_dirty(struct page *page)
{
	return __set_page_dirty_nobuffers(page);
}

static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
		return -EROFS;
	if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
		return -EACCES;
	return generic_permission(inode, mask, flags, btrfs_check_acl);
}

static const struct inode_operations btrfs_dir_inode_operations = {
	.getattr	= btrfs_getattr,
	.lookup		= btrfs_lookup,
	.create		= btrfs_create,
	.unlink		= btrfs_unlink,
	.link		= btrfs_link,
	.mkdir		= btrfs_mkdir,
	.rmdir		= btrfs_rmdir,
	.rename		= btrfs_rename,
	.symlink	= btrfs_symlink,
	.setattr	= btrfs_setattr,
	.mknod		= btrfs_mknod,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
	.permission	= btrfs_permission,
};

static const struct inode_operations btrfs_dir_ro_inode_operations = {
	.lookup		= btrfs_lookup,
	.permission	= btrfs_permission,
};

static const struct file_operations btrfs_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.readdir	= btrfs_real_readdir,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_ioctl,
#endif
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
};

static struct extent_io_ops btrfs_extent_io_ops = {
	.fill_delalloc		= run_delalloc_range,
	.submit_bio_hook	= btrfs_submit_bio_hook,
	.merge_bio_hook		= btrfs_merge_bio_hook,
	.readpage_end_io_hook	= btrfs_readpage_end_io_hook,
	.writepage_end_io_hook	= btrfs_writepage_end_io_hook,
	.writepage_start_hook	= btrfs_writepage_start_hook,
	.readpage_io_failed_hook = btrfs_io_failed_hook,
	.set_bit_hook		= btrfs_set_bit_hook,
	.clear_bit_hook		= btrfs_clear_bit_hook,
	.merge_extent_hook	= btrfs_merge_extent_hook,
	.split_extent_hook	= btrfs_split_extent_hook,
};

/*
 * btrfs doesn't support the bmap operation because swapfiles
 * use bmap to make a mapping of extents in the file.  They assume
 * these extents won't change over the life of the file and they
 * use the bmap result to do IO directly to the drive.
 *
 * the btrfs bmap call would return logical addresses that aren't
 * suitable for IO and they also will change frequently as COW
 * operations happen.  So, swapfile + btrfs == corruption.
 *
 * For now we're avoiding this by dropping bmap.
 */
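/*
 * One user-visible consequence of leaving ->bmap out of btrfs_aops below:
 * the legacy FIBMAP ioctl has nothing to call and fails, which is exactly
 * the point.  The supported way to map extents on btrfs is FIEMAP (see
 * btrfs_fiemap() above).  Illustrative userspace sketch only, not built.
 */
#if 0
#include <linux/fs.h>		/* FIBMAP */
#include <sys/ioctl.h>

static long demo_fibmap_first_block(int fd)
{
	int block = 0;

	/* expected to fail on btrfs: no ->bmap, so no physical mapping */
	if (ioctl(fd, FIBMAP, &block) < 0)
		return -1;
	return block;
}
#endif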
static const struct address_space_operations btrfs_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.writepages	= btrfs_writepages,
	.readpages	= btrfs_readpages,
	.sync_page	= block_sync_page,
	.direct_IO	= btrfs_direct_IO,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
	.set_page_dirty	= btrfs_set_page_dirty,
	.error_remove_page = generic_error_remove_page,
};

static const struct address_space_operations btrfs_symlink_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
};

static const struct inode_operations btrfs_file_inode_operations = {
	.truncate	= btrfs_truncate,
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
	.permission	= btrfs_permission,
	.fiemap		= btrfs_fiemap,
};

static const struct inode_operations btrfs_special_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
};

static const struct inode_operations btrfs_symlink_inode_operations = {
	.readlink	= generic_readlink,
	.follow_link	= page_follow_link_light,
	.put_link	= page_put_link,
	.getattr	= btrfs_getattr,
	.permission	= btrfs_permission,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
};

const struct dentry_operations btrfs_dentry_operations = {
	.d_delete	= btrfs_dentry_delete,
};