/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"

struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, loff_t newsize);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int err = 0;
	int ret;
	size_t cur_size = size;
	size_t datasize;
	unsigned long offset;

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;

	key.objectid = btrfs_ino(inode);
	key.offset = start;
	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
	datasize = btrfs_file_extent_calc_inline_size(cur_size);

	inode_add_bytes(inode, size);
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (ret) {
		err = ret;
		goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
					 PAGE_CACHE_SIZE);

			kaddr = kmap_atomic(cpage, KM_USER0);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr, KM_USER0);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_CACHE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page, KM_USER0);
		offset = start & (PAGE_CACHE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr, KM_USER0);
		page_cache_release(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	btrfs_update_inode(trans, root, inode);

	return 0;
fail:
	btrfs_free_path(path);
	return err;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
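 *
 * As a worked example (assuming a 4K sectorsize and the default 8K
 * max_inline mount option of this era): a 3000 byte file starting at
 * offset 0 can go inline, while data that starts past offset 0,
 * reaches into a second page, or exactly fills its last sector is
 * pushed back to the regular allocator.  Returning 1 below means
 * "not inlined", not an error.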
 */
static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct inode *inode, u64 start, u64 end,
				 size_t compressed_size, int compress_type,
				 struct page **compressed_pages)
{
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = (end + root->sectorsize - 1) &
			~((u64)root->sectorsize - 1);
	u64 hint_byte;
	u64 data_len = inline_len;
	int ret;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end >= PAGE_CACHE_SIZE ||
	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
	    (!compressed_size &&
	    (actual_end & (root->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > root->fs_info->max_inline) {
		return 1;
	}

	ret = btrfs_drop_extents(trans, inode, start, aligned_end,
				 &hint_byte, 1);
	BUG_ON(ret);

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	BUG_ON(ret);
	btrfs_delalloc_release_metadata(inode, end + 1 - start);
	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
	return 0;
}

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent);
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that pdflush sent them down.
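 *
 * (pdflush is the old name for the generic writeback threads; the
 * point is the ordering guarantee: whatever order ranges were queued
 * in, their ordered extents are created in that same order, whether
 * or not compression succeeded for any individual range.)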
 */
static noinline int compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 num_bytes;
	u64 blocksize = root->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long nr_pages_ret = 0;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned long max_compressed = 128 * 1024;
	unsigned long max_uncompressed = 128 * 1024;
	int i;
	int will_compress;
	int compress_type = root->fs_info->compress_type;

	/* if this is a small write inside eof, kick off a defrag */
	if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
		btrfs_add_inode_defrag(NULL, inode);

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/* we want to make sure that amount of ram required to uncompress
	 * an extent is reasonable, so we limit the total size in ram
	 * of a compressed extent to 128k.  This is a crucial number
	 * because it also controls how easily we can spread reads across
	 * cpus for decompression.
	 *
	 * We also want to make sure the amount of IO required to do
	 * a random read is reasonably small, so we limit the size of
	 * a compressed extent to 128k.
	 */
	total_compressed = min(total_compressed, max_uncompressed);
	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
	num_bytes = max(blocksize, num_bytes);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
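	 *
	 * Three things can turn compression on here: mount -o compress
	 * (or compress-force), the per-inode BTRFS_INODE_COMPRESS flag,
	 * and force_compress, which the defrag ioctl sets when the
	 * caller asked for the range to be recompressed.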
	 */
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
	    (btrfs_test_opt(root, COMPRESS) ||
	     (BTRFS_I(inode)->force_compress) ||
	     (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
		WARN_ON(pages);
		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			goto cont;
		}

		if (BTRFS_I(inode)->force_compress)
			compress_type = BTRFS_I(inode)->force_compress;

		ret = btrfs_compress_pages(compress_type,
					   inode->i_mapping, start,
					   total_compressed, pages,
					   nr_pages, &nr_pages_ret,
					   &total_in,
					   &total_compressed,
					   max_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_CACHE_SIZE - 1);
			struct page *page = pages[nr_pages_ret - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page, KM_USER0);
				memset(kaddr + offset, 0,
				       PAGE_CACHE_SIZE - offset);
				kunmap_atomic(kaddr, KM_USER0);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		trans = btrfs_join_transaction(root);
		BUG_ON(IS_ERR(trans));
		trans->block_rsv = &root->fs_info->delalloc_block_rsv;

		/* let's try to make an inline extent */
		if (ret || total_in < (actual_end - start)) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(trans, root, inode,
						    start, end, 0, 0, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(trans, root, inode,
						    start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret == 0) {
			/*
			 * inline extent creation worked, we don't need
			 * to create any more async work items.  Unlock
			 * and free up our temp pages.
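			 *
			 * The flag combination below sets writeback and
			 * immediately ends it: the data now lives in the
			 * btree leaf, so there is no data IO to wait for,
			 * but waiters still need to see the writeback bit
			 * cycle.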
			 */
			extent_clear_unlock_delalloc(inode,
			     &BTRFS_I(inode)->io_tree,
			     start, end, NULL,
			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
			     EXTENT_CLEAR_DELALLOC |
			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);

			btrfs_end_transaction(trans, root);
			goto free_pages_out;
		}
		btrfs_end_transaction(trans, root);
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so the allocator does
		 * sane things
		 */
		total_compressed = (total_compressed + blocksize - 1) &
			~(blocksize - 1);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk
		 */
		total_in = (total_in + PAGE_CACHE_SIZE - 1) &
			~(PAGE_CACHE_SIZE - 1);
		if (total_compressed >= total_in) {
			will_compress = 0;
		} else {
			num_bytes = total_in;
		}
	}
	if (!will_compress && pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages_ret; i++) {
			WARN_ON(pages[i]->mapping);
			page_cache_release(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages_ret = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->force_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
	if (will_compress) {
		*num_added += 1;

		/* the async work queues will take care of doing actual
		 * allocation on disk for these compressed pages,
		 * and will submit them to the elevator.
		 */
		add_async_extent(async_cow, start, num_bytes,
				 total_compressed, pages, nr_pages_ret,
				 compress_type);

		if (start + num_bytes < end) {
			start += num_bytes;
			pages = NULL;
			cond_resched();
			goto again;
		}
	} else {
cleanup_and_bail_uncompressed:
		/*
		 * No compression, but we still need to write the pages in
		 * the file we've been given so far.  redirty the locked
		 * page if it corresponds to our extent and set things up
		 * for the async work queue to run cow_file_range to do
		 * the normal delalloc dance
		 */
		if (page_offset(locked_page) >= start &&
		    page_offset(locked_page) <= end) {
			__set_page_dirty_nobuffers(locked_page);
			/* unlocked later on in the async handlers */
		}
		add_async_extent(async_cow, start, end - start + 1,
				 0, NULL, 0, BTRFS_COMPRESS_NONE);
		*num_added += 1;
	}

out:
	return 0;

free_pages_out:
	for (i = 0; i < nr_pages_ret; i++) {
		WARN_ON(pages[i]->mapping);
		page_cache_release(pages[i]);
	}
	kfree(pages);

	goto out;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
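 *
 * A single async_cow can carry a mix of compressed extents and
 * fallback entries (pages == NULL); the fallback entries are pushed
 * back through cow_file_range here so the whole range still completes
 * in queue order.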
 */
static noinline int submit_compressed_extents(struct inode *inode,
					      struct async_cow *async_cow)
{
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_trans_handle *trans;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree;
	int ret = 0;

	if (list_empty(&async_cow->extents))
		return 0;


	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
				    async_extent->start +
				    async_extent->ram_size - 1, GFP_NOFS);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0);

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(io_tree,
						  inode, async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  btrfs_get_extent,
						  WB_SYNC_ALL);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1,
			    GFP_NOFS);

		trans = btrfs_join_transaction(root);
		BUG_ON(IS_ERR(trans));
		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
		ret = btrfs_reserve_extent(trans, root,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint,
					   (u64)-1, &ins, 1);
		btrfs_end_transaction(trans, root);

		if (ret) {
			int i;
			for (i = 0; i < async_extent->nr_pages; i++) {
				WARN_ON(async_extent->pages[i]->mapping);
				page_cache_release(async_extent->pages[i]);
			}
			kfree(async_extent->pages);
			async_extent->nr_pages = 0;
			async_extent->pages = NULL;
			unlock_extent(io_tree, async_extent->start,
				      async_extent->start +
				      async_extent->ram_size - 1, GFP_NOFS);
			goto retry;
		}

		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		btrfs_drop_extent_cache(inode, async_extent->start,
					async_extent->start +
					async_extent->ram_size - 1, 0);

		em = alloc_extent_map();
		BUG_ON(!em);
		em->start = async_extent->start;
		em->len = async_extent->ram_size;
		em->orig_start = em->start;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		em->compress_type = async_extent->compress_type;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
		}

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		BUG_ON(ret);

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode,
				&BTRFS_I(inode)->io_tree,
				async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_CLEAR_UNLOCK_PAGE |
				EXTENT_CLEAR_UNLOCK |
				EXTENT_CLEAR_DELALLOC |
				EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);

		ret = btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages);

		BUG_ON(ret);
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}

	return 0;
}

static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
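 *
 * As an example of the rounding done below, a 10000 byte delalloc
 * range on a 4K sectorsize filesystem becomes num_bytes = 12288,
 * i.e. three sectors, which the loop then hands out in as few
 * extents as the allocator can manage.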
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written,
				   int unlock)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 disk_num_bytes;
	u64 cur_alloc_size;
	u64 blocksize = root->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	int ret = 0;

	BUG_ON(btrfs_is_free_space_inode(root, inode));
	trans = btrfs_join_transaction(root);
	BUG_ON(IS_ERR(trans));
	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
	num_bytes = max(blocksize, num_bytes);
	disk_num_bytes = num_bytes;
	ret = 0;

	/* if this is a small write inside eof, kick off defrag */
	if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
		btrfs_add_inode_defrag(trans, inode);

	if (start == 0) {
		/* let's try to make an inline extent */
		ret = cow_file_range_inline(trans, root, inode,
					    start, end, 0, 0, NULL);
		if (ret == 0) {
			extent_clear_unlock_delalloc(inode,
				     &BTRFS_I(inode)->io_tree,
				     start, end, NULL,
				     EXTENT_CLEAR_UNLOCK_PAGE |
				     EXTENT_CLEAR_UNLOCK |
				     EXTENT_CLEAR_DELALLOC |
				     EXTENT_CLEAR_DIRTY |
				     EXTENT_SET_WRITEBACK |
				     EXTENT_END_WRITEBACK);

			*nr_written = *nr_written +
			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
			*page_started = 1;
			ret = 0;
			goto out;
		}
	}

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(root->fs_info->super_copy));

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

	while (disk_num_bytes > 0) {
		unsigned long op;

		cur_alloc_size = disk_num_bytes;
		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
					   root->sectorsize, 0, alloc_hint,
					   (u64)-1, &ins, 1);
		BUG_ON(ret);

		em = alloc_extent_map();
		BUG_ON(!em);
		em->start = start;
		em->orig_start = em->start;
		ram_size = ins.offset;
		em->len = ins.offset;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}

		cur_alloc_size = ins.offset;
		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		BUG_ON(ret);

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			BUG_ON(ret);
		}

		if (disk_num_bytes < cur_alloc_size)
			break;

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
		 */
		op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
		op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
			EXTENT_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
					     start, start + ram_size - 1,
					     locked_page, op);
		disk_num_bytes -= cur_alloc_size;
		num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
	}
out:
	ret = 0;
	btrfs_end_transaction(trans, root);

	return ret;
}

/*
 * work queue call back to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	int num_added = 0;
	async_cow = container_of(work, struct async_cow, work);

	compress_file_range(async_cow->inode, async_cow->locked_page,
			    async_cow->start, async_cow->end, async_cow,
			    &num_added);
	if (num_added == 0)
		async_cow->inode = NULL;
}

/*
 * work queue call back to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
		PAGE_CACHE_SHIFT;

	atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);

	if (atomic_read(&root->fs_info->async_delalloc_pages) <
	    5 * 1024 * 1024 &&
	    waitqueue_active(&root->fs_info->async_submit_wait))
		wake_up(&root->fs_info->async_submit_wait);

	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	async_cow = container_of(work, struct async_cow, work);
	kfree(async_cow);
}

static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;
	int limit = 10 * 1024 * 1024;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL, GFP_NOFS);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		BUG_ON(!async_cow);
		async_cow->inode = inode;
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;

		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
			cur_end = end;
		else
			cur_end = min(end, start + 512 * 1024 - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		async_cow->work.func = async_cow_start;
		async_cow->work.ordered_func = async_cow_submit;
		async_cow->work.ordered_free = async_cow_free;
		async_cow->work.flags = 0;

		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
			PAGE_CACHE_SHIFT;
		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

		btrfs_queue_worker(&root->fs_info->delalloc_workers,
				   &async_cow->work);

		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
			wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->async_delalloc_pages) <
			    limit));
		}

		while (atomic_read(&root->fs_info->async_submit_draining) &&
		      atomic_read(&root->fs_info->async_delalloc_pages)) {
			wait_event(root->fs_info->async_submit_wait,
			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
			   0));
		}

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}

static noinline int csum_exist_in_range(struct btrfs_root *root,
					u64 bytenr, u64 num_bytes)
{
	int ret;
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list, 0);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	return 1;
}

/*
 * when nocow writeback calls back.  This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct inode *inode,
				       struct page *locked_page,
			      u64 start, u64 end, int *page_started, int force,
			      unsigned long *nr_written)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key found_key;
	u64 cow_start;
	u64 cur_offset;
	u64 extent_end;
	u64 extent_offset;
	u64 disk_bytenr;
	u64 num_bytes;
	int extent_type;
	int ret;
	int type;
	int nocow;
	int check_prev = 1;
	bool nolock;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	nolock = btrfs_is_free_space_inode(root, inode);

	if (nolock)
		trans = btrfs_join_transaction_nolock(root);
	else
		trans = btrfs_join_transaction(root);

	BUG_ON(IS_ERR(trans));
	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	cow_start = (u64)-1;
	cur_offset = start;
	while (1) {
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       cur_offset, 0);
		BUG_ON(ret < 0);
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			BUG_ON(ret < 0);
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		nocow = 0;
		disk_bytenr = 0;
		num_bytes = 0;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		if (found_key.objectid > ino ||
		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto out_check;
		}

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
			if (extent_end <= start) {
				path->slots[0]++;
				goto next_slot;
			}
			if (disk_bytenr == 0)
				goto out_check;
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;
			if (btrfs_extent_readonly(root, disk_bytenr))
				goto out_check;
			if (btrfs_cross_ref_exist(trans, root, ino,
						  found_key.offset -
						  extent_offset, disk_bytenr))
				goto out_check;
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * force cow if csum exists in the range.
			 * this ensures that csums for a given extent are
			 * either valid or do not exist.
			 */
			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
				goto out_check;
			nocow = 1;
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = found_key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
			extent_end = ALIGN(extent_end, root->sectorsize);
		} else {
			BUG_ON(1);
		}
out_check:
		if (extent_end <= start) {
			path->slots[0]++;
			goto next_slot;
		}
		if (!nocow) {
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			path->slots[0]++;
			goto next_slot;
		}

		btrfs_release_path(path);
		if (cow_start != (u64)-1) {
			ret = cow_file_range(inode, locked_page, cow_start,
					     found_key.offset - 1, page_started,
					     nr_written, 1);
			BUG_ON(ret);
			cow_start = (u64)-1;
		}

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct extent_map *em;
			struct extent_map_tree *em_tree;
			em_tree = &BTRFS_I(inode)->extent_tree;
			em = alloc_extent_map();
			BUG_ON(!em);
			em->start = cur_offset;
			em->orig_start = em->start;
			em->len = num_bytes;
			em->block_len = num_bytes;
			em->block_start = disk_bytenr;
			em->bdev = root->fs_info->fs_devices->latest_bdev;
			set_bit(EXTENT_FLAG_PINNED, &em->flags);
			while (1) {
				write_lock(&em_tree->lock);
				ret = add_extent_mapping(em_tree, em);
				write_unlock(&em_tree->lock);
				if (ret != -EEXIST) {
					free_extent_map(em);
					break;
				}
				btrfs_drop_extent_cache(inode, em->start,
						em->start + em->len - 1, 0);
			}
			type = BTRFS_ORDERED_PREALLOC;
		} else {
			type = BTRFS_ORDERED_NOCOW;
		}

		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
					       num_bytes, num_bytes, type);
		BUG_ON(ret);

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, cur_offset,
						      num_bytes);
			BUG_ON(ret);
		}

		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
				cur_offset, cur_offset + num_bytes - 1,
				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
				EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
				EXTENT_SET_PRIVATE2);
		cur_offset = extent_end;
		if (cur_offset > end)
			break;
	}
	btrfs_release_path(path);

	if (cur_offset <= end && cow_start == (u64)-1)
		cow_start = cur_offset;
	if (cow_start != (u64)-1) {
		ret = cow_file_range(inode, locked_page, cow_start, end,
				     page_started, nr_written, 1);
		BUG_ON(ret);
	}

	if (nolock) {
		ret = btrfs_end_transaction_nolock(trans, root);
		BUG_ON(ret);
	} else {
		ret = btrfs_end_transaction(trans, root);
		BUG_ON(ret);
	}
	btrfs_free_path(path);
	return 0;
}

/*
 * extent_io.c call back to do delayed allocation processing
 */
static int run_delalloc_range(struct inode *inode, struct page *locked_page,
			      u64 start, u64 end, int *page_started,
			      unsigned long *nr_written)
{
	int ret;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 1, nr_written);
	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
	else if (!btrfs_test_opt(root, COMPRESS) &&
		 !(BTRFS_I(inode)->force_compress) &&
		 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
		ret = cow_file_range(inode, locked_page, start, end,
				     page_started, nr_written, 1);
	else
		ret = cow_file_range_async(inode, locked_page, start, end,
					   page_started, nr_written);
	return ret;
}

static void btrfs_split_extent_hook(struct inode *inode,
				    struct extent_state *orig, u64 split)
{
	/* not delalloc, ignore it */
	if (!(orig->state & EXTENT_DELALLOC))
		return;

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents++;
	spin_unlock(&BTRFS_I(inode)->lock);
}

/*
 * extent_io.c merge_extent_hook, used to track merged delayed allocation
 * extents so we can keep track of new extents that are just merged onto old
 * extents, such as when we are doing sequential writes, so we can properly
 * account for the metadata space we'll need.
 */
static void btrfs_merge_extent_hook(struct inode *inode,
				    struct extent_state *new,
				    struct extent_state *other)
{
	/* not delalloc, ignore it */
	if (!(other->state & EXTENT_DELALLOC))
		return;

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents--;
	spin_unlock(&BTRFS_I(inode)->lock);
}

/*
 * extent_io.c set_bit_hook, used to track delayed allocation
 * bytes in this file, and to maintain the list of inodes that
 * have pending delalloc work to be done.
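 *
 * Together with the split/merge hooks above, this keeps
 * outstanding_extents in step with the worst-case number of extent
 * items the file might need, which is what the delalloc metadata
 * reservation is sized against.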
 */
static void btrfs_set_bit_hook(struct inode *inode,
			       struct extent_state *state, int *bits)
{

	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		bool do_list = !btrfs_is_free_space_inode(root, inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents++;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		spin_lock(&root->fs_info->delalloc_lock);
		BTRFS_I(inode)->delalloc_bytes += len;
		root->fs_info->delalloc_bytes += len;
		if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
				      &root->fs_info->delalloc_inodes);
		}
		spin_unlock(&root->fs_info->delalloc_lock);
	}
}

/*
 * extent_io.c clear_bit_hook, see set_bit_hook for why
 */
static void btrfs_clear_bit_hook(struct inode *inode,
				 struct extent_state *state, int *bits)
{
	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		bool do_list = !btrfs_is_free_space_inode(root, inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents--;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		if (*bits & EXTENT_DO_ACCOUNTING)
			btrfs_delalloc_release_metadata(inode, len);

		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
		    && do_list)
			btrfs_free_reserved_data_space(inode, len);

		spin_lock(&root->fs_info->delalloc_lock);
		root->fs_info->delalloc_bytes -= len;
		BTRFS_I(inode)->delalloc_bytes -= len;

		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
		}
		spin_unlock(&root->fs_info->delalloc_lock);
	}
}

/*
 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
 * we don't create bios that span stripes or chunks
 */
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
			 size_t size, struct bio *bio,
			 unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	struct btrfs_mapping_tree *map_tree;
	u64 logical = (u64)bio->bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;

	if (bio_flags & EXTENT_BIO_COMPRESSED)
		return 0;

	length = bio->bi_size;
	map_tree = &root->fs_info->mapping_tree;
	map_length = length;
	ret = btrfs_map_block(map_tree, READ, logical,
			      &map_length, NULL, 0);

	if (map_length < length + size)
		return 1;
	return ret;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.  All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags,
				    u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;

	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
	BUG_ON(ret);
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.  All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
			  int mirror_num, unsigned long bio_flags,
			  u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
}

/*
 * extent_io.c submission hook.  This does the right thing for csum calculation
 * on write, or reading the csums from the tree before a read
 */
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
			  int mirror_num, unsigned long bio_flags,
			  u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	int skip_sum;

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	if (btrfs_is_free_space_inode(root, inode))
		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
	else
		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
	BUG_ON(ret);

	if (!(rw & REQ_WRITE)) {
		if (bio_flags & EXTENT_BIO_COMPRESSED) {
			return btrfs_submit_compressed_read(inode, bio,
						    mirror_num, bio_flags);
		} else if (!skip_sum) {
			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
			if (ret)
				return ret;
		}
		goto mapit;
	} else if (!skip_sum) {
		/* csum items have already been cloned */
		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
			goto mapit;
		/* we're doing a write, do the async checksumming */
		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
				   inode, rw, bio, mirror_num,
				   bio_flags, bio_offset,
				   __btrfs_submit_bio_start,
				   __btrfs_submit_bio_done);
	}

mapit:
	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
}

/*
 * given a list of ordered sums record them in the inode.  This happens
 * at IO completion time based on sums calculated at bio submission time.
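 *
 * Each btrfs_ordered_sum on the list covers one contiguous chunk of
 * the ordered extent; btrfs_csum_file_blocks folds it into the csum
 * tree, extending an existing csum item when the new sums land
 * right behind it.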
 */
static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
			     struct inode *inode, u64 file_offset,
			     struct list_head *list)
{
	struct btrfs_ordered_sum *sum;

	list_for_each_entry(sum, list, list) {
		btrfs_csum_file_blocks(trans,
		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
	}
	return 0;
}

int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
			      struct extent_state **cached_state)
{
	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
		WARN_ON(1);
	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
				   cached_state, GFP_NOFS);
}

/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
	struct page *page;
	struct btrfs_work work;
};

static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct page *page;
	struct inode *inode;
	u64 page_start;
	u64 page_end;

	fixup = container_of(work, struct btrfs_writepage_fixup, work);
	page = fixup->page;
again:
	lock_page(page);
	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
		ClearPageChecked(page);
		goto out_page;
	}

	inode = page->mapping->host;
	page_start = page_offset(page);
	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
			 &cached_state, GFP_NOFS);

	/* already ordered? We're done */
	if (PagePrivate2(page))
		goto out;

	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered) {
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
				     page_end, &cached_state, GFP_NOFS);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		goto again;
	}

	BUG();
	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
	ClearPageChecked(page);
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
			     &cached_state, GFP_NOFS);
out_page:
	unlock_page(page);
	page_cache_release(page);
	kfree(fixup);
}

/*
 * There are a few paths in the higher layers of the kernel that directly
 * set the page dirty bit without asking the filesystem if it is a
 * good idea.  This causes problems because we want to make sure COW
 * properly happens and the data=ordered rules are followed.
 *
 * In our case any range that doesn't have the ORDERED bit set
 * hasn't been properly setup for IO.  We kick off an async process
 * to fix it up.  The async helper will wait for ordered extents, set
 * the delalloc bit and make it safe to write the page.
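 *
 * (A page can end up here when something dirties it behind the
 * filesystem's back, for example through a long-held get_user_pages
 * reference, so writepage sees a dirty page that never went through
 * the normal delalloc setup.)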
 */
static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* this page is properly in the ordered list */
	if (TestClearPagePrivate2(page))
		return 0;

	if (PageChecked(page))
		return -EAGAIN;

	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
	if (!fixup)
		return -EAGAIN;

	SetPageChecked(page);
	page_cache_get(page);
	fixup->work.func = btrfs_writepage_fixup_worker;
	fixup->page = page;
	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
	return -EAGAIN;
}

static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
				       struct inode *inode, u64 file_pos,
				       u64 disk_bytenr, u64 disk_num_bytes,
				       u64 num_bytes, u64 ram_bytes,
				       u8 compression, u8 encryption,
				       u16 other_encoding, int extent_type)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key ins;
	u64 hint;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;

	/*
	 * we may be replacing one extent in the tree with another.
	 * The new extent is pinned in the extent map, and we don't want
	 * to drop it from the cache until it is completely in the btree.
	 *
	 * So, tell btrfs_drop_extents to leave this extent in the cache.
	 * the caller is expected to unpin it and allow it to be merged
	 * with the others.
	 */
	ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
				 &hint, 0);
	BUG_ON(ret);

	ins.objectid = btrfs_ino(inode);
	ins.offset = file_pos;
	ins.type = BTRFS_EXTENT_DATA_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
	BUG_ON(ret);
	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_set_file_extent_type(leaf, fi, extent_type);
	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
	btrfs_set_file_extent_offset(leaf, fi, 0);
	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
	btrfs_set_file_extent_compression(leaf, fi, compression);
	btrfs_set_file_extent_encryption(leaf, fi, encryption);
	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);

	btrfs_unlock_up_safe(path, 1);
	btrfs_set_lock_blocking(leaf);

	btrfs_mark_buffer_dirty(leaf);

	inode_add_bytes(inode, num_bytes);

	ins.objectid = disk_bytenr;
	ins.offset = disk_num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_alloc_reserved_file_extent(trans, root,
					       root->root_key.objectid,
					       btrfs_ino(inode), file_pos, &ins);
	BUG_ON(ret);
	btrfs_free_path(path);

	return 0;
}
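/*
 * Putting the pieces above together, the write path for one buffered
 * extent is roughly: run_delalloc_range reserves and maps the extent
 * and creates the ordered record, the end_io hooks count down the
 * outstanding bytes, and once the whole range has hit the disk
 * btrfs_finish_ordered_io (below) inserts the file extent item and
 * the pending csums.
 */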
1710 */ 1711 /* as ordered data IO finishes, this gets called so we can finish 1712 * an ordered extent if the range of bytes in the file it covers are 1713 * fully written. 1714 */ 1715 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1716 { 1717 struct btrfs_root *root = BTRFS_I(inode)->root; 1718 struct btrfs_trans_handle *trans = NULL; 1719 struct btrfs_ordered_extent *ordered_extent = NULL; 1720 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1721 struct extent_state *cached_state = NULL; 1722 int compress_type = 0; 1723 int ret; 1724 bool nolock; 1725 1726 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 1727 end - start + 1); 1728 if (!ret) 1729 return 0; 1730 BUG_ON(!ordered_extent); 1731 1732 nolock = btrfs_is_free_space_inode(root, inode); 1733 1734 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1735 BUG_ON(!list_empty(&ordered_extent->list)); 1736 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1737 if (!ret) { 1738 if (nolock) 1739 trans = btrfs_join_transaction_nolock(root); 1740 else 1741 trans = btrfs_join_transaction(root); 1742 BUG_ON(IS_ERR(trans)); 1743 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1744 ret = btrfs_update_inode(trans, root, inode); 1745 BUG_ON(ret); 1746 } 1747 goto out; 1748 } 1749 1750 lock_extent_bits(io_tree, ordered_extent->file_offset, 1751 ordered_extent->file_offset + ordered_extent->len - 1, 1752 0, &cached_state, GFP_NOFS); 1753 1754 if (nolock) 1755 trans = btrfs_join_transaction_nolock(root); 1756 else 1757 trans = btrfs_join_transaction(root); 1758 BUG_ON(IS_ERR(trans)); 1759 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1760 1761 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1762 compress_type = ordered_extent->compress_type; 1763 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1764 BUG_ON(compress_type); 1765 ret = btrfs_mark_extent_written(trans, inode, 1766 ordered_extent->file_offset, 1767 ordered_extent->file_offset + 1768 ordered_extent->len); 1769 BUG_ON(ret); 1770 } else { 1771 BUG_ON(root == root->fs_info->tree_root); 1772 ret = insert_reserved_file_extent(trans, inode, 1773 ordered_extent->file_offset, 1774 ordered_extent->start, 1775 ordered_extent->disk_len, 1776 ordered_extent->len, 1777 ordered_extent->len, 1778 compress_type, 0, 0, 1779 BTRFS_FILE_EXTENT_REG); 1780 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 1781 ordered_extent->file_offset, 1782 ordered_extent->len); 1783 BUG_ON(ret); 1784 } 1785 unlock_extent_cached(io_tree, ordered_extent->file_offset, 1786 ordered_extent->file_offset + 1787 ordered_extent->len - 1, &cached_state, GFP_NOFS); 1788 1789 add_pending_csums(trans, inode, ordered_extent->file_offset, 1790 &ordered_extent->list); 1791 1792 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1793 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1794 ret = btrfs_update_inode(trans, root, inode); 1795 BUG_ON(ret); 1796 } 1797 ret = 0; 1798 out: 1799 if (root != root->fs_info->tree_root) 1800 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1801 if (trans) { 1802 if (nolock) 1803 btrfs_end_transaction_nolock(trans, root); 1804 else 1805 btrfs_end_transaction(trans, root); 1806 } 1807 1808 /* once for us */ 1809 btrfs_put_ordered_extent(ordered_extent); 1810 /* once for the tree */ 1811 btrfs_put_ordered_extent(ordered_extent); 1812 1813 return 0; 1814 } 1815 1816 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 
end, 1817 struct extent_state *state, int uptodate) 1818 { 1819 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 1820 1821 ClearPagePrivate2(page); 1822 return btrfs_finish_ordered_io(page->mapping->host, start, end); 1823 } 1824 1825 /* 1826 * when reads are done, we need to check csums to verify the data is correct 1827 * if there's a match, we allow the bio to finish. If not, the code in 1828 * extent_io.c will try to find good copies for us. 1829 */ 1830 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 1831 struct extent_state *state) 1832 { 1833 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); 1834 struct inode *inode = page->mapping->host; 1835 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1836 char *kaddr; 1837 u64 private = ~(u32)0; 1838 int ret; 1839 struct btrfs_root *root = BTRFS_I(inode)->root; 1840 u32 csum = ~(u32)0; 1841 1842 if (PageChecked(page)) { 1843 ClearPageChecked(page); 1844 goto good; 1845 } 1846 1847 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 1848 goto good; 1849 1850 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 1851 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 1852 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, 1853 GFP_NOFS); 1854 return 0; 1855 } 1856 1857 if (state && state->start == start) { 1858 private = state->private; 1859 ret = 0; 1860 } else { 1861 ret = get_state_private(io_tree, start, &private); 1862 } 1863 kaddr = kmap_atomic(page, KM_USER0); 1864 if (ret) 1865 goto zeroit; 1866 1867 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); 1868 btrfs_csum_final(csum, (char *)&csum); 1869 if (csum != private) 1870 goto zeroit; 1871 1872 kunmap_atomic(kaddr, KM_USER0); 1873 good: 1874 return 0; 1875 1876 zeroit: 1877 printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u " 1878 "private %llu\n", 1879 (unsigned long long)btrfs_ino(page->mapping->host), 1880 (unsigned long long)start, csum, 1881 (unsigned long long)private); 1882 memset(kaddr + offset, 1, end - start + 1); 1883 flush_dcache_page(page); 1884 kunmap_atomic(kaddr, KM_USER0); 1885 if (private == 0) 1886 return 0; 1887 return -EIO; 1888 } 1889 1890 struct delayed_iput { 1891 struct list_head list; 1892 struct inode *inode; 1893 }; 1894 1895 void btrfs_add_delayed_iput(struct inode *inode) 1896 { 1897 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 1898 struct delayed_iput *delayed; 1899 1900 if (atomic_add_unless(&inode->i_count, -1, 1)) 1901 return; 1902 1903 delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); 1904 delayed->inode = inode; 1905 1906 spin_lock(&fs_info->delayed_iput_lock); 1907 list_add_tail(&delayed->list, &fs_info->delayed_iputs); 1908 spin_unlock(&fs_info->delayed_iput_lock); 1909 } 1910 1911 void btrfs_run_delayed_iputs(struct btrfs_root *root) 1912 { 1913 LIST_HEAD(list); 1914 struct btrfs_fs_info *fs_info = root->fs_info; 1915 struct delayed_iput *delayed; 1916 int empty; 1917 1918 spin_lock(&fs_info->delayed_iput_lock); 1919 empty = list_empty(&fs_info->delayed_iputs); 1920 spin_unlock(&fs_info->delayed_iput_lock); 1921 if (empty) 1922 return; 1923 1924 down_read(&root->fs_info->cleanup_work_sem); 1925 spin_lock(&fs_info->delayed_iput_lock); 1926 list_splice_init(&fs_info->delayed_iputs, &list); 1927 spin_unlock(&fs_info->delayed_iput_lock); 1928 1929 while (!list_empty(&list)) { 1930 delayed = list_entry(list.next, struct delayed_iput, list); 1931 list_del(&delayed->list); 1932 
iput(delayed->inode); 1933 kfree(delayed); 1934 } 1935 up_read(&root->fs_info->cleanup_work_sem); 1936 } 1937 1938 enum btrfs_orphan_cleanup_state { 1939 ORPHAN_CLEANUP_STARTED = 1, 1940 ORPHAN_CLEANUP_DONE = 2, 1941 }; 1942 1943 /* 1944 * This is called at transaction commit time. If there are no orphan 1945 * files in the subvolume, it removes the orphan item and frees the 1946 * block_rsv structure. 1947 */ 1948 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 1949 struct btrfs_root *root) 1950 { 1951 int ret; 1952 1953 if (!list_empty(&root->orphan_list) || 1954 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 1955 return; 1956 1957 if (root->orphan_item_inserted && 1958 btrfs_root_refs(&root->root_item) > 0) { 1959 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 1960 root->root_key.objectid); 1961 BUG_ON(ret); 1962 root->orphan_item_inserted = 0; 1963 } 1964 1965 if (root->orphan_block_rsv) { 1966 WARN_ON(root->orphan_block_rsv->size > 0); 1967 btrfs_free_block_rsv(root, root->orphan_block_rsv); 1968 root->orphan_block_rsv = NULL; 1969 } 1970 } 1971 1972 /* 1973 * This creates an orphan entry for the given inode in case something goes 1974 * wrong in the middle of an unlink/truncate. 1975 * 1976 * NOTE: caller of this function should reserve 5 units of metadata for 1977 * this function. 1978 */ 1979 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 1980 { 1981 struct btrfs_root *root = BTRFS_I(inode)->root; 1982 struct btrfs_block_rsv *block_rsv = NULL; 1983 int reserve = 0; 1984 int insert = 0; 1985 int ret; 1986 1987 if (!root->orphan_block_rsv) { 1988 block_rsv = btrfs_alloc_block_rsv(root); 1989 if (!block_rsv) 1990 return -ENOMEM; 1991 } 1992 1993 spin_lock(&root->orphan_lock); 1994 if (!root->orphan_block_rsv) { 1995 root->orphan_block_rsv = block_rsv; 1996 } else if (block_rsv) { 1997 btrfs_free_block_rsv(root, block_rsv); 1998 block_rsv = NULL; 1999 } 2000 2001 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2002 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2003 #if 0 2004 /* 2005 * For proper ENOSPC handling, we should do orphan 2006 * cleanup when mounting. But this introduces a backward 2007 * compatibility issue. 2008 */ 2009 if (!xchg(&root->orphan_item_inserted, 1)) 2010 insert = 2; 2011 else 2012 insert = 1; 2013 #endif 2014 insert = 1; 2015 } 2016 2017 if (!BTRFS_I(inode)->orphan_meta_reserved) { 2018 BTRFS_I(inode)->orphan_meta_reserved = 1; 2019 reserve = 1; 2020 } 2021 spin_unlock(&root->orphan_lock); 2022 2023 /* grab metadata reservation from transaction handle */ 2024 if (reserve) { 2025 ret = btrfs_orphan_reserve_metadata(trans, inode); 2026 BUG_ON(ret); 2027 } 2028 2029 /* insert an orphan item to track this unlinked/truncated file */ 2030 if (insert >= 1) { 2031 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2032 BUG_ON(ret); 2033 } 2034 2035 /* insert an orphan item to track that the subvolume contains orphan files */ 2036 if (insert >= 2) { 2037 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, 2038 root->root_key.objectid); 2039 BUG_ON(ret); 2040 } 2041 return 0; 2042 } 2043 2044 /* 2045 * We have done the truncate/delete so we can go ahead and remove the orphan 2046 * item for this particular inode.
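 * This is the counterpart of btrfs_orphan_add(): the in-memory list entry
 * is always dropped, while the on-disk orphan item is deleted only when a
 * transaction handle is supplied.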
2047 */ 2048 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2049 { 2050 struct btrfs_root *root = BTRFS_I(inode)->root; 2051 int delete_item = 0; 2052 int release_rsv = 0; 2053 int ret = 0; 2054 2055 spin_lock(&root->orphan_lock); 2056 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2057 list_del_init(&BTRFS_I(inode)->i_orphan); 2058 delete_item = 1; 2059 } 2060 2061 if (BTRFS_I(inode)->orphan_meta_reserved) { 2062 BTRFS_I(inode)->orphan_meta_reserved = 0; 2063 release_rsv = 1; 2064 } 2065 spin_unlock(&root->orphan_lock); 2066 2067 if (trans && delete_item) { 2068 ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); 2069 BUG_ON(ret); 2070 } 2071 2072 if (release_rsv) 2073 btrfs_orphan_release_metadata(inode); 2074 2075 return 0; 2076 } 2077 2078 /* 2079 * this cleans up any orphans that may be left on the list from the last use 2080 * of this root. 2081 */ 2082 int btrfs_orphan_cleanup(struct btrfs_root *root) 2083 { 2084 struct btrfs_path *path; 2085 struct extent_buffer *leaf; 2086 struct btrfs_key key, found_key; 2087 struct btrfs_trans_handle *trans; 2088 struct inode *inode; 2089 u64 last_objectid = 0; 2090 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2091 2092 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2093 return 0; 2094 2095 path = btrfs_alloc_path(); 2096 if (!path) { 2097 ret = -ENOMEM; 2098 goto out; 2099 } 2100 path->reada = -1; 2101 2102 key.objectid = BTRFS_ORPHAN_OBJECTID; 2103 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 2104 key.offset = (u64)-1; 2105 2106 while (1) { 2107 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2108 if (ret < 0) 2109 goto out; 2110 2111 /* 2112 * ret == 0 means we found what we were searching for, which 2113 * is weird, but possible, so only screw with the path if we 2114 * didn't find the key and see if we have stuff that matches 2115 */ 2116 if (ret > 0) { 2117 ret = 0; 2118 if (path->slots[0] == 0) 2119 break; 2120 path->slots[0]--; 2121 } 2122 2123 /* pull out the item */ 2124 leaf = path->nodes[0]; 2125 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2126 2127 /* make sure the item matches what we want */ 2128 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 2129 break; 2130 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) 2131 break; 2132 2133 /* release the path since we're done with it */ 2134 btrfs_release_path(path); 2135 2136 /* 2137 * this is where we basically do a btrfs_lookup, without the 2138 * crossing-root thing. we store the inode number in the 2139 * offset of the orphan item. 2140 */ 2141 2142 if (found_key.offset == last_objectid) { 2143 printk(KERN_ERR "btrfs: Error removing orphan entry, " 2144 "stopping orphan cleanup\n"); 2145 ret = -EINVAL; 2146 goto out; 2147 } 2148 2149 last_objectid = found_key.offset; 2150 2151 found_key.objectid = found_key.offset; 2152 found_key.type = BTRFS_INODE_ITEM_KEY; 2153 found_key.offset = 0; 2154 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2155 ret = PTR_RET(inode); 2156 if (ret && ret != -ESTALE) 2157 goto out; 2158 2159 /* 2160 * Inode is already gone but the orphan item is still there, 2161 * kill the orphan item.
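 * (btrfs_iget returned -ESTALE, so a small one-item transaction is
 * enough to delete the stale orphan entry before moving on.)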
2162 */ 2163 if (ret == -ESTALE) { 2164 trans = btrfs_start_transaction(root, 1); 2165 if (IS_ERR(trans)) { 2166 ret = PTR_ERR(trans); 2167 goto out; 2168 } 2169 ret = btrfs_del_orphan_item(trans, root, 2170 found_key.objectid); 2171 BUG_ON(ret); 2172 btrfs_end_transaction(trans, root); 2173 continue; 2174 } 2175 2176 /* 2177 * add this inode to the orphan list so btrfs_orphan_del does 2178 * the proper thing when we hit it 2179 */ 2180 spin_lock(&root->orphan_lock); 2181 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2182 spin_unlock(&root->orphan_lock); 2183 2184 /* if we have links, this was a truncate, let's do that */ 2185 if (inode->i_nlink) { 2186 if (!S_ISREG(inode->i_mode)) { 2187 WARN_ON(1); 2188 iput(inode); 2189 continue; 2190 } 2191 nr_truncate++; 2192 ret = btrfs_truncate(inode); 2193 } else { 2194 nr_unlink++; 2195 } 2196 2197 /* this will do delete_inode and everything for us */ 2198 iput(inode); 2199 if (ret) 2200 goto out; 2201 } 2202 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2203 2204 if (root->orphan_block_rsv) 2205 btrfs_block_rsv_release(root, root->orphan_block_rsv, 2206 (u64)-1); 2207 2208 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2209 trans = btrfs_join_transaction(root); 2210 if (!IS_ERR(trans)) 2211 btrfs_end_transaction(trans, root); 2212 } 2213 2214 if (nr_unlink) 2215 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2216 if (nr_truncate) 2217 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2218 2219 out: 2220 if (ret) 2221 printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret); 2222 btrfs_free_path(path); 2223 return ret; 2224 } 2225 2226 /* 2227 * very simple check to peek ahead in the leaf looking for xattrs. If we 2228 * don't find any xattrs, we know there can't be any acls. 2229 * 2230 * slot is the slot the inode is in, objectid is the objectid of the inode 2231 */ 2232 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 2233 int slot, u64 objectid) 2234 { 2235 u32 nritems = btrfs_header_nritems(leaf); 2236 struct btrfs_key found_key; 2237 int scanned = 0; 2238 2239 slot++; 2240 while (slot < nritems) { 2241 btrfs_item_key_to_cpu(leaf, &found_key, slot); 2242 2243 /* we found a different objectid, there must not be acls */ 2244 if (found_key.objectid != objectid) 2245 return 0; 2246 2247 /* we found an xattr, assume we've got an acl */ 2248 if (found_key.type == BTRFS_XATTR_ITEM_KEY) 2249 return 1; 2250 2251 /* 2252 * we found a key greater than an xattr key, there can't 2253 * be any acls later on 2254 */ 2255 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 2256 return 0; 2257 2258 slot++; 2259 scanned++; 2260 2261 /* 2262 * it goes inode, inode backrefs, xattrs, extents, 2263 * so if there are a ton of hard links to an inode there can 2264 * be a lot of backrefs. Don't waste time searching too hard, 2265 * this is just an optimization 2266 */ 2267 if (scanned >= 8) 2268 break; 2269 } 2270 /* we hit the end of the leaf before we found an xattr or 2271 * something larger than an xattr.
We have to assume the inode 2272 * has acls 2273 */ 2274 return 1; 2275 } 2276 2277 /* 2278 * read an inode from the btree into the in-memory inode 2279 */ 2280 static void btrfs_read_locked_inode(struct inode *inode) 2281 { 2282 struct btrfs_path *path; 2283 struct extent_buffer *leaf; 2284 struct btrfs_inode_item *inode_item; 2285 struct btrfs_timespec *tspec; 2286 struct btrfs_root *root = BTRFS_I(inode)->root; 2287 struct btrfs_key location; 2288 int maybe_acls; 2289 u32 rdev; 2290 int ret; 2291 bool filled = false; 2292 2293 ret = btrfs_fill_inode(inode, &rdev); 2294 if (!ret) 2295 filled = true; 2296 2297 path = btrfs_alloc_path(); 2298 if (!path) 2299 goto make_bad; 2300 2301 path->leave_spinning = 1; 2302 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2303 2304 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 2305 if (ret) 2306 goto make_bad; 2307 2308 leaf = path->nodes[0]; 2309 2310 if (filled) 2311 goto cache_acl; 2312 2313 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2314 struct btrfs_inode_item); 2315 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2316 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 2317 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2318 inode->i_gid = btrfs_inode_gid(leaf, inode_item); 2319 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 2320 2321 tspec = btrfs_inode_atime(inode_item); 2322 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2323 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2324 2325 tspec = btrfs_inode_mtime(inode_item); 2326 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2327 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2328 2329 tspec = btrfs_inode_ctime(inode_item); 2330 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2331 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2332 2333 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2334 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2335 BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); 2336 inode->i_generation = BTRFS_I(inode)->generation; 2337 inode->i_rdev = 0; 2338 rdev = btrfs_inode_rdev(leaf, inode_item); 2339 2340 BTRFS_I(inode)->index_cnt = (u64)-1; 2341 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2342 cache_acl: 2343 /* 2344 * try to precache a NULL acl entry for files that don't have 2345 * any xattrs or acls 2346 */ 2347 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 2348 btrfs_ino(inode)); 2349 if (!maybe_acls) 2350 cache_no_acl(inode); 2351 2352 btrfs_free_path(path); 2353 2354 switch (inode->i_mode & S_IFMT) { 2355 case S_IFREG: 2356 inode->i_mapping->a_ops = &btrfs_aops; 2357 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2358 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 2359 inode->i_fop = &btrfs_file_operations; 2360 inode->i_op = &btrfs_file_inode_operations; 2361 break; 2362 case S_IFDIR: 2363 inode->i_fop = &btrfs_dir_file_operations; 2364 if (root == root->fs_info->tree_root) 2365 inode->i_op = &btrfs_dir_ro_inode_operations; 2366 else 2367 inode->i_op = &btrfs_dir_inode_operations; 2368 break; 2369 case S_IFLNK: 2370 inode->i_op = &btrfs_symlink_inode_operations; 2371 inode->i_mapping->a_ops = &btrfs_symlink_aops; 2372 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2373 break; 2374 default: 2375 inode->i_op = &btrfs_special_inode_operations; 2376 init_special_inode(inode, inode->i_mode, rdev); 2377 break; 2378 } 2379 2380 
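	/* sync the btrfs inode flags into the generic VFS inode->i_flags */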
btrfs_update_iflags(inode); 2381 return; 2382 2383 make_bad: 2384 btrfs_free_path(path); 2385 make_bad_inode(inode); 2386 } 2387 2388 /* 2389 * given a leaf and an inode, copy the inode fields into the leaf 2390 */ 2391 static void fill_inode_item(struct btrfs_trans_handle *trans, 2392 struct extent_buffer *leaf, 2393 struct btrfs_inode_item *item, 2394 struct inode *inode) 2395 { 2396 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2397 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2398 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2399 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2400 btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 2401 2402 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), 2403 inode->i_atime.tv_sec); 2404 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), 2405 inode->i_atime.tv_nsec); 2406 2407 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), 2408 inode->i_mtime.tv_sec); 2409 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), 2410 inode->i_mtime.tv_nsec); 2411 2412 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), 2413 inode->i_ctime.tv_sec); 2414 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), 2415 inode->i_ctime.tv_nsec); 2416 2417 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2418 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 2419 btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); 2420 btrfs_set_inode_transid(leaf, item, trans->transid); 2421 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2422 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2423 btrfs_set_inode_block_group(leaf, item, 0); 2424 } 2425 2426 /* 2427 * copy everything in the in-memory inode into the btree. 2428 */ 2429 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2430 struct btrfs_root *root, struct inode *inode) 2431 { 2432 struct btrfs_inode_item *inode_item; 2433 struct btrfs_path *path; 2434 struct extent_buffer *leaf; 2435 int ret; 2436 2437 /* 2438 * If the inode is a free space inode, we can deadlock during commit 2439 * if we put it into the delayed code. 2440 * 2441 * The data relocation inode should also be directly updated 2442 * without delay 2443 */ 2444 if (!btrfs_is_free_space_inode(root, inode) 2445 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 2446 ret = btrfs_delayed_update_inode(trans, root, inode); 2447 if (!ret) 2448 btrfs_set_inode_last_trans(trans, inode); 2449 return ret; 2450 } 2451 2452 path = btrfs_alloc_path(); 2453 if (!path) 2454 return -ENOMEM; 2455 2456 path->leave_spinning = 1; 2457 ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 2458 1); 2459 if (ret) { 2460 if (ret > 0) 2461 ret = -ENOENT; 2462 goto failed; 2463 } 2464 2465 btrfs_unlock_up_safe(path, 1); 2466 leaf = path->nodes[0]; 2467 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2468 struct btrfs_inode_item); 2469 2470 fill_inode_item(trans, leaf, inode_item, inode); 2471 btrfs_mark_buffer_dirty(leaf); 2472 btrfs_set_inode_last_trans(trans, inode); 2473 ret = 0; 2474 failed: 2475 btrfs_free_path(path); 2476 return ret; 2477 } 2478 2479 /* 2480 * unlink helper that gets used here in inode.c and in the tree logging 2481 * recovery code. 
It removes a link in a directory with a given name, and 2482 * also drops the back refs in the inode to the directory 2483 */ 2484 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2485 struct btrfs_root *root, 2486 struct inode *dir, struct inode *inode, 2487 const char *name, int name_len) 2488 { 2489 struct btrfs_path *path; 2490 int ret = 0; 2491 struct extent_buffer *leaf; 2492 struct btrfs_dir_item *di; 2493 struct btrfs_key key; 2494 u64 index; 2495 u64 ino = btrfs_ino(inode); 2496 u64 dir_ino = btrfs_ino(dir); 2497 2498 path = btrfs_alloc_path(); 2499 if (!path) { 2500 ret = -ENOMEM; 2501 goto out; 2502 } 2503 2504 path->leave_spinning = 1; 2505 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 2506 name, name_len, -1); 2507 if (IS_ERR(di)) { 2508 ret = PTR_ERR(di); 2509 goto err; 2510 } 2511 if (!di) { 2512 ret = -ENOENT; 2513 goto err; 2514 } 2515 leaf = path->nodes[0]; 2516 btrfs_dir_item_key_to_cpu(leaf, di, &key); 2517 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2518 if (ret) 2519 goto err; 2520 btrfs_release_path(path); 2521 2522 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, 2523 dir_ino, &index); 2524 if (ret) { 2525 printk(KERN_INFO "btrfs failed to delete reference to %.*s, " 2526 "inode %llu parent %llu\n", name_len, name, 2527 (unsigned long long)ino, (unsigned long long)dir_ino); 2528 goto err; 2529 } 2530 2531 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 2532 if (ret) 2533 goto err; 2534 2535 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2536 inode, dir_ino); 2537 BUG_ON(ret != 0 && ret != -ENOENT); 2538 2539 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2540 dir, index); 2541 if (ret == -ENOENT) 2542 ret = 0; 2543 err: 2544 btrfs_free_path(path); 2545 if (ret) 2546 goto out; 2547 2548 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2549 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2550 btrfs_update_inode(trans, root, dir); 2551 out: 2552 return ret; 2553 } 2554 2555 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2556 struct btrfs_root *root, 2557 struct inode *dir, struct inode *inode, 2558 const char *name, int name_len) 2559 { 2560 int ret; 2561 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 2562 if (!ret) { 2563 btrfs_drop_nlink(inode); 2564 ret = btrfs_update_inode(trans, root, inode); 2565 } 2566 return ret; 2567 } 2568 2569 2570 /* helper to check if there is any shared block in the path */ 2571 static int check_path_shared(struct btrfs_root *root, 2572 struct btrfs_path *path) 2573 { 2574 struct extent_buffer *eb; 2575 int level; 2576 u64 refs = 1; 2577 2578 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2579 int ret; 2580 2581 if (!path->nodes[level]) 2582 break; 2583 eb = path->nodes[level]; 2584 if (!btrfs_block_can_be_shared(root, eb)) 2585 continue; 2586 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, 2587 &refs, NULL); 2588 if (refs > 1) 2589 return 1; 2590 } 2591 return 0; 2592 } 2593 2594 /* 2595 * helper to start transaction for unlink and rmdir. 2596 * 2597 * unlink and rmdir are special in btrfs, they do not always free space. 2598 * so in the enospc case, we should make sure they will free space before 2599 * allowing them to use the global metadata reservation.
2600 */ 2601 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, 2602 struct dentry *dentry) 2603 { 2604 struct btrfs_trans_handle *trans; 2605 struct btrfs_root *root = BTRFS_I(dir)->root; 2606 struct btrfs_path *path; 2607 struct btrfs_inode_ref *ref; 2608 struct btrfs_dir_item *di; 2609 struct inode *inode = dentry->d_inode; 2610 u64 index; 2611 int check_link = 1; 2612 int err = -ENOSPC; 2613 int ret; 2614 u64 ino = btrfs_ino(inode); 2615 u64 dir_ino = btrfs_ino(dir); 2616 2617 /* 2618 * 1 for the possible orphan item 2619 * 1 for the dir item 2620 * 1 for the dir index 2621 * 1 for the inode ref 2622 * 1 for the inode ref in the tree log 2623 * 2 for the dir entries in the log 2624 * 1 for the inode 2625 */ 2626 trans = btrfs_start_transaction(root, 8); 2627 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2628 return trans; 2629 2630 if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 2631 return ERR_PTR(-ENOSPC); 2632 2633 /* check if someone else holds a reference */ 2634 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) 2635 return ERR_PTR(-ENOSPC); 2636 2637 if (atomic_read(&inode->i_count) > 2) 2638 return ERR_PTR(-ENOSPC); 2639 2640 if (xchg(&root->fs_info->enospc_unlink, 1)) 2641 return ERR_PTR(-ENOSPC); 2642 2643 path = btrfs_alloc_path(); 2644 if (!path) { 2645 root->fs_info->enospc_unlink = 0; 2646 return ERR_PTR(-ENOMEM); 2647 } 2648 2649 /* 1 for the orphan item */ 2650 trans = btrfs_start_transaction(root, 1); 2651 if (IS_ERR(trans)) { 2652 btrfs_free_path(path); 2653 root->fs_info->enospc_unlink = 0; 2654 return trans; 2655 } 2656 2657 path->skip_locking = 1; 2658 path->search_commit_root = 1; 2659 2660 ret = btrfs_lookup_inode(trans, root, path, 2661 &BTRFS_I(dir)->location, 0); 2662 if (ret < 0) { 2663 err = ret; 2664 goto out; 2665 } 2666 if (ret == 0) { 2667 if (check_path_shared(root, path)) 2668 goto out; 2669 } else { 2670 check_link = 0; 2671 } 2672 btrfs_release_path(path); 2673 2674 ret = btrfs_lookup_inode(trans, root, path, 2675 &BTRFS_I(inode)->location, 0); 2676 if (ret < 0) { 2677 err = ret; 2678 goto out; 2679 } 2680 if (ret == 0) { 2681 if (check_path_shared(root, path)) 2682 goto out; 2683 } else { 2684 check_link = 0; 2685 } 2686 btrfs_release_path(path); 2687 2688 if (ret == 0 && S_ISREG(inode->i_mode)) { 2689 ret = btrfs_lookup_file_extent(trans, root, path, 2690 ino, (u64)-1, 0); 2691 if (ret < 0) { 2692 err = ret; 2693 goto out; 2694 } 2695 BUG_ON(ret == 0); 2696 if (check_path_shared(root, path)) 2697 goto out; 2698 btrfs_release_path(path); 2699 } 2700 2701 if (!check_link) { 2702 err = 0; 2703 goto out; 2704 } 2705 2706 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 2707 dentry->d_name.name, dentry->d_name.len, 0); 2708 if (IS_ERR(di)) { 2709 err = PTR_ERR(di); 2710 goto out; 2711 } 2712 if (di) { 2713 if (check_path_shared(root, path)) 2714 goto out; 2715 } else { 2716 err = 0; 2717 goto out; 2718 } 2719 btrfs_release_path(path); 2720 2721 ref = btrfs_lookup_inode_ref(trans, root, path, 2722 dentry->d_name.name, dentry->d_name.len, 2723 ino, dir_ino, 0); 2724 if (IS_ERR(ref)) { 2725 err = PTR_ERR(ref); 2726 goto out; 2727 } 2728 BUG_ON(!ref); 2729 if (check_path_shared(root, path)) 2730 goto out; 2731 index = btrfs_inode_ref_index(path->nodes[0], ref); 2732 btrfs_release_path(path); 2733 2734 /* 2735 * This is a commit root search, if we can look up the inode item and 2736 * other related items in the commit root, it means the transaction of 2737 * dir/file creation has been committed, and the dir index
item that we 2738 * delay to insert has also been inserted into the commit root. So 2739 * we needn't worry about the delayed insertion of the dir index item 2740 * here. 2741 */ 2742 di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index, 2743 dentry->d_name.name, dentry->d_name.len, 0); 2744 if (IS_ERR(di)) { 2745 err = PTR_ERR(di); 2746 goto out; 2747 } 2748 BUG_ON(ret == -ENOENT); 2749 if (check_path_shared(root, path)) 2750 goto out; 2751 2752 err = 0; 2753 out: 2754 btrfs_free_path(path); 2755 /* Migrate the orphan reservation over */ 2756 if (!err) 2757 err = btrfs_block_rsv_migrate(trans->block_rsv, 2758 &root->fs_info->global_block_rsv, 2759 trans->bytes_reserved); 2760 2761 if (err) { 2762 btrfs_end_transaction(trans, root); 2763 root->fs_info->enospc_unlink = 0; 2764 return ERR_PTR(err); 2765 } 2766 2767 trans->block_rsv = &root->fs_info->global_block_rsv; 2768 return trans; 2769 } 2770 2771 static void __unlink_end_trans(struct btrfs_trans_handle *trans, 2772 struct btrfs_root *root) 2773 { 2774 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 2775 btrfs_block_rsv_release(root, trans->block_rsv, 2776 trans->bytes_reserved); 2777 trans->block_rsv = &root->fs_info->trans_block_rsv; 2778 BUG_ON(!root->fs_info->enospc_unlink); 2779 root->fs_info->enospc_unlink = 0; 2780 } 2781 btrfs_end_transaction_throttle(trans, root); 2782 } 2783 2784 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2785 { 2786 struct btrfs_root *root = BTRFS_I(dir)->root; 2787 struct btrfs_trans_handle *trans; 2788 struct inode *inode = dentry->d_inode; 2789 int ret; 2790 unsigned long nr = 0; 2791 2792 trans = __unlink_start_trans(dir, dentry); 2793 if (IS_ERR(trans)) 2794 return PTR_ERR(trans); 2795 2796 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 2797 2798 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2799 dentry->d_name.name, dentry->d_name.len); 2800 if (ret) 2801 goto out; 2802 2803 if (inode->i_nlink == 0) { 2804 ret = btrfs_orphan_add(trans, inode); 2805 if (ret) 2806 goto out; 2807 } 2808 2809 out: 2810 nr = trans->blocks_used; 2811 __unlink_end_trans(trans, root); 2812 btrfs_btree_balance_dirty(root, nr); 2813 return ret; 2814 } 2815 2816 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 2817 struct btrfs_root *root, 2818 struct inode *dir, u64 objectid, 2819 const char *name, int name_len) 2820 { 2821 struct btrfs_path *path; 2822 struct extent_buffer *leaf; 2823 struct btrfs_dir_item *di; 2824 struct btrfs_key key; 2825 u64 index; 2826 int ret; 2827 u64 dir_ino = btrfs_ino(dir); 2828 2829 path = btrfs_alloc_path(); 2830 if (!path) 2831 return -ENOMEM; 2832 2833 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 2834 name, name_len, -1); 2835 BUG_ON(IS_ERR_OR_NULL(di)); 2836 2837 leaf = path->nodes[0]; 2838 btrfs_dir_item_key_to_cpu(leaf, di, &key); 2839 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 2840 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2841 BUG_ON(ret); 2842 btrfs_release_path(path); 2843 2844 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, 2845 objectid, root->root_key.objectid, 2846 dir_ino, &index, name, name_len); 2847 if (ret < 0) { 2848 BUG_ON(ret != -ENOENT); 2849 di = btrfs_search_dir_index_item(root, path, dir_ino, 2850 name, name_len); 2851 BUG_ON(IS_ERR_OR_NULL(di)); 2852 2853 leaf = path->nodes[0]; 2854 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2855 btrfs_release_path(path); 2856 index = key.offset; 2857 } 2858 btrfs_release_path(path); 2859 
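	/*
	 * by this point "index" holds the dir index of the entry, found via
	 * either the root ref deletion above or the dir index fallback search.
	 */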
2860 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 2861 BUG_ON(ret); 2862 2863 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2864 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2865 ret = btrfs_update_inode(trans, root, dir); 2866 BUG_ON(ret); 2867 2868 btrfs_free_path(path); 2869 return 0; 2870 } 2871 2872 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 2873 { 2874 struct inode *inode = dentry->d_inode; 2875 int err = 0; 2876 struct btrfs_root *root = BTRFS_I(dir)->root; 2877 struct btrfs_trans_handle *trans; 2878 unsigned long nr = 0; 2879 2880 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 2881 btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) 2882 return -ENOTEMPTY; 2883 2884 trans = __unlink_start_trans(dir, dentry); 2885 if (IS_ERR(trans)) 2886 return PTR_ERR(trans); 2887 2888 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 2889 err = btrfs_unlink_subvol(trans, root, dir, 2890 BTRFS_I(inode)->location.objectid, 2891 dentry->d_name.name, 2892 dentry->d_name.len); 2893 goto out; 2894 } 2895 2896 err = btrfs_orphan_add(trans, inode); 2897 if (err) 2898 goto out; 2899 2900 /* now the directory is empty */ 2901 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2902 dentry->d_name.name, dentry->d_name.len); 2903 if (!err) 2904 btrfs_i_size_write(inode, 0); 2905 out: 2906 nr = trans->blocks_used; 2907 __unlink_end_trans(trans, root); 2908 btrfs_btree_balance_dirty(root, nr); 2909 2910 return err; 2911 } 2912 2913 /* 2914 * this can truncate away extent items, csum items and directory items. 2915 * It starts at a high offset and removes keys until it can't find 2916 * any higher than new_size 2917 * 2918 * csum items that cross the new i_size are truncated to the new size 2919 * as well. 2920 * 2921 * min_type is the minimum key type to truncate down to. If set to 0, this 2922 * will kill all the items on this inode, including the INODE_ITEM_KEY. 2923 */ 2924 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 2925 struct btrfs_root *root, 2926 struct inode *inode, 2927 u64 new_size, u32 min_type) 2928 { 2929 struct btrfs_path *path; 2930 struct extent_buffer *leaf; 2931 struct btrfs_file_extent_item *fi; 2932 struct btrfs_key key; 2933 struct btrfs_key found_key; 2934 u64 extent_start = 0; 2935 u64 extent_num_bytes = 0; 2936 u64 extent_offset = 0; 2937 u64 item_end = 0; 2938 u64 mask = root->sectorsize - 1; 2939 u32 found_type = (u8)-1; 2940 int found_extent; 2941 int del_item; 2942 int pending_del_nr = 0; 2943 int pending_del_slot = 0; 2944 int extent_type = -1; 2945 int encoding; 2946 int ret; 2947 int err = 0; 2948 u64 ino = btrfs_ino(inode); 2949 2950 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 2951 2952 path = btrfs_alloc_path(); 2953 if (!path) 2954 return -ENOMEM; 2955 path->reada = -1; 2956 2957 if (root->ref_cows || root == root->fs_info->tree_root) 2958 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 2959 2960 /* 2961 * This function is also used to drop the items in the log tree before 2962 * we relog the inode, so if root != BTRFS_I(inode)->root, it means 2963 * it is used to drop the logged items. So we shouldn't kill the delayed 2964 * items.
2965 */ 2966 if (min_type == 0 && root == BTRFS_I(inode)->root) 2967 btrfs_kill_delayed_inode_items(inode); 2968 2969 key.objectid = ino; 2970 key.offset = (u64)-1; 2971 key.type = (u8)-1; 2972 2973 search_again: 2974 path->leave_spinning = 1; 2975 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2976 if (ret < 0) { 2977 err = ret; 2978 goto out; 2979 } 2980 2981 if (ret > 0) { 2982 /* there are no items in the tree for us to truncate, we're 2983 * done 2984 */ 2985 if (path->slots[0] == 0) 2986 goto out; 2987 path->slots[0]--; 2988 } 2989 2990 while (1) { 2991 fi = NULL; 2992 leaf = path->nodes[0]; 2993 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2994 found_type = btrfs_key_type(&found_key); 2995 encoding = 0; 2996 2997 if (found_key.objectid != ino) 2998 break; 2999 3000 if (found_type < min_type) 3001 break; 3002 3003 item_end = found_key.offset; 3004 if (found_type == BTRFS_EXTENT_DATA_KEY) { 3005 fi = btrfs_item_ptr(leaf, path->slots[0], 3006 struct btrfs_file_extent_item); 3007 extent_type = btrfs_file_extent_type(leaf, fi); 3008 encoding = btrfs_file_extent_compression(leaf, fi); 3009 encoding |= btrfs_file_extent_encryption(leaf, fi); 3010 encoding |= btrfs_file_extent_other_encoding(leaf, fi); 3011 3012 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3013 item_end += 3014 btrfs_file_extent_num_bytes(leaf, fi); 3015 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3016 item_end += btrfs_file_extent_inline_len(leaf, 3017 fi); 3018 } 3019 item_end--; 3020 } 3021 if (found_type > min_type) { 3022 del_item = 1; 3023 } else { 3024 if (item_end < new_size) 3025 break; 3026 if (found_key.offset >= new_size) 3027 del_item = 1; 3028 else 3029 del_item = 0; 3030 } 3031 found_extent = 0; 3032 /* FIXME, shrink the extent if the ref count is only 1 */ 3033 if (found_type != BTRFS_EXTENT_DATA_KEY) 3034 goto delete; 3035 3036 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3037 u64 num_dec; 3038 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 3039 if (!del_item && !encoding) { 3040 u64 orig_num_bytes = 3041 btrfs_file_extent_num_bytes(leaf, fi); 3042 extent_num_bytes = new_size - 3043 found_key.offset + root->sectorsize - 1; 3044 extent_num_bytes = extent_num_bytes & 3045 ~((u64)root->sectorsize - 1); 3046 btrfs_set_file_extent_num_bytes(leaf, fi, 3047 extent_num_bytes); 3048 num_dec = (orig_num_bytes - 3049 extent_num_bytes); 3050 if (root->ref_cows && extent_start != 0) 3051 inode_sub_bytes(inode, num_dec); 3052 btrfs_mark_buffer_dirty(leaf); 3053 } else { 3054 extent_num_bytes = 3055 btrfs_file_extent_disk_num_bytes(leaf, 3056 fi); 3057 extent_offset = found_key.offset - 3058 btrfs_file_extent_offset(leaf, fi); 3059 3060 /* FIXME blocksize != 4096 */ 3061 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 3062 if (extent_start != 0) { 3063 found_extent = 1; 3064 if (root->ref_cows) 3065 inode_sub_bytes(inode, num_dec); 3066 } 3067 } 3068 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3069 /* 3070 * we can't truncate inline items that have had 3071 * special encodings 3072 */ 3073 if (!del_item && 3074 btrfs_file_extent_compression(leaf, fi) == 0 && 3075 btrfs_file_extent_encryption(leaf, fi) == 0 && 3076 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 3077 u32 size = new_size - found_key.offset; 3078 3079 if (root->ref_cows) { 3080 inode_sub_bytes(inode, item_end + 1 - 3081 new_size); 3082 } 3083 size = 3084 btrfs_file_extent_calc_inline_size(size); 3085 ret = btrfs_truncate_item(trans, root, path, 3086 size, 1); 3087 } else if (root->ref_cows) { 3088 
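	/* drop the inline extent's full ram size from the inode's byte count */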
inode_sub_bytes(inode, item_end + 1 - 3089 found_key.offset); 3090 } 3091 } 3092 delete: 3093 if (del_item) { 3094 if (!pending_del_nr) { 3095 /* no pending yet, add ourselves */ 3096 pending_del_slot = path->slots[0]; 3097 pending_del_nr = 1; 3098 } else if (pending_del_nr && 3099 path->slots[0] + 1 == pending_del_slot) { 3100 /* hop on the pending chunk */ 3101 pending_del_nr++; 3102 pending_del_slot = path->slots[0]; 3103 } else { 3104 BUG(); 3105 } 3106 } else { 3107 break; 3108 } 3109 if (found_extent && (root->ref_cows || 3110 root == root->fs_info->tree_root)) { 3111 btrfs_set_path_blocking(path); 3112 ret = btrfs_free_extent(trans, root, extent_start, 3113 extent_num_bytes, 0, 3114 btrfs_header_owner(leaf), 3115 ino, extent_offset); 3116 BUG_ON(ret); 3117 } 3118 3119 if (found_type == BTRFS_INODE_ITEM_KEY) 3120 break; 3121 3122 if (path->slots[0] == 0 || 3123 path->slots[0] != pending_del_slot) { 3124 if (root->ref_cows && 3125 BTRFS_I(inode)->location.objectid != 3126 BTRFS_FREE_INO_OBJECTID) { 3127 err = -EAGAIN; 3128 goto out; 3129 } 3130 if (pending_del_nr) { 3131 ret = btrfs_del_items(trans, root, path, 3132 pending_del_slot, 3133 pending_del_nr); 3134 BUG_ON(ret); 3135 pending_del_nr = 0; 3136 } 3137 btrfs_release_path(path); 3138 goto search_again; 3139 } else { 3140 path->slots[0]--; 3141 } 3142 } 3143 out: 3144 if (pending_del_nr) { 3145 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3146 pending_del_nr); 3147 BUG_ON(ret); 3148 } 3149 btrfs_free_path(path); 3150 return err; 3151 } 3152 3153 /* 3154 * taken from block_truncate_page, but does cow as it zeros out 3155 * any bytes left in the last page in the file. 3156 */ 3157 static int btrfs_truncate_page(struct address_space *mapping, loff_t from) 3158 { 3159 struct inode *inode = mapping->host; 3160 struct btrfs_root *root = BTRFS_I(inode)->root; 3161 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3162 struct btrfs_ordered_extent *ordered; 3163 struct extent_state *cached_state = NULL; 3164 char *kaddr; 3165 u32 blocksize = root->sectorsize; 3166 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3167 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3168 struct page *page; 3169 gfp_t mask = btrfs_alloc_write_mask(mapping); 3170 int ret = 0; 3171 u64 page_start; 3172 u64 page_end; 3173 3174 if ((offset & (blocksize - 1)) == 0) 3175 goto out; 3176 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 3177 if (ret) 3178 goto out; 3179 3180 ret = -ENOMEM; 3181 again: 3182 page = find_or_create_page(mapping, index, mask); 3183 if (!page) { 3184 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3185 goto out; 3186 } 3187 3188 page_start = page_offset(page); 3189 page_end = page_start + PAGE_CACHE_SIZE - 1; 3190 3191 if (!PageUptodate(page)) { 3192 ret = btrfs_readpage(NULL, page); 3193 lock_page(page); 3194 if (page->mapping != mapping) { 3195 unlock_page(page); 3196 page_cache_release(page); 3197 goto again; 3198 } 3199 if (!PageUptodate(page)) { 3200 ret = -EIO; 3201 goto out_unlock; 3202 } 3203 } 3204 wait_on_page_writeback(page); 3205 3206 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, 3207 GFP_NOFS); 3208 set_page_extent_mapped(page); 3209 3210 ordered = btrfs_lookup_ordered_extent(inode, page_start); 3211 if (ordered) { 3212 unlock_extent_cached(io_tree, page_start, page_end, 3213 &cached_state, GFP_NOFS); 3214 unlock_page(page); 3215 page_cache_release(page); 3216 btrfs_start_ordered_extent(inode, ordered, 1); 3217 btrfs_put_ordered_extent(ordered); 3218 goto again; 3219 } 3220 
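	/*
	 * clear any stale dirty/delalloc bits on the range so the accounting
	 * stays balanced when we re-mark it delalloc below
	 */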
3221 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 3222 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3223 0, 0, &cached_state, GFP_NOFS); 3224 3225 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 3226 &cached_state); 3227 if (ret) { 3228 unlock_extent_cached(io_tree, page_start, page_end, 3229 &cached_state, GFP_NOFS); 3230 goto out_unlock; 3231 } 3232 3233 ret = 0; 3234 if (offset != PAGE_CACHE_SIZE) { 3235 kaddr = kmap(page); 3236 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 3237 flush_dcache_page(page); 3238 kunmap(page); 3239 } 3240 ClearPageChecked(page); 3241 set_page_dirty(page); 3242 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, 3243 GFP_NOFS); 3244 3245 out_unlock: 3246 if (ret) 3247 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3248 unlock_page(page); 3249 page_cache_release(page); 3250 out: 3251 return ret; 3252 } 3253 3254 /* 3255 * This function puts in dummy file extents for the area we're creating a hole 3256 * for. So if we are truncating this file to a larger size we need to insert 3257 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE 3258 * for the range between oldsize and size 3259 */ 3260 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 3261 { 3262 struct btrfs_trans_handle *trans; 3263 struct btrfs_root *root = BTRFS_I(inode)->root; 3264 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3265 struct extent_map *em = NULL; 3266 struct extent_state *cached_state = NULL; 3267 u64 mask = root->sectorsize - 1; 3268 u64 hole_start = (oldsize + mask) & ~mask; 3269 u64 block_end = (size + mask) & ~mask; 3270 u64 last_byte; 3271 u64 cur_offset; 3272 u64 hole_size; 3273 int err = 0; 3274 3275 if (size <= hole_start) 3276 return 0; 3277 3278 while (1) { 3279 struct btrfs_ordered_extent *ordered; 3280 btrfs_wait_ordered_range(inode, hole_start, 3281 block_end - hole_start); 3282 lock_extent_bits(io_tree, hole_start, block_end - 1, 0, 3283 &cached_state, GFP_NOFS); 3284 ordered = btrfs_lookup_ordered_extent(inode, hole_start); 3285 if (!ordered) 3286 break; 3287 unlock_extent_cached(io_tree, hole_start, block_end - 1, 3288 &cached_state, GFP_NOFS); 3289 btrfs_put_ordered_extent(ordered); 3290 } 3291 3292 cur_offset = hole_start; 3293 while (1) { 3294 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 3295 block_end - cur_offset, 0); 3296 BUG_ON(IS_ERR_OR_NULL(em)); 3297 last_byte = min(extent_map_end(em), block_end); 3298 last_byte = (last_byte + mask) & ~mask; 3299 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3300 u64 hint_byte = 0; 3301 hole_size = last_byte - cur_offset; 3302 3303 trans = btrfs_start_transaction(root, 2); 3304 if (IS_ERR(trans)) { 3305 err = PTR_ERR(trans); 3306 break; 3307 } 3308 3309 err = btrfs_drop_extents(trans, inode, cur_offset, 3310 cur_offset + hole_size, 3311 &hint_byte, 1); 3312 if (err) { 3313 btrfs_end_transaction(trans, root); 3314 break; 3315 } 3316 3317 err = btrfs_insert_file_extent(trans, root, 3318 btrfs_ino(inode), cur_offset, 0, 3319 0, hole_size, 0, hole_size, 3320 0, 0, 0); 3321 if (err) { 3322 btrfs_end_transaction(trans, root); 3323 break; 3324 } 3325 3326 btrfs_drop_extent_cache(inode, hole_start, 3327 last_byte - 1, 0); 3328 3329 btrfs_end_transaction(trans, root); 3330 } 3331 free_extent_map(em); 3332 em = NULL; 3333 cur_offset = last_byte; 3334 if (cur_offset >= block_end) 3335 break; 3336 } 3337 3338 free_extent_map(em); 3339 unlock_extent_cached(io_tree, hole_start, block_end - 1,
&cached_state, 3340 GFP_NOFS); 3341 return err; 3342 } 3343 3344 static int btrfs_setsize(struct inode *inode, loff_t newsize) 3345 { 3346 loff_t oldsize = i_size_read(inode); 3347 int ret; 3348 3349 if (newsize == oldsize) 3350 return 0; 3351 3352 if (newsize > oldsize) { 3353 i_size_write(inode, newsize); 3354 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 3355 truncate_pagecache(inode, oldsize, newsize); 3356 ret = btrfs_cont_expand(inode, oldsize, newsize); 3357 if (ret) { 3358 btrfs_setsize(inode, oldsize); 3359 return ret; 3360 } 3361 3362 mark_inode_dirty(inode); 3363 } else { 3364 3365 /* 3366 * We're truncating a file that used to have good data down to 3367 * zero. Make sure it gets into the ordered flush list so that 3368 * any new writes get down to disk quickly. 3369 */ 3370 if (newsize == 0) 3371 BTRFS_I(inode)->ordered_data_close = 1; 3372 3373 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 3374 truncate_setsize(inode, newsize); 3375 ret = btrfs_truncate(inode); 3376 } 3377 3378 return ret; 3379 } 3380 3381 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3382 { 3383 struct inode *inode = dentry->d_inode; 3384 struct btrfs_root *root = BTRFS_I(inode)->root; 3385 int err; 3386 3387 if (btrfs_root_readonly(root)) 3388 return -EROFS; 3389 3390 err = inode_change_ok(inode, attr); 3391 if (err) 3392 return err; 3393 3394 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3395 err = btrfs_setsize(inode, attr->ia_size); 3396 if (err) 3397 return err; 3398 } 3399 3400 if (attr->ia_valid) { 3401 setattr_copy(inode, attr); 3402 mark_inode_dirty(inode); 3403 3404 if (attr->ia_valid & ATTR_MODE) 3405 err = btrfs_acl_chmod(inode); 3406 } 3407 3408 return err; 3409 } 3410 3411 void btrfs_evict_inode(struct inode *inode) 3412 { 3413 struct btrfs_trans_handle *trans; 3414 struct btrfs_root *root = BTRFS_I(inode)->root; 3415 struct btrfs_block_rsv *rsv, *global_rsv; 3416 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 3417 unsigned long nr; 3418 int ret; 3419 3420 trace_btrfs_inode_evict(inode); 3421 3422 truncate_inode_pages(&inode->i_data, 0); 3423 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3424 btrfs_is_free_space_inode(root, inode))) 3425 goto no_delete; 3426 3427 if (is_bad_inode(inode)) { 3428 btrfs_orphan_del(NULL, inode); 3429 goto no_delete; 3430 } 3431 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ 3432 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3433 3434 if (root->fs_info->log_root_recovering) { 3435 BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); 3436 goto no_delete; 3437 } 3438 3439 if (inode->i_nlink > 0) { 3440 BUG_ON(btrfs_root_refs(&root->root_item) != 0); 3441 goto no_delete; 3442 } 3443 3444 rsv = btrfs_alloc_block_rsv(root); 3445 if (!rsv) { 3446 btrfs_orphan_del(NULL, inode); 3447 goto no_delete; 3448 } 3449 rsv->size = min_size; 3450 global_rsv = &root->fs_info->global_block_rsv; 3451 3452 btrfs_i_size_write(inode, 0); 3453 3454 /* 3455 * This is a bit simpler than btrfs_truncate since 3456 * 3457 * 1) We've already reserved our space for our orphan item in the 3458 * unlink. 3459 * 2) We're going to delete the inode item, so we don't need to update 3460 * it at all. 3461 * 3462 * So we just need to reserve some slack space in case we add bytes when 3463 * doing the truncate. 
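 * If the reservation cannot be refilled we try to steal from the global
 * reserve; if that also fails we leave the orphan item in place so the
 * truncate is retried on the next mount.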
3464 */ 3465 while (1) { 3466 ret = btrfs_block_rsv_refill(root, rsv, min_size); 3467 3468 /* 3469 * Try and steal from the global reserve since we will 3470 * likely not use this space anyway, we want to try as 3471 * hard as possible to get this to work. 3472 */ 3473 if (ret) 3474 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); 3475 3476 if (ret) { 3477 printk(KERN_WARNING "Could not get space for a " 3478 "delete, will truncate on mount %d\n", ret); 3479 btrfs_orphan_del(NULL, inode); 3480 btrfs_free_block_rsv(root, rsv); 3481 goto no_delete; 3482 } 3483 3484 trans = btrfs_start_transaction(root, 0); 3485 if (IS_ERR(trans)) { 3486 btrfs_orphan_del(NULL, inode); 3487 btrfs_free_block_rsv(root, rsv); 3488 goto no_delete; 3489 } 3490 3491 trans->block_rsv = rsv; 3492 3493 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3494 if (ret != -EAGAIN) 3495 break; 3496 3497 nr = trans->blocks_used; 3498 btrfs_end_transaction(trans, root); 3499 trans = NULL; 3500 btrfs_btree_balance_dirty(root, nr); 3501 } 3502 3503 btrfs_free_block_rsv(root, rsv); 3504 3505 if (ret == 0) { 3506 trans->block_rsv = root->orphan_block_rsv; 3507 ret = btrfs_orphan_del(trans, inode); 3508 BUG_ON(ret); 3509 } 3510 3511 trans->block_rsv = &root->fs_info->trans_block_rsv; 3512 if (!(root == root->fs_info->tree_root || 3513 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3514 btrfs_return_ino(root, btrfs_ino(inode)); 3515 3516 nr = trans->blocks_used; 3517 btrfs_end_transaction(trans, root); 3518 btrfs_btree_balance_dirty(root, nr); 3519 no_delete: 3520 end_writeback(inode); 3521 return; 3522 } 3523 3524 /* 3525 * this returns the key found in the dir entry in the location pointer. 3526 * If no dir entries were found, location->objectid is 0. 3527 */ 3528 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 3529 struct btrfs_key *location) 3530 { 3531 const char *name = dentry->d_name.name; 3532 int namelen = dentry->d_name.len; 3533 struct btrfs_dir_item *di; 3534 struct btrfs_path *path; 3535 struct btrfs_root *root = BTRFS_I(dir)->root; 3536 int ret = 0; 3537 3538 path = btrfs_alloc_path(); 3539 if (!path) 3540 return -ENOMEM; 3541 3542 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, 3543 namelen, 0); 3544 if (IS_ERR(di)) 3545 ret = PTR_ERR(di); 3546 3547 if (IS_ERR_OR_NULL(di)) 3548 goto out_err; 3549 3550 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 3551 out: 3552 btrfs_free_path(path); 3553 return ret; 3554 out_err: 3555 location->objectid = 0; 3556 goto out; 3557 } 3558 3559 /* 3560 * when we hit a tree root in a directory, the btrfs part of the inode 3561 * needs to be changed to reflect the root directory of the tree root. This 3562 * is kind of like crossing a mount point. 
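 * We look up the root ref item in the tree root to validate the name,
 * then read the referenced subvolume root and redirect the location to
 * its root directory.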
3563 */ 3564 static int fixup_tree_root_location(struct btrfs_root *root, 3565 struct inode *dir, 3566 struct dentry *dentry, 3567 struct btrfs_key *location, 3568 struct btrfs_root **sub_root) 3569 { 3570 struct btrfs_path *path; 3571 struct btrfs_root *new_root; 3572 struct btrfs_root_ref *ref; 3573 struct extent_buffer *leaf; 3574 int ret; 3575 int err = 0; 3576 3577 path = btrfs_alloc_path(); 3578 if (!path) { 3579 err = -ENOMEM; 3580 goto out; 3581 } 3582 3583 err = -ENOENT; 3584 ret = btrfs_find_root_ref(root->fs_info->tree_root, path, 3585 BTRFS_I(dir)->root->root_key.objectid, 3586 location->objectid); 3587 if (ret) { 3588 if (ret < 0) 3589 err = ret; 3590 goto out; 3591 } 3592 3593 leaf = path->nodes[0]; 3594 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 3595 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || 3596 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 3597 goto out; 3598 3599 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 3600 (unsigned long)(ref + 1), 3601 dentry->d_name.len); 3602 if (ret) 3603 goto out; 3604 3605 btrfs_release_path(path); 3606 3607 new_root = btrfs_read_fs_root_no_name(root->fs_info, location); 3608 if (IS_ERR(new_root)) { 3609 err = PTR_ERR(new_root); 3610 goto out; 3611 } 3612 3613 if (btrfs_root_refs(&new_root->root_item) == 0) { 3614 err = -ENOENT; 3615 goto out; 3616 } 3617 3618 *sub_root = new_root; 3619 location->objectid = btrfs_root_dirid(&new_root->root_item); 3620 location->type = BTRFS_INODE_ITEM_KEY; 3621 location->offset = 0; 3622 err = 0; 3623 out: 3624 btrfs_free_path(path); 3625 return err; 3626 } 3627 3628 static void inode_tree_add(struct inode *inode) 3629 { 3630 struct btrfs_root *root = BTRFS_I(inode)->root; 3631 struct btrfs_inode *entry; 3632 struct rb_node **p; 3633 struct rb_node *parent; 3634 u64 ino = btrfs_ino(inode); 3635 again: 3636 p = &root->inode_tree.rb_node; 3637 parent = NULL; 3638 3639 if (inode_unhashed(inode)) 3640 return; 3641 3642 spin_lock(&root->inode_lock); 3643 while (*p) { 3644 parent = *p; 3645 entry = rb_entry(parent, struct btrfs_inode, rb_node); 3646 3647 if (ino < btrfs_ino(&entry->vfs_inode)) 3648 p = &parent->rb_left; 3649 else if (ino > btrfs_ino(&entry->vfs_inode)) 3650 p = &parent->rb_right; 3651 else { 3652 WARN_ON(!(entry->vfs_inode.i_state & 3653 (I_WILL_FREE | I_FREEING))); 3654 rb_erase(parent, &root->inode_tree); 3655 RB_CLEAR_NODE(parent); 3656 spin_unlock(&root->inode_lock); 3657 goto again; 3658 } 3659 } 3660 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); 3661 rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3662 spin_unlock(&root->inode_lock); 3663 } 3664 3665 static void inode_tree_del(struct inode *inode) 3666 { 3667 struct btrfs_root *root = BTRFS_I(inode)->root; 3668 int empty = 0; 3669 3670 spin_lock(&root->inode_lock); 3671 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 3672 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3673 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 3674 empty = RB_EMPTY_ROOT(&root->inode_tree); 3675 } 3676 spin_unlock(&root->inode_lock); 3677 3678 /* 3679 * Free space cache has inodes in the tree root, but the tree root has a 3680 * root_refs of 0, so this could end up dropping the tree root as a 3681 * snapshot, so we need the extra !root->fs_info->tree_root check to 3682 * make sure we don't drop it. 
3683 */ 3684 if (empty && btrfs_root_refs(&root->root_item) == 0 && 3685 root != root->fs_info->tree_root) { 3686 synchronize_srcu(&root->fs_info->subvol_srcu); 3687 spin_lock(&root->inode_lock); 3688 empty = RB_EMPTY_ROOT(&root->inode_tree); 3689 spin_unlock(&root->inode_lock); 3690 if (empty) 3691 btrfs_add_dead_root(root); 3692 } 3693 } 3694 3695 int btrfs_invalidate_inodes(struct btrfs_root *root) 3696 { 3697 struct rb_node *node; 3698 struct rb_node *prev; 3699 struct btrfs_inode *entry; 3700 struct inode *inode; 3701 u64 objectid = 0; 3702 3703 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 3704 3705 spin_lock(&root->inode_lock); 3706 again: 3707 node = root->inode_tree.rb_node; 3708 prev = NULL; 3709 while (node) { 3710 prev = node; 3711 entry = rb_entry(node, struct btrfs_inode, rb_node); 3712 3713 if (objectid < btrfs_ino(&entry->vfs_inode)) 3714 node = node->rb_left; 3715 else if (objectid > btrfs_ino(&entry->vfs_inode)) 3716 node = node->rb_right; 3717 else 3718 break; 3719 } 3720 if (!node) { 3721 while (prev) { 3722 entry = rb_entry(prev, struct btrfs_inode, rb_node); 3723 if (objectid <= btrfs_ino(&entry->vfs_inode)) { 3724 node = prev; 3725 break; 3726 } 3727 prev = rb_next(prev); 3728 } 3729 } 3730 while (node) { 3731 entry = rb_entry(node, struct btrfs_inode, rb_node); 3732 objectid = btrfs_ino(&entry->vfs_inode) + 1; 3733 inode = igrab(&entry->vfs_inode); 3734 if (inode) { 3735 spin_unlock(&root->inode_lock); 3736 if (atomic_read(&inode->i_count) > 1) 3737 d_prune_aliases(inode); 3738 /* 3739 * btrfs_drop_inode will have it removed from 3740 * the inode cache when its usage count 3741 * hits zero. 3742 */ 3743 iput(inode); 3744 cond_resched(); 3745 spin_lock(&root->inode_lock); 3746 goto again; 3747 } 3748 3749 if (cond_resched_lock(&root->inode_lock)) 3750 goto again; 3751 3752 node = rb_next(node); 3753 } 3754 spin_unlock(&root->inode_lock); 3755 return 0; 3756 } 3757 3758 static int btrfs_init_locked_inode(struct inode *inode, void *p) 3759 { 3760 struct btrfs_iget_args *args = p; 3761 inode->i_ino = args->ino; 3762 BTRFS_I(inode)->root = args->root; 3763 btrfs_set_inode_space_info(args->root, inode); 3764 return 0; 3765 } 3766 3767 static int btrfs_find_actor(struct inode *inode, void *opaque) 3768 { 3769 struct btrfs_iget_args *args = opaque; 3770 return args->ino == btrfs_ino(inode) && 3771 args->root == BTRFS_I(inode)->root; 3772 } 3773 3774 static struct inode *btrfs_iget_locked(struct super_block *s, 3775 u64 objectid, 3776 struct btrfs_root *root) 3777 { 3778 struct inode *inode; 3779 struct btrfs_iget_args args; 3780 args.ino = objectid; 3781 args.root = root; 3782 3783 inode = iget5_locked(s, objectid, btrfs_find_actor, 3784 btrfs_init_locked_inode, 3785 (void *)&args); 3786 return inode; 3787 } 3788 3789 /* Get an inode object given its location and corresponding root. 
3790 * Returns in *new if the inode was read from disk 3791 */ 3792 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 3793 struct btrfs_root *root, int *new) 3794 { 3795 struct inode *inode; 3796 3797 inode = btrfs_iget_locked(s, location->objectid, root); 3798 if (!inode) 3799 return ERR_PTR(-ENOMEM); 3800 3801 if (inode->i_state & I_NEW) { 3802 BTRFS_I(inode)->root = root; 3803 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 3804 btrfs_read_locked_inode(inode); 3805 if (!is_bad_inode(inode)) { 3806 inode_tree_add(inode); 3807 unlock_new_inode(inode); 3808 if (new) 3809 *new = 1; 3810 } else { 3811 unlock_new_inode(inode); 3812 iput(inode); 3813 inode = ERR_PTR(-ESTALE); 3814 } 3815 } 3816 3817 return inode; 3818 } 3819 3820 static struct inode *new_simple_dir(struct super_block *s, 3821 struct btrfs_key *key, 3822 struct btrfs_root *root) 3823 { 3824 struct inode *inode = new_inode(s); 3825 3826 if (!inode) 3827 return ERR_PTR(-ENOMEM); 3828 3829 BTRFS_I(inode)->root = root; 3830 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 3831 BTRFS_I(inode)->dummy_inode = 1; 3832 3833 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 3834 inode->i_op = &simple_dir_inode_operations; 3835 inode->i_fop = &simple_dir_operations; 3836 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 3837 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 3838 3839 return inode; 3840 } 3841 3842 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 3843 { 3844 struct inode *inode; 3845 struct btrfs_root *root = BTRFS_I(dir)->root; 3846 struct btrfs_root *sub_root = root; 3847 struct btrfs_key location; 3848 int index; 3849 int ret = 0; 3850 3851 if (dentry->d_name.len > BTRFS_NAME_LEN) 3852 return ERR_PTR(-ENAMETOOLONG); 3853 3854 if (unlikely(d_need_lookup(dentry))) { 3855 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); 3856 kfree(dentry->d_fsdata); 3857 dentry->d_fsdata = NULL; 3858 /* This thing is hashed, drop it for now */ 3859 d_drop(dentry); 3860 } else { 3861 ret = btrfs_inode_by_name(dir, dentry, &location); 3862 } 3863 3864 if (ret < 0) 3865 return ERR_PTR(ret); 3866 3867 if (location.objectid == 0) 3868 return NULL; 3869 3870 if (location.type == BTRFS_INODE_ITEM_KEY) { 3871 inode = btrfs_iget(dir->i_sb, &location, root, NULL); 3872 return inode; 3873 } 3874 3875 BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); 3876 3877 index = srcu_read_lock(&root->fs_info->subvol_srcu); 3878 ret = fixup_tree_root_location(root, dir, dentry, 3879 &location, &sub_root); 3880 if (ret < 0) { 3881 if (ret != -ENOENT) 3882 inode = ERR_PTR(ret); 3883 else 3884 inode = new_simple_dir(dir->i_sb, &location, sub_root); 3885 } else { 3886 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); 3887 } 3888 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 3889 3890 if (!IS_ERR(inode) && root != sub_root) { 3891 down_read(&root->fs_info->cleanup_work_sem); 3892 if (!(inode->i_sb->s_flags & MS_RDONLY)) 3893 ret = btrfs_orphan_cleanup(sub_root); 3894 up_read(&root->fs_info->cleanup_work_sem); 3895 if (ret) 3896 inode = ERR_PTR(ret); 3897 } 3898 3899 return inode; 3900 } 3901 3902 static int btrfs_dentry_delete(const struct dentry *dentry) 3903 { 3904 struct btrfs_root *root; 3905 3906 if (!dentry->d_inode && !IS_ROOT(dentry)) 3907 dentry = dentry->d_parent; 3908 3909 if (dentry->d_inode) { 3910 root = BTRFS_I(dentry->d_inode)->root; 3911 if (btrfs_root_refs(&root->root_item) == 0) 3912 return 1; 3913 } 3914 return 0; 3915 } 3916
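/*
 * d_fsdata may hold a btrfs_key stashed by btrfs_real_readdir() for a
 * dentry that still needs a real lookup; free it when the dentry dies.
 */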
3917 static void btrfs_dentry_release(struct dentry *dentry) 3918 { 3919 if (dentry->d_fsdata) 3920 kfree(dentry->d_fsdata); 3921 } 3922 3923 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 3924 struct nameidata *nd) 3925 { 3926 struct dentry *ret; 3927 3928 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); 3929 if (unlikely(d_need_lookup(dentry))) { 3930 spin_lock(&dentry->d_lock); 3931 dentry->d_flags &= ~DCACHE_NEED_LOOKUP; 3932 spin_unlock(&dentry->d_lock); 3933 } 3934 return ret; 3935 } 3936 3937 unsigned char btrfs_filetype_table[] = { 3938 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 3939 }; 3940 3941 static int btrfs_real_readdir(struct file *filp, void *dirent, 3942 filldir_t filldir) 3943 { 3944 struct inode *inode = filp->f_dentry->d_inode; 3945 struct btrfs_root *root = BTRFS_I(inode)->root; 3946 struct btrfs_item *item; 3947 struct btrfs_dir_item *di; 3948 struct btrfs_key key; 3949 struct btrfs_key found_key; 3950 struct btrfs_path *path; 3951 struct list_head ins_list; 3952 struct list_head del_list; 3953 struct qstr q; 3954 int ret; 3955 struct extent_buffer *leaf; 3956 int slot; 3957 unsigned char d_type; 3958 int over = 0; 3959 u32 di_cur; 3960 u32 di_total; 3961 u32 di_len; 3962 int key_type = BTRFS_DIR_INDEX_KEY; 3963 char tmp_name[32]; 3964 char *name_ptr; 3965 int name_len; 3966 int is_curr = 0; /* filp->f_pos points to the current index? */ 3967 3968 /* FIXME, use a real flag for deciding about the key type */ 3969 if (root->fs_info->tree_root == root) 3970 key_type = BTRFS_DIR_ITEM_KEY; 3971 3972 /* special case for "." */ 3973 if (filp->f_pos == 0) { 3974 over = filldir(dirent, ".", 1, 3975 filp->f_pos, btrfs_ino(inode), DT_DIR); 3976 if (over) 3977 return 0; 3978 filp->f_pos = 1; 3979 } 3980 /* special case for .., just use the back ref */ 3981 if (filp->f_pos == 1) { 3982 u64 pino = parent_ino(filp->f_path.dentry); 3983 over = filldir(dirent, "..", 2, 3984 filp->f_pos, pino, DT_DIR); 3985 if (over) 3986 return 0; 3987 filp->f_pos = 2; 3988 } 3989 path = btrfs_alloc_path(); 3990 if (!path) 3991 return -ENOMEM; 3992 3993 path->reada = 1; 3994 3995 if (key_type == BTRFS_DIR_INDEX_KEY) { 3996 INIT_LIST_HEAD(&ins_list); 3997 INIT_LIST_HEAD(&del_list); 3998 btrfs_get_delayed_items(inode, &ins_list, &del_list); 3999 } 4000 4001 btrfs_set_key_type(&key, key_type); 4002 key.offset = filp->f_pos; 4003 key.objectid = btrfs_ino(inode); 4004 4005 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4006 if (ret < 0) 4007 goto err; 4008 4009 while (1) { 4010 leaf = path->nodes[0]; 4011 slot = path->slots[0]; 4012 if (slot >= btrfs_header_nritems(leaf)) { 4013 ret = btrfs_next_leaf(root, path); 4014 if (ret < 0) 4015 goto err; 4016 else if (ret > 0) 4017 break; 4018 continue; 4019 } 4020 4021 item = btrfs_item_nr(leaf, slot); 4022 btrfs_item_key_to_cpu(leaf, &found_key, slot); 4023 4024 if (found_key.objectid != key.objectid) 4025 break; 4026 if (btrfs_key_type(&found_key) != key_type) 4027 break; 4028 if (found_key.offset < filp->f_pos) 4029 goto next; 4030 if (key_type == BTRFS_DIR_INDEX_KEY && 4031 btrfs_should_delete_dir_index(&del_list, 4032 found_key.offset)) 4033 goto next; 4034 4035 filp->f_pos = found_key.offset; 4036 is_curr = 1; 4037 4038 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 4039 di_cur = 0; 4040 di_total = btrfs_item_size(leaf, item); 4041 4042 while (di_cur < di_total) { 4043 struct btrfs_key location; 4044 struct dentry *tmp; 4045 4046 if (verify_dir_item(root, leaf, di)) 4047 
                                break;

                        name_len = btrfs_dir_name_len(leaf, di);
                        if (name_len <= sizeof(tmp_name)) {
                                name_ptr = tmp_name;
                        } else {
                                name_ptr = kmalloc(name_len, GFP_NOFS);
                                if (!name_ptr) {
                                        ret = -ENOMEM;
                                        goto err;
                                }
                        }
                        read_extent_buffer(leaf, name_ptr,
                                           (unsigned long)(di + 1), name_len);

                        d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
                        btrfs_dir_item_key_to_cpu(leaf, di, &location);

                        q.name = name_ptr;
                        q.len = name_len;
                        q.hash = full_name_hash(q.name, q.len);
                        tmp = d_lookup(filp->f_dentry, &q);
                        if (!tmp) {
                                struct btrfs_key *newkey;

                                newkey = kzalloc(sizeof(struct btrfs_key),
                                                 GFP_NOFS);
                                if (!newkey)
                                        goto no_dentry;
                                tmp = d_alloc(filp->f_dentry, &q);
                                if (!tmp) {
                                        /* d_alloc failed, there is no
                                         * dentry to drop here
                                         */
                                        kfree(newkey);
                                        goto no_dentry;
                                }
                                memcpy(newkey, &location,
                                       sizeof(struct btrfs_key));
                                tmp->d_fsdata = newkey;
                                tmp->d_flags |= DCACHE_NEED_LOOKUP;
                                d_rehash(tmp);
                                dput(tmp);
                        } else {
                                dput(tmp);
                        }
no_dentry:
                        /* is this a reference to our own snapshot? If so
                         * skip it
                         */
                        if (location.type == BTRFS_ROOT_ITEM_KEY &&
                            location.objectid == root->root_key.objectid) {
                                over = 0;
                                goto skip;
                        }
                        over = filldir(dirent, name_ptr, name_len,
                                       found_key.offset, location.objectid,
                                       d_type);

skip:
                        if (name_ptr != tmp_name)
                                kfree(name_ptr);

                        if (over)
                                goto nopos;
                        di_len = btrfs_dir_name_len(leaf, di) +
                                 btrfs_dir_data_len(leaf, di) + sizeof(*di);
                        di_cur += di_len;
                        di = (struct btrfs_dir_item *)((char *)di + di_len);
                }
next:
                path->slots[0]++;
        }

        if (key_type == BTRFS_DIR_INDEX_KEY) {
                if (is_curr)
                        filp->f_pos++;
                ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
                                                      &ins_list);
                if (ret)
                        goto nopos;
        }

        /* Reached end of directory/root. Bump pos past the last item. */
        if (key_type == BTRFS_DIR_INDEX_KEY)
                /*
                 * 32-bit glibc will use getdents64, but then strtol -
                 * so the last number we can serve is this.
                 */
                filp->f_pos = 0x7fffffff;
        else
                filp->f_pos++;
nopos:
        ret = 0;
err:
        if (key_type == BTRFS_DIR_INDEX_KEY)
                btrfs_put_delayed_items(&ins_list, &del_list);
        btrfs_free_path(path);
        return ret;
}

int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        int ret = 0;
        bool nolock = false;

        if (BTRFS_I(inode)->dummy_inode)
                return 0;

        if (btrfs_fs_closing(root->fs_info) &&
            btrfs_is_free_space_inode(root, inode))
                nolock = true;

        if (wbc->sync_mode == WB_SYNC_ALL) {
                if (nolock)
                        trans = btrfs_join_transaction_nolock(root);
                else
                        trans = btrfs_join_transaction(root);
                if (IS_ERR(trans))
                        return PTR_ERR(trans);
                if (nolock)
                        ret = btrfs_end_transaction_nolock(trans, root);
                else
                        ret = btrfs_commit_transaction(trans, root);
        }
        return ret;
}

/*
 * This is somewhat expensive, updating the tree every time the
 * inode changes.  But, it is most likely to find the inode in cache.
 * FIXME, needs more benchmarking...there are no reasons other than
 * performance to keep or drop this code.
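 *
 * (One note on the retry below: ->dirty_inode cannot return an error,
 * so an -ENOSPC from the cheap joined transaction is retried once with
 * a reserving btrfs_start_transaction() before the failure is merely
 * logged.)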
 */
void btrfs_dirty_inode(struct inode *inode, int flags)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        int ret;

        if (BTRFS_I(inode)->dummy_inode)
                return;

        trans = btrfs_join_transaction(root);
        BUG_ON(IS_ERR(trans));

        ret = btrfs_update_inode(trans, root, inode);
        if (ret == -ENOSPC) {
                /* whoops, let's try again with the full transaction */
                btrfs_end_transaction(trans, root);
                trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans)) {
                        printk_ratelimited(KERN_ERR "btrfs: failed to "
                                       "dirty inode %llu error %ld\n",
                                       (unsigned long long)btrfs_ino(inode),
                                       PTR_ERR(trans));
                        return;
                }

                ret = btrfs_update_inode(trans, root, inode);
                if (ret) {
                        printk_ratelimited(KERN_ERR "btrfs: failed to "
                                       "dirty inode %llu error %d\n",
                                       (unsigned long long)btrfs_ino(inode),
                                       ret);
                }
        }
        btrfs_end_transaction(trans, root);
        if (BTRFS_I(inode)->delayed_node)
                btrfs_balance_delayed_items(root);
}

/*
 * find the highest existing sequence number in a directory
 * and then set the in-memory index_cnt variable to reflect
 * free sequence numbers
 */
static int btrfs_set_inode_index_count(struct inode *inode)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key key, found_key;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        int ret;

        key.objectid = btrfs_ino(inode);
        btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
        key.offset = (u64)-1;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        /* FIXME: we should be able to handle this */
        if (ret == 0)
                goto out;
        ret = 0;

        /*
         * MAGIC NUMBER EXPLANATION:
         * since we search a directory based on f_pos we have to start at 2
         * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
         * else has to start at 2
         */
        if (path->slots[0] == 0) {
                BTRFS_I(inode)->index_cnt = 2;
                goto out;
        }

        path->slots[0]--;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

        if (found_key.objectid != btrfs_ino(inode) ||
            btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
                BTRFS_I(inode)->index_cnt = 2;
                goto out;
        }

        BTRFS_I(inode)->index_cnt = found_key.offset + 1;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * helper to find a free sequence number in a given directory.
This current 4277 * code is very simple, later versions will do smarter things in the btree 4278 */ 4279 int btrfs_set_inode_index(struct inode *dir, u64 *index) 4280 { 4281 int ret = 0; 4282 4283 if (BTRFS_I(dir)->index_cnt == (u64)-1) { 4284 ret = btrfs_inode_delayed_dir_index_count(dir); 4285 if (ret) { 4286 ret = btrfs_set_inode_index_count(dir); 4287 if (ret) 4288 return ret; 4289 } 4290 } 4291 4292 *index = BTRFS_I(dir)->index_cnt; 4293 BTRFS_I(dir)->index_cnt++; 4294 4295 return ret; 4296 } 4297 4298 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 4299 struct btrfs_root *root, 4300 struct inode *dir, 4301 const char *name, int name_len, 4302 u64 ref_objectid, u64 objectid, int mode, 4303 u64 *index) 4304 { 4305 struct inode *inode; 4306 struct btrfs_inode_item *inode_item; 4307 struct btrfs_key *location; 4308 struct btrfs_path *path; 4309 struct btrfs_inode_ref *ref; 4310 struct btrfs_key key[2]; 4311 u32 sizes[2]; 4312 unsigned long ptr; 4313 int ret; 4314 int owner; 4315 4316 path = btrfs_alloc_path(); 4317 if (!path) 4318 return ERR_PTR(-ENOMEM); 4319 4320 inode = new_inode(root->fs_info->sb); 4321 if (!inode) { 4322 btrfs_free_path(path); 4323 return ERR_PTR(-ENOMEM); 4324 } 4325 4326 /* 4327 * we have to initialize this early, so we can reclaim the inode 4328 * number if we fail afterwards in this function. 4329 */ 4330 inode->i_ino = objectid; 4331 4332 if (dir) { 4333 trace_btrfs_inode_request(dir); 4334 4335 ret = btrfs_set_inode_index(dir, index); 4336 if (ret) { 4337 btrfs_free_path(path); 4338 iput(inode); 4339 return ERR_PTR(ret); 4340 } 4341 } 4342 /* 4343 * index_cnt is ignored for everything but a dir, 4344 * btrfs_get_inode_index_count has an explanation for the magic 4345 * number 4346 */ 4347 BTRFS_I(inode)->index_cnt = 2; 4348 BTRFS_I(inode)->root = root; 4349 BTRFS_I(inode)->generation = trans->transid; 4350 inode->i_generation = BTRFS_I(inode)->generation; 4351 btrfs_set_inode_space_info(root, inode); 4352 4353 if (S_ISDIR(mode)) 4354 owner = 0; 4355 else 4356 owner = 1; 4357 4358 key[0].objectid = objectid; 4359 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 4360 key[0].offset = 0; 4361 4362 key[1].objectid = objectid; 4363 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 4364 key[1].offset = ref_objectid; 4365 4366 sizes[0] = sizeof(struct btrfs_inode_item); 4367 sizes[1] = name_len + sizeof(*ref); 4368 4369 path->leave_spinning = 1; 4370 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 4371 if (ret != 0) 4372 goto fail; 4373 4374 inode_init_owner(inode, dir, mode); 4375 inode_set_bytes(inode, 0); 4376 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4377 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4378 struct btrfs_inode_item); 4379 fill_inode_item(trans, path->nodes[0], inode_item, inode); 4380 4381 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 4382 struct btrfs_inode_ref); 4383 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 4384 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 4385 ptr = (unsigned long)(ref + 1); 4386 write_extent_buffer(path->nodes[0], name, ptr, name_len); 4387 4388 btrfs_mark_buffer_dirty(path->nodes[0]); 4389 btrfs_free_path(path); 4390 4391 location = &BTRFS_I(inode)->location; 4392 location->objectid = objectid; 4393 location->offset = 0; 4394 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 4395 4396 btrfs_inherit_iflags(inode, dir); 4397 4398 if (S_ISREG(mode)) { 4399 if (btrfs_test_opt(root, NODATASUM)) 4400 
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4401 if (btrfs_test_opt(root, NODATACOW) || 4402 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) 4403 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4404 } 4405 4406 insert_inode_hash(inode); 4407 inode_tree_add(inode); 4408 4409 trace_btrfs_inode_new(inode); 4410 btrfs_set_inode_last_trans(trans, inode); 4411 4412 return inode; 4413 fail: 4414 if (dir) 4415 BTRFS_I(dir)->index_cnt--; 4416 btrfs_free_path(path); 4417 iput(inode); 4418 return ERR_PTR(ret); 4419 } 4420 4421 static inline u8 btrfs_inode_type(struct inode *inode) 4422 { 4423 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; 4424 } 4425 4426 /* 4427 * utility function to add 'inode' into 'parent_inode' with 4428 * a give name and a given sequence number. 4429 * if 'add_backref' is true, also insert a backref from the 4430 * inode to the parent directory. 4431 */ 4432 int btrfs_add_link(struct btrfs_trans_handle *trans, 4433 struct inode *parent_inode, struct inode *inode, 4434 const char *name, int name_len, int add_backref, u64 index) 4435 { 4436 int ret = 0; 4437 struct btrfs_key key; 4438 struct btrfs_root *root = BTRFS_I(parent_inode)->root; 4439 u64 ino = btrfs_ino(inode); 4440 u64 parent_ino = btrfs_ino(parent_inode); 4441 4442 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 4443 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); 4444 } else { 4445 key.objectid = ino; 4446 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 4447 key.offset = 0; 4448 } 4449 4450 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 4451 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 4452 key.objectid, root->root_key.objectid, 4453 parent_ino, index, name, name_len); 4454 } else if (add_backref) { 4455 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, 4456 parent_ino, index); 4457 } 4458 4459 if (ret == 0) { 4460 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4461 parent_inode, &key, 4462 btrfs_inode_type(inode), index); 4463 BUG_ON(ret); 4464 4465 btrfs_i_size_write(parent_inode, parent_inode->i_size + 4466 name_len * 2); 4467 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 4468 ret = btrfs_update_inode(trans, root, parent_inode); 4469 } 4470 return ret; 4471 } 4472 4473 static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 4474 struct inode *dir, struct dentry *dentry, 4475 struct inode *inode, int backref, u64 index) 4476 { 4477 int err = btrfs_add_link(trans, dir, inode, 4478 dentry->d_name.name, dentry->d_name.len, 4479 backref, index); 4480 if (!err) { 4481 d_instantiate(dentry, inode); 4482 return 0; 4483 } 4484 if (err > 0) 4485 err = -EEXIST; 4486 return err; 4487 } 4488 4489 static int btrfs_mknod(struct inode *dir, struct dentry *dentry, 4490 int mode, dev_t rdev) 4491 { 4492 struct btrfs_trans_handle *trans; 4493 struct btrfs_root *root = BTRFS_I(dir)->root; 4494 struct inode *inode = NULL; 4495 int err; 4496 int drop_inode = 0; 4497 u64 objectid; 4498 unsigned long nr = 0; 4499 u64 index = 0; 4500 4501 if (!new_valid_dev(rdev)) 4502 return -EINVAL; 4503 4504 /* 4505 * 2 for inode item and ref 4506 * 2 for dir items 4507 * 1 for xattr if selinux is on 4508 */ 4509 trans = btrfs_start_transaction(root, 5); 4510 if (IS_ERR(trans)) 4511 return PTR_ERR(trans); 4512 4513 err = btrfs_find_free_ino(root, &objectid); 4514 if (err) 4515 goto out_unlock; 4516 4517 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4518 dentry->d_name.len, btrfs_ino(dir), objectid, 4519 mode, &index); 4520 if (IS_ERR(inode)) 
{ 4521 err = PTR_ERR(inode); 4522 goto out_unlock; 4523 } 4524 4525 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 4526 if (err) { 4527 drop_inode = 1; 4528 goto out_unlock; 4529 } 4530 4531 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4532 if (err) 4533 drop_inode = 1; 4534 else { 4535 inode->i_op = &btrfs_special_inode_operations; 4536 init_special_inode(inode, inode->i_mode, rdev); 4537 btrfs_update_inode(trans, root, inode); 4538 } 4539 out_unlock: 4540 nr = trans->blocks_used; 4541 btrfs_end_transaction_throttle(trans, root); 4542 btrfs_btree_balance_dirty(root, nr); 4543 if (drop_inode) { 4544 inode_dec_link_count(inode); 4545 iput(inode); 4546 } 4547 return err; 4548 } 4549 4550 static int btrfs_create(struct inode *dir, struct dentry *dentry, 4551 int mode, struct nameidata *nd) 4552 { 4553 struct btrfs_trans_handle *trans; 4554 struct btrfs_root *root = BTRFS_I(dir)->root; 4555 struct inode *inode = NULL; 4556 int drop_inode = 0; 4557 int err; 4558 unsigned long nr = 0; 4559 u64 objectid; 4560 u64 index = 0; 4561 4562 /* 4563 * 2 for inode item and ref 4564 * 2 for dir items 4565 * 1 for xattr if selinux is on 4566 */ 4567 trans = btrfs_start_transaction(root, 5); 4568 if (IS_ERR(trans)) 4569 return PTR_ERR(trans); 4570 4571 err = btrfs_find_free_ino(root, &objectid); 4572 if (err) 4573 goto out_unlock; 4574 4575 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4576 dentry->d_name.len, btrfs_ino(dir), objectid, 4577 mode, &index); 4578 if (IS_ERR(inode)) { 4579 err = PTR_ERR(inode); 4580 goto out_unlock; 4581 } 4582 4583 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 4584 if (err) { 4585 drop_inode = 1; 4586 goto out_unlock; 4587 } 4588 4589 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4590 if (err) 4591 drop_inode = 1; 4592 else { 4593 inode->i_mapping->a_ops = &btrfs_aops; 4594 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 4595 inode->i_fop = &btrfs_file_operations; 4596 inode->i_op = &btrfs_file_inode_operations; 4597 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4598 } 4599 out_unlock: 4600 nr = trans->blocks_used; 4601 btrfs_end_transaction_throttle(trans, root); 4602 if (drop_inode) { 4603 inode_dec_link_count(inode); 4604 iput(inode); 4605 } 4606 btrfs_btree_balance_dirty(root, nr); 4607 return err; 4608 } 4609 4610 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 4611 struct dentry *dentry) 4612 { 4613 struct btrfs_trans_handle *trans; 4614 struct btrfs_root *root = BTRFS_I(dir)->root; 4615 struct inode *inode = old_dentry->d_inode; 4616 u64 index; 4617 unsigned long nr = 0; 4618 int err; 4619 int drop_inode = 0; 4620 4621 /* do not allow sys_link's with other subvols of the same device */ 4622 if (root->objectid != BTRFS_I(inode)->root->objectid) 4623 return -EXDEV; 4624 4625 if (inode->i_nlink == ~0U) 4626 return -EMLINK; 4627 4628 err = btrfs_set_inode_index(dir, &index); 4629 if (err) 4630 goto fail; 4631 4632 /* 4633 * 2 items for inode and inode ref 4634 * 2 items for dir items 4635 * 1 item for parent inode 4636 */ 4637 trans = btrfs_start_transaction(root, 5); 4638 if (IS_ERR(trans)) { 4639 err = PTR_ERR(trans); 4640 goto fail; 4641 } 4642 4643 btrfs_inc_nlink(inode); 4644 inode->i_ctime = CURRENT_TIME; 4645 ihold(inode); 4646 4647 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 4648 4649 if (err) { 4650 drop_inode = 1; 4651 } else { 4652 struct dentry *parent = dentry->d_parent; 4653 err = btrfs_update_inode(trans, root, 
inode); 4654 BUG_ON(err); 4655 btrfs_log_new_name(trans, inode, NULL, parent); 4656 } 4657 4658 nr = trans->blocks_used; 4659 btrfs_end_transaction_throttle(trans, root); 4660 fail: 4661 if (drop_inode) { 4662 inode_dec_link_count(inode); 4663 iput(inode); 4664 } 4665 btrfs_btree_balance_dirty(root, nr); 4666 return err; 4667 } 4668 4669 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 4670 { 4671 struct inode *inode = NULL; 4672 struct btrfs_trans_handle *trans; 4673 struct btrfs_root *root = BTRFS_I(dir)->root; 4674 int err = 0; 4675 int drop_on_err = 0; 4676 u64 objectid = 0; 4677 u64 index = 0; 4678 unsigned long nr = 1; 4679 4680 /* 4681 * 2 items for inode and ref 4682 * 2 items for dir items 4683 * 1 for xattr if selinux is on 4684 */ 4685 trans = btrfs_start_transaction(root, 5); 4686 if (IS_ERR(trans)) 4687 return PTR_ERR(trans); 4688 4689 err = btrfs_find_free_ino(root, &objectid); 4690 if (err) 4691 goto out_fail; 4692 4693 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4694 dentry->d_name.len, btrfs_ino(dir), objectid, 4695 S_IFDIR | mode, &index); 4696 if (IS_ERR(inode)) { 4697 err = PTR_ERR(inode); 4698 goto out_fail; 4699 } 4700 4701 drop_on_err = 1; 4702 4703 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 4704 if (err) 4705 goto out_fail; 4706 4707 inode->i_op = &btrfs_dir_inode_operations; 4708 inode->i_fop = &btrfs_dir_file_operations; 4709 4710 btrfs_i_size_write(inode, 0); 4711 err = btrfs_update_inode(trans, root, inode); 4712 if (err) 4713 goto out_fail; 4714 4715 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, 4716 dentry->d_name.len, 0, index); 4717 if (err) 4718 goto out_fail; 4719 4720 d_instantiate(dentry, inode); 4721 drop_on_err = 0; 4722 4723 out_fail: 4724 nr = trans->blocks_used; 4725 btrfs_end_transaction_throttle(trans, root); 4726 if (drop_on_err) 4727 iput(inode); 4728 btrfs_btree_balance_dirty(root, nr); 4729 return err; 4730 } 4731 4732 /* helper for btfs_get_extent. Given an existing extent in the tree, 4733 * and an extent that you want to insert, deal with overlap and insert 4734 * the new extent into the tree. 
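 *
 * Illustrative example (made-up offsets): if [0, 128k) is already in
 * the tree and the em we built from disk covers [64k, 192k), the
 * caller hands us the start and length it still needs mapped; we trim
 * the new em to that range and shift block_start by the same delta,
 * so the logical-to-physical translation stays correct.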
4735 */ 4736 static int merge_extent_mapping(struct extent_map_tree *em_tree, 4737 struct extent_map *existing, 4738 struct extent_map *em, 4739 u64 map_start, u64 map_len) 4740 { 4741 u64 start_diff; 4742 4743 BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); 4744 start_diff = map_start - em->start; 4745 em->start = map_start; 4746 em->len = map_len; 4747 if (em->block_start < EXTENT_MAP_LAST_BYTE && 4748 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 4749 em->block_start += start_diff; 4750 em->block_len -= start_diff; 4751 } 4752 return add_extent_mapping(em_tree, em); 4753 } 4754 4755 static noinline int uncompress_inline(struct btrfs_path *path, 4756 struct inode *inode, struct page *page, 4757 size_t pg_offset, u64 extent_offset, 4758 struct btrfs_file_extent_item *item) 4759 { 4760 int ret; 4761 struct extent_buffer *leaf = path->nodes[0]; 4762 char *tmp; 4763 size_t max_size; 4764 unsigned long inline_size; 4765 unsigned long ptr; 4766 int compress_type; 4767 4768 WARN_ON(pg_offset != 0); 4769 compress_type = btrfs_file_extent_compression(leaf, item); 4770 max_size = btrfs_file_extent_ram_bytes(leaf, item); 4771 inline_size = btrfs_file_extent_inline_item_len(leaf, 4772 btrfs_item_nr(leaf, path->slots[0])); 4773 tmp = kmalloc(inline_size, GFP_NOFS); 4774 if (!tmp) 4775 return -ENOMEM; 4776 ptr = btrfs_file_extent_inline_start(item); 4777 4778 read_extent_buffer(leaf, tmp, ptr, inline_size); 4779 4780 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 4781 ret = btrfs_decompress(compress_type, tmp, page, 4782 extent_offset, inline_size, max_size); 4783 if (ret) { 4784 char *kaddr = kmap_atomic(page, KM_USER0); 4785 unsigned long copy_size = min_t(u64, 4786 PAGE_CACHE_SIZE - pg_offset, 4787 max_size - extent_offset); 4788 memset(kaddr + pg_offset, 0, copy_size); 4789 kunmap_atomic(kaddr, KM_USER0); 4790 } 4791 kfree(tmp); 4792 return 0; 4793 } 4794 4795 /* 4796 * a bit scary, this does extent mapping from logical file offset to the disk. 4797 * the ugly parts come from merging extents from the disk with the in-ram 4798 * representation. This gets more complex because of the data=ordered code, 4799 * where the in-ram extents might be locked pending data=ordered completion. 4800 * 4801 * This also copies inline extents directly into the page. 
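 *
 * Callers never get a NULL extent map back on success: a range with
 * nothing on disk comes back as an em with block_start set to
 * EXTENT_MAP_HOLE covering at least the requested start offset.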
4802 */ 4803 4804 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 4805 size_t pg_offset, u64 start, u64 len, 4806 int create) 4807 { 4808 int ret; 4809 int err = 0; 4810 u64 bytenr; 4811 u64 extent_start = 0; 4812 u64 extent_end = 0; 4813 u64 objectid = btrfs_ino(inode); 4814 u32 found_type; 4815 struct btrfs_path *path = NULL; 4816 struct btrfs_root *root = BTRFS_I(inode)->root; 4817 struct btrfs_file_extent_item *item; 4818 struct extent_buffer *leaf; 4819 struct btrfs_key found_key; 4820 struct extent_map *em = NULL; 4821 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4822 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4823 struct btrfs_trans_handle *trans = NULL; 4824 int compress_type; 4825 4826 again: 4827 read_lock(&em_tree->lock); 4828 em = lookup_extent_mapping(em_tree, start, len); 4829 if (em) 4830 em->bdev = root->fs_info->fs_devices->latest_bdev; 4831 read_unlock(&em_tree->lock); 4832 4833 if (em) { 4834 if (em->start > start || em->start + em->len <= start) 4835 free_extent_map(em); 4836 else if (em->block_start == EXTENT_MAP_INLINE && page) 4837 free_extent_map(em); 4838 else 4839 goto out; 4840 } 4841 em = alloc_extent_map(); 4842 if (!em) { 4843 err = -ENOMEM; 4844 goto out; 4845 } 4846 em->bdev = root->fs_info->fs_devices->latest_bdev; 4847 em->start = EXTENT_MAP_HOLE; 4848 em->orig_start = EXTENT_MAP_HOLE; 4849 em->len = (u64)-1; 4850 em->block_len = (u64)-1; 4851 4852 if (!path) { 4853 path = btrfs_alloc_path(); 4854 if (!path) { 4855 err = -ENOMEM; 4856 goto out; 4857 } 4858 /* 4859 * Chances are we'll be called again, so go ahead and do 4860 * readahead 4861 */ 4862 path->reada = 1; 4863 } 4864 4865 ret = btrfs_lookup_file_extent(trans, root, path, 4866 objectid, start, trans != NULL); 4867 if (ret < 0) { 4868 err = ret; 4869 goto out; 4870 } 4871 4872 if (ret != 0) { 4873 if (path->slots[0] == 0) 4874 goto not_found; 4875 path->slots[0]--; 4876 } 4877 4878 leaf = path->nodes[0]; 4879 item = btrfs_item_ptr(leaf, path->slots[0], 4880 struct btrfs_file_extent_item); 4881 /* are we inside the extent that was found? 
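         * (btrfs_search_slot() may have stepped us back to the previous
         * item when there was no exact match, so it can belong to another
         * inode or key type entirely; check before trusting its offsets)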
*/ 4882 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4883 found_type = btrfs_key_type(&found_key); 4884 if (found_key.objectid != objectid || 4885 found_type != BTRFS_EXTENT_DATA_KEY) { 4886 goto not_found; 4887 } 4888 4889 found_type = btrfs_file_extent_type(leaf, item); 4890 extent_start = found_key.offset; 4891 compress_type = btrfs_file_extent_compression(leaf, item); 4892 if (found_type == BTRFS_FILE_EXTENT_REG || 4893 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 4894 extent_end = extent_start + 4895 btrfs_file_extent_num_bytes(leaf, item); 4896 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 4897 size_t size; 4898 size = btrfs_file_extent_inline_len(leaf, item); 4899 extent_end = (extent_start + size + root->sectorsize - 1) & 4900 ~((u64)root->sectorsize - 1); 4901 } 4902 4903 if (start >= extent_end) { 4904 path->slots[0]++; 4905 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 4906 ret = btrfs_next_leaf(root, path); 4907 if (ret < 0) { 4908 err = ret; 4909 goto out; 4910 } 4911 if (ret > 0) 4912 goto not_found; 4913 leaf = path->nodes[0]; 4914 } 4915 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4916 if (found_key.objectid != objectid || 4917 found_key.type != BTRFS_EXTENT_DATA_KEY) 4918 goto not_found; 4919 if (start + len <= found_key.offset) 4920 goto not_found; 4921 em->start = start; 4922 em->len = found_key.offset - start; 4923 goto not_found_em; 4924 } 4925 4926 if (found_type == BTRFS_FILE_EXTENT_REG || 4927 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 4928 em->start = extent_start; 4929 em->len = extent_end - extent_start; 4930 em->orig_start = extent_start - 4931 btrfs_file_extent_offset(leaf, item); 4932 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 4933 if (bytenr == 0) { 4934 em->block_start = EXTENT_MAP_HOLE; 4935 goto insert; 4936 } 4937 if (compress_type != BTRFS_COMPRESS_NONE) { 4938 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 4939 em->compress_type = compress_type; 4940 em->block_start = bytenr; 4941 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 4942 item); 4943 } else { 4944 bytenr += btrfs_file_extent_offset(leaf, item); 4945 em->block_start = bytenr; 4946 em->block_len = em->len; 4947 if (found_type == BTRFS_FILE_EXTENT_PREALLOC) 4948 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 4949 } 4950 goto insert; 4951 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 4952 unsigned long ptr; 4953 char *map; 4954 size_t size; 4955 size_t extent_offset; 4956 size_t copy_size; 4957 4958 em->block_start = EXTENT_MAP_INLINE; 4959 if (!page || create) { 4960 em->start = extent_start; 4961 em->len = extent_end - extent_start; 4962 goto out; 4963 } 4964 4965 size = btrfs_file_extent_inline_len(leaf, item); 4966 extent_offset = page_offset(page) + pg_offset - extent_start; 4967 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 4968 size - extent_offset); 4969 em->start = extent_start + extent_offset; 4970 em->len = (copy_size + root->sectorsize - 1) & 4971 ~((u64)root->sectorsize - 1); 4972 em->orig_start = EXTENT_MAP_INLINE; 4973 if (compress_type) { 4974 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 4975 em->compress_type = compress_type; 4976 } 4977 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 4978 if (create == 0 && !PageUptodate(page)) { 4979 if (btrfs_file_extent_compression(leaf, item) != 4980 BTRFS_COMPRESS_NONE) { 4981 ret = uncompress_inline(path, inode, page, 4982 pg_offset, 4983 extent_offset, item); 4984 BUG_ON(ret); 4985 } else { 4986 map = kmap(page); 4987 read_extent_buffer(leaf, map + pg_offset, ptr, 
                                                    copy_size);
                        if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
                                memset(map + pg_offset + copy_size, 0,
                                       PAGE_CACHE_SIZE - pg_offset -
                                       copy_size);
                        }
                        kunmap(page);
                }
                flush_dcache_page(page);
        } else if (create && PageUptodate(page)) {
                WARN_ON(1);
                if (!trans) {
                        /* no kmap() is held on this path, so there is
                         * nothing to kunmap before looping back
                         */
                        free_extent_map(em);
                        em = NULL;

                        btrfs_release_path(path);
                        trans = btrfs_join_transaction(root);

                        if (IS_ERR(trans)) {
                                /* don't leak the path on the error return */
                                btrfs_free_path(path);
                                return ERR_CAST(trans);
                        }
                        goto again;
                }
                map = kmap(page);
                write_extent_buffer(leaf, map + pg_offset, ptr,
                                    copy_size);
                kunmap(page);
                btrfs_mark_buffer_dirty(leaf);
        }
        set_extent_uptodate(io_tree, em->start,
                            extent_map_end(em) - 1, NULL, GFP_NOFS);
        goto insert;
        } else {
                printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
                WARN_ON(1);
        }
not_found:
        em->start = start;
        em->len = len;
not_found_em:
        em->block_start = EXTENT_MAP_HOLE;
        set_bit(EXTENT_FLAG_VACANCY, &em->flags);
insert:
        btrfs_release_path(path);
        if (em->start > start || extent_map_end(em) <= start) {
                printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
                       "[%llu %llu]\n", (unsigned long long)em->start,
                       (unsigned long long)em->len,
                       (unsigned long long)start,
                       (unsigned long long)len);
                err = -EIO;
                goto out;
        }

        err = 0;
        write_lock(&em_tree->lock);
        ret = add_extent_mapping(em_tree, em);
        /* it is possible that someone inserted the extent into the tree
         * while we had the lock dropped.  It is also possible that
         * an overlapping map exists in the tree
         */
        if (ret == -EEXIST) {
                struct extent_map *existing;

                ret = 0;

                existing = lookup_extent_mapping(em_tree, start, len);
                if (existing && (existing->start > start ||
                    existing->start + existing->len <= start)) {
                        free_extent_map(existing);
                        existing = NULL;
                }
                if (!existing) {
                        existing = lookup_extent_mapping(em_tree, em->start,
                                                         em->len);
                        if (existing) {
                                err = merge_extent_mapping(em_tree, existing,
                                                           em, start,
                                                           root->sectorsize);
                                free_extent_map(existing);
                                if (err) {
                                        free_extent_map(em);
                                        em = NULL;
                                }
                        } else {
                                err = -EIO;
                                free_extent_map(em);
                                em = NULL;
                        }
                } else {
                        free_extent_map(em);
                        em = existing;
                        err = 0;
                }
        }
        write_unlock(&em_tree->lock);
out:

        trace_btrfs_get_extent(root, em);

        if (path)
                btrfs_free_path(path);
        if (trans) {
                ret = btrfs_end_transaction(trans, root);
                if (!err)
                        err = ret;
        }
        if (err) {
                free_extent_map(em);
                return ERR_PTR(err);
        }
        return em;
}

struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
                                           size_t pg_offset, u64 start, u64 len,
                                           int create)
{
        struct extent_map *em;
        struct extent_map *hole_em = NULL;
        u64 range_start = start;
        u64 end;
        u64 found;
        u64 found_end;
        int err = 0;

        em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
        if (IS_ERR(em))
                return em;
        if (em) {
                /*
                 * if our em maps to a hole, there might
                 * actually be delalloc bytes behind it
                 */
                if (em->block_start != EXTENT_MAP_HOLE)
                        return em;
                else
                        hole_em = em;
        }

        /* check to see if we've wrapped (len == -1 or similar) */
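        /*
         * Illustrative: with start == 4096 and len == (u64)-1, the sum
         * below wraps to 4095, so end is clamped to (u64)-1 rather than
         * trusting the overflowed value.
         */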
5129 end = start + len; 5130 if (end < start) 5131 end = (u64)-1; 5132 else 5133 end -= 1; 5134 5135 em = NULL; 5136 5137 /* ok, we didn't find anything, lets look for delalloc */ 5138 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, 5139 end, len, EXTENT_DELALLOC, 1); 5140 found_end = range_start + found; 5141 if (found_end < range_start) 5142 found_end = (u64)-1; 5143 5144 /* 5145 * we didn't find anything useful, return 5146 * the original results from get_extent() 5147 */ 5148 if (range_start > end || found_end <= start) { 5149 em = hole_em; 5150 hole_em = NULL; 5151 goto out; 5152 } 5153 5154 /* adjust the range_start to make sure it doesn't 5155 * go backwards from the start they passed in 5156 */ 5157 range_start = max(start,range_start); 5158 found = found_end - range_start; 5159 5160 if (found > 0) { 5161 u64 hole_start = start; 5162 u64 hole_len = len; 5163 5164 em = alloc_extent_map(); 5165 if (!em) { 5166 err = -ENOMEM; 5167 goto out; 5168 } 5169 /* 5170 * when btrfs_get_extent can't find anything it 5171 * returns one huge hole 5172 * 5173 * make sure what it found really fits our range, and 5174 * adjust to make sure it is based on the start from 5175 * the caller 5176 */ 5177 if (hole_em) { 5178 u64 calc_end = extent_map_end(hole_em); 5179 5180 if (calc_end <= start || (hole_em->start > end)) { 5181 free_extent_map(hole_em); 5182 hole_em = NULL; 5183 } else { 5184 hole_start = max(hole_em->start, start); 5185 hole_len = calc_end - hole_start; 5186 } 5187 } 5188 em->bdev = NULL; 5189 if (hole_em && range_start > hole_start) { 5190 /* our hole starts before our delalloc, so we 5191 * have to return just the parts of the hole 5192 * that go until the delalloc starts 5193 */ 5194 em->len = min(hole_len, 5195 range_start - hole_start); 5196 em->start = hole_start; 5197 em->orig_start = hole_start; 5198 /* 5199 * don't adjust block start at all, 5200 * it is fixed at EXTENT_MAP_HOLE 5201 */ 5202 em->block_start = hole_em->block_start; 5203 em->block_len = hole_len; 5204 } else { 5205 em->start = range_start; 5206 em->len = found; 5207 em->orig_start = range_start; 5208 em->block_start = EXTENT_MAP_DELALLOC; 5209 em->block_len = found; 5210 } 5211 } else if (hole_em) { 5212 return hole_em; 5213 } 5214 out: 5215 5216 free_extent_map(hole_em); 5217 if (err) { 5218 free_extent_map(em); 5219 return ERR_PTR(err); 5220 } 5221 return em; 5222 } 5223 5224 static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5225 struct extent_map *em, 5226 u64 start, u64 len) 5227 { 5228 struct btrfs_root *root = BTRFS_I(inode)->root; 5229 struct btrfs_trans_handle *trans; 5230 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5231 struct btrfs_key ins; 5232 u64 alloc_hint; 5233 int ret; 5234 bool insert = false; 5235 5236 /* 5237 * Ok if the extent map we looked up is a hole and is for the exact 5238 * range we want, there is no reason to allocate a new one, however if 5239 * it is not right then we need to free this one and drop the cache for 5240 * our range. 
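         *
         * (btrfs_drop_extent_cache() below is what keeps a stale cached
         * mapping over this range from being handed back to the direct
         * IO path after we allocate new space on top of it.)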
5241 */ 5242 if (em->block_start != EXTENT_MAP_HOLE || em->start != start || 5243 em->len != len) { 5244 free_extent_map(em); 5245 em = NULL; 5246 insert = true; 5247 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5248 } 5249 5250 trans = btrfs_join_transaction(root); 5251 if (IS_ERR(trans)) 5252 return ERR_CAST(trans); 5253 5254 if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) 5255 btrfs_add_inode_defrag(trans, inode); 5256 5257 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5258 5259 alloc_hint = get_extent_allocation_hint(inode, start, len); 5260 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, 5261 alloc_hint, (u64)-1, &ins, 1); 5262 if (ret) { 5263 em = ERR_PTR(ret); 5264 goto out; 5265 } 5266 5267 if (!em) { 5268 em = alloc_extent_map(); 5269 if (!em) { 5270 em = ERR_PTR(-ENOMEM); 5271 goto out; 5272 } 5273 } 5274 5275 em->start = start; 5276 em->orig_start = em->start; 5277 em->len = ins.offset; 5278 5279 em->block_start = ins.objectid; 5280 em->block_len = ins.offset; 5281 em->bdev = root->fs_info->fs_devices->latest_bdev; 5282 5283 /* 5284 * We need to do this because if we're using the original em we searched 5285 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. 5286 */ 5287 em->flags = 0; 5288 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5289 5290 while (insert) { 5291 write_lock(&em_tree->lock); 5292 ret = add_extent_mapping(em_tree, em); 5293 write_unlock(&em_tree->lock); 5294 if (ret != -EEXIST) 5295 break; 5296 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); 5297 } 5298 5299 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 5300 ins.offset, ins.offset, 0); 5301 if (ret) { 5302 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 5303 em = ERR_PTR(ret); 5304 } 5305 out: 5306 btrfs_end_transaction(trans, root); 5307 return em; 5308 } 5309 5310 /* 5311 * returns 1 when the nocow is safe, < 1 on error, 0 if the 5312 * block must be cow'd 5313 */ 5314 static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, 5315 struct inode *inode, u64 offset, u64 len) 5316 { 5317 struct btrfs_path *path; 5318 int ret; 5319 struct extent_buffer *leaf; 5320 struct btrfs_root *root = BTRFS_I(inode)->root; 5321 struct btrfs_file_extent_item *fi; 5322 struct btrfs_key key; 5323 u64 disk_bytenr; 5324 u64 backref_offset; 5325 u64 extent_end; 5326 u64 num_bytes; 5327 int slot; 5328 int found_type; 5329 5330 path = btrfs_alloc_path(); 5331 if (!path) 5332 return -ENOMEM; 5333 5334 ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), 5335 offset, 0); 5336 if (ret < 0) 5337 goto out; 5338 5339 slot = path->slots[0]; 5340 if (ret == 1) { 5341 if (slot == 0) { 5342 /* can't find the item, must cow */ 5343 ret = 0; 5344 goto out; 5345 } 5346 slot--; 5347 } 5348 ret = 0; 5349 leaf = path->nodes[0]; 5350 btrfs_item_key_to_cpu(leaf, &key, slot); 5351 if (key.objectid != btrfs_ino(inode) || 5352 key.type != BTRFS_EXTENT_DATA_KEY) { 5353 /* not our file or wrong item type, must cow */ 5354 goto out; 5355 } 5356 5357 if (key.offset > offset) { 5358 /* Wrong offset, must cow */ 5359 goto out; 5360 } 5361 5362 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 5363 found_type = btrfs_file_extent_type(leaf, fi); 5364 if (found_type != BTRFS_FILE_EXTENT_REG && 5365 found_type != BTRFS_FILE_EXTENT_PREALLOC) { 5366 /* not a regular extent, must cow */ 5367 goto out; 5368 } 5369 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 5370 backref_offset = 
btrfs_file_extent_offset(leaf, fi); 5371 5372 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 5373 if (extent_end < offset + len) { 5374 /* extent doesn't include our full range, must cow */ 5375 goto out; 5376 } 5377 5378 if (btrfs_extent_readonly(root, disk_bytenr)) 5379 goto out; 5380 5381 /* 5382 * look for other files referencing this extent, if we 5383 * find any we must cow 5384 */ 5385 if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), 5386 key.offset - backref_offset, disk_bytenr)) 5387 goto out; 5388 5389 /* 5390 * adjust disk_bytenr and num_bytes to cover just the bytes 5391 * in this extent we are about to write. If there 5392 * are any csums in that range we have to cow in order 5393 * to keep the csums correct 5394 */ 5395 disk_bytenr += backref_offset; 5396 disk_bytenr += offset - key.offset; 5397 num_bytes = min(offset + len, extent_end) - offset; 5398 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 5399 goto out; 5400 /* 5401 * all of the above have passed, it is safe to overwrite this extent 5402 * without cow 5403 */ 5404 ret = 1; 5405 out: 5406 btrfs_free_path(path); 5407 return ret; 5408 } 5409 5410 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 5411 struct buffer_head *bh_result, int create) 5412 { 5413 struct extent_map *em; 5414 struct btrfs_root *root = BTRFS_I(inode)->root; 5415 u64 start = iblock << inode->i_blkbits; 5416 u64 len = bh_result->b_size; 5417 struct btrfs_trans_handle *trans; 5418 5419 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 5420 if (IS_ERR(em)) 5421 return PTR_ERR(em); 5422 5423 /* 5424 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered 5425 * io. INLINE is special, and we could probably kludge it in here, but 5426 * it's still buffered so for safety lets just fall back to the generic 5427 * buffered path. 5428 * 5429 * For COMPRESSED we _have_ to read the entire extent in so we can 5430 * decompress it, so there will be buffering required no matter what we 5431 * do, so go ahead and fallback to buffered. 5432 * 5433 * We return -ENOTBLK because thats what makes DIO go ahead and go back 5434 * to buffered IO. Don't blame me, this is the price we pay for using 5435 * the generic code. 5436 */ 5437 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 5438 em->block_start == EXTENT_MAP_INLINE) { 5439 free_extent_map(em); 5440 return -ENOTBLK; 5441 } 5442 5443 /* Just a good old fashioned hole, return */ 5444 if (!create && (em->block_start == EXTENT_MAP_HOLE || 5445 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 5446 free_extent_map(em); 5447 /* DIO will do one hole at a time, so just unlock a sector */ 5448 unlock_extent(&BTRFS_I(inode)->io_tree, start, 5449 start + root->sectorsize - 1, GFP_NOFS); 5450 return 0; 5451 } 5452 5453 /* 5454 * We don't allocate a new extent in the following cases 5455 * 5456 * 1) The inode is marked as NODATACOW. In this case we'll just use the 5457 * existing extent. 5458 * 2) The extent is marked as PREALLOC. We're good to go here and can 5459 * just use the extent. 
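         * In both cases can_nocow_odirect() below still has to confirm
         * that the extent is not shared with another file and carries no
         * checksums over the range before we may overwrite it in place.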
5460 * 5461 */ 5462 if (!create) { 5463 len = em->len - (start - em->start); 5464 goto map; 5465 } 5466 5467 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 5468 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 5469 em->block_start != EXTENT_MAP_HOLE)) { 5470 int type; 5471 int ret; 5472 u64 block_start; 5473 5474 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 5475 type = BTRFS_ORDERED_PREALLOC; 5476 else 5477 type = BTRFS_ORDERED_NOCOW; 5478 len = min(len, em->len - (start - em->start)); 5479 block_start = em->block_start + (start - em->start); 5480 5481 /* 5482 * we're not going to log anything, but we do need 5483 * to make sure the current transaction stays open 5484 * while we look for nocow cross refs 5485 */ 5486 trans = btrfs_join_transaction(root); 5487 if (IS_ERR(trans)) 5488 goto must_cow; 5489 5490 if (can_nocow_odirect(trans, inode, start, len) == 1) { 5491 ret = btrfs_add_ordered_extent_dio(inode, start, 5492 block_start, len, len, type); 5493 btrfs_end_transaction(trans, root); 5494 if (ret) { 5495 free_extent_map(em); 5496 return ret; 5497 } 5498 goto unlock; 5499 } 5500 btrfs_end_transaction(trans, root); 5501 } 5502 must_cow: 5503 /* 5504 * this will cow the extent, reset the len in case we changed 5505 * it above 5506 */ 5507 len = bh_result->b_size; 5508 em = btrfs_new_extent_direct(inode, em, start, len); 5509 if (IS_ERR(em)) 5510 return PTR_ERR(em); 5511 len = min(len, em->len - (start - em->start)); 5512 unlock: 5513 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, 5514 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, 5515 0, NULL, GFP_NOFS); 5516 map: 5517 bh_result->b_blocknr = (em->block_start + (start - em->start)) >> 5518 inode->i_blkbits; 5519 bh_result->b_size = len; 5520 bh_result->b_bdev = em->bdev; 5521 set_buffer_mapped(bh_result); 5522 if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 5523 set_buffer_new(bh_result); 5524 5525 free_extent_map(em); 5526 5527 return 0; 5528 } 5529 5530 struct btrfs_dio_private { 5531 struct inode *inode; 5532 u64 logical_offset; 5533 u64 disk_bytenr; 5534 u64 bytes; 5535 u32 *csums; 5536 void *private; 5537 5538 /* number of bios pending for this dio */ 5539 atomic_t pending_bios; 5540 5541 /* IO errors */ 5542 int errors; 5543 5544 struct bio *orig_bio; 5545 }; 5546 5547 static void btrfs_endio_direct_read(struct bio *bio, int err) 5548 { 5549 struct btrfs_dio_private *dip = bio->bi_private; 5550 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 5551 struct bio_vec *bvec = bio->bi_io_vec; 5552 struct inode *inode = dip->inode; 5553 struct btrfs_root *root = BTRFS_I(inode)->root; 5554 u64 start; 5555 u32 *private = dip->csums; 5556 5557 start = dip->logical_offset; 5558 do { 5559 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 5560 struct page *page = bvec->bv_page; 5561 char *kaddr; 5562 u32 csum = ~(u32)0; 5563 unsigned long flags; 5564 5565 local_irq_save(flags); 5566 kaddr = kmap_atomic(page, KM_IRQ0); 5567 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, 5568 csum, bvec->bv_len); 5569 btrfs_csum_final(csum, (char *)&csum); 5570 kunmap_atomic(kaddr, KM_IRQ0); 5571 local_irq_restore(flags); 5572 5573 flush_dcache_page(bvec->bv_page); 5574 if (csum != *private) { 5575 printk(KERN_ERR "btrfs csum failed ino %llu off" 5576 " %llu csum %u private %u\n", 5577 (unsigned long long)btrfs_ino(inode), 5578 (unsigned long long)start, 5579 csum, *private); 5580 err = -EIO; 5581 } 5582 } 5583 5584 start += bvec->bv_len; 5585 private++; 5586 bvec++; 5587 } while (bvec <= 
                 bvec_end);

        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
                      dip->logical_offset + dip->bytes - 1, GFP_NOFS);
        bio->bi_private = dip->private;

        kfree(dip->csums);
        kfree(dip);

        /* If we had a csum failure make sure to clear the uptodate flag */
        if (err)
                clear_bit(BIO_UPTODATE, &bio->bi_flags);
        dio_end_io(bio, err);
}

static void btrfs_endio_direct_write(struct bio *bio, int err)
{
        struct btrfs_dio_private *dip = bio->bi_private;
        struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        struct btrfs_ordered_extent *ordered = NULL;
        struct extent_state *cached_state = NULL;
        u64 ordered_offset = dip->logical_offset;
        u64 ordered_bytes = dip->bytes;
        int ret;

        if (err)
                goto out_done;
again:
        ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
                                                   &ordered_offset,
                                                   ordered_bytes);
        if (!ret)
                goto out_test;

        BUG_ON(!ordered);

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                /*
                 * record the real error and make sure the out: path does
                 * not try to end a transaction we never got
                 */
                err = PTR_ERR(trans);
                trans = NULL;
                goto out;
        }
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;

        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
                ret = btrfs_ordered_update_i_size(inode, 0, ordered);
                if (!ret)
                        err = btrfs_update_inode(trans, root, inode);
                goto out;
        }

        lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
                         ordered->file_offset + ordered->len - 1, 0,
                         &cached_state, GFP_NOFS);

        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
                ret = btrfs_mark_extent_written(trans, inode,
                                                ordered->file_offset,
                                                ordered->file_offset +
                                                ordered->len);
                if (ret) {
                        err = ret;
                        goto out_unlock;
                }
        } else {
                ret = insert_reserved_file_extent(trans, inode,
                                                  ordered->file_offset,
                                                  ordered->start,
                                                  ordered->disk_len,
                                                  ordered->len,
                                                  ordered->len,
                                                  0, 0, 0,
                                                  BTRFS_FILE_EXTENT_REG);
                unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
                                   ordered->file_offset, ordered->len);
                if (ret) {
                        err = ret;
                        WARN_ON(1);
                        goto out_unlock;
                }
        }

        add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
        ret = btrfs_ordered_update_i_size(inode, 0, ordered);
        if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
                btrfs_update_inode(trans, root, inode);
        ret = 0;
out_unlock:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
                             ordered->file_offset + ordered->len - 1,
                             &cached_state, GFP_NOFS);
out:
        btrfs_delalloc_release_metadata(inode, ordered->len);
        if (trans)
                btrfs_end_transaction(trans, root);
        ordered_offset = ordered->file_offset + ordered->len;
        btrfs_put_ordered_extent(ordered);
        btrfs_put_ordered_extent(ordered);

out_test:
        /*
         * our bio might span multiple ordered extents.
If we haven't 5689 * completed the accounting for the whole dio, go back and try again 5690 */ 5691 if (ordered_offset < dip->logical_offset + dip->bytes) { 5692 ordered_bytes = dip->logical_offset + dip->bytes - 5693 ordered_offset; 5694 goto again; 5695 } 5696 out_done: 5697 bio->bi_private = dip->private; 5698 5699 kfree(dip->csums); 5700 kfree(dip); 5701 5702 /* If we had an error make sure to clear the uptodate flag */ 5703 if (err) 5704 clear_bit(BIO_UPTODATE, &bio->bi_flags); 5705 dio_end_io(bio, err); 5706 } 5707 5708 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, 5709 struct bio *bio, int mirror_num, 5710 unsigned long bio_flags, u64 offset) 5711 { 5712 int ret; 5713 struct btrfs_root *root = BTRFS_I(inode)->root; 5714 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); 5715 BUG_ON(ret); 5716 return 0; 5717 } 5718 5719 static void btrfs_end_dio_bio(struct bio *bio, int err) 5720 { 5721 struct btrfs_dio_private *dip = bio->bi_private; 5722 5723 if (err) { 5724 printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " 5725 "sector %#Lx len %u err no %d\n", 5726 (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw, 5727 (unsigned long long)bio->bi_sector, bio->bi_size, err); 5728 dip->errors = 1; 5729 5730 /* 5731 * before atomic variable goto zero, we must make sure 5732 * dip->errors is perceived to be set. 5733 */ 5734 smp_mb__before_atomic_dec(); 5735 } 5736 5737 /* if there are more bios still pending for this dio, just exit */ 5738 if (!atomic_dec_and_test(&dip->pending_bios)) 5739 goto out; 5740 5741 if (dip->errors) 5742 bio_io_error(dip->orig_bio); 5743 else { 5744 set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); 5745 bio_endio(dip->orig_bio, 0); 5746 } 5747 out: 5748 bio_put(bio); 5749 } 5750 5751 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, 5752 u64 first_sector, gfp_t gfp_flags) 5753 { 5754 int nr_vecs = bio_get_nr_vecs(bdev); 5755 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); 5756 } 5757 5758 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 5759 int rw, u64 file_offset, int skip_sum, 5760 u32 *csums, int async_submit) 5761 { 5762 int write = rw & REQ_WRITE; 5763 struct btrfs_root *root = BTRFS_I(inode)->root; 5764 int ret; 5765 5766 bio_get(bio); 5767 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 5768 if (ret) 5769 goto err; 5770 5771 if (skip_sum) 5772 goto map; 5773 5774 if (write && async_submit) { 5775 ret = btrfs_wq_submit_bio(root->fs_info, 5776 inode, rw, bio, 0, 0, 5777 file_offset, 5778 __btrfs_submit_bio_start_direct_io, 5779 __btrfs_submit_bio_done); 5780 goto err; 5781 } else if (write) { 5782 /* 5783 * If we aren't doing async submit, calculate the csum of the 5784 * bio now. 
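                 *
                 * (the async path above hands the same work to
                 * __btrfs_submit_bio_start_direct_io() on a worker thread;
                 * here it simply runs inline in the submitter's context)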
5785 */ 5786 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); 5787 if (ret) 5788 goto err; 5789 } else if (!skip_sum) { 5790 ret = btrfs_lookup_bio_sums_dio(root, inode, bio, 5791 file_offset, csums); 5792 if (ret) 5793 goto err; 5794 } 5795 5796 map: 5797 ret = btrfs_map_bio(root, rw, bio, 0, async_submit); 5798 err: 5799 bio_put(bio); 5800 return ret; 5801 } 5802 5803 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, 5804 int skip_sum) 5805 { 5806 struct inode *inode = dip->inode; 5807 struct btrfs_root *root = BTRFS_I(inode)->root; 5808 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 5809 struct bio *bio; 5810 struct bio *orig_bio = dip->orig_bio; 5811 struct bio_vec *bvec = orig_bio->bi_io_vec; 5812 u64 start_sector = orig_bio->bi_sector; 5813 u64 file_offset = dip->logical_offset; 5814 u64 submit_len = 0; 5815 u64 map_length; 5816 int nr_pages = 0; 5817 u32 *csums = dip->csums; 5818 int ret = 0; 5819 int async_submit = 0; 5820 int write = rw & REQ_WRITE; 5821 5822 map_length = orig_bio->bi_size; 5823 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 5824 &map_length, NULL, 0); 5825 if (ret) { 5826 bio_put(orig_bio); 5827 return -EIO; 5828 } 5829 5830 if (map_length >= orig_bio->bi_size) { 5831 bio = orig_bio; 5832 goto submit; 5833 } 5834 5835 async_submit = 1; 5836 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 5837 if (!bio) 5838 return -ENOMEM; 5839 bio->bi_private = dip; 5840 bio->bi_end_io = btrfs_end_dio_bio; 5841 atomic_inc(&dip->pending_bios); 5842 5843 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 5844 if (unlikely(map_length < submit_len + bvec->bv_len || 5845 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 5846 bvec->bv_offset) < bvec->bv_len)) { 5847 /* 5848 * inc the count before we submit the bio so 5849 * we know the end IO handler won't happen before 5850 * we inc the count. Otherwise, the dip might get freed 5851 * before we're done setting it up 5852 */ 5853 atomic_inc(&dip->pending_bios); 5854 ret = __btrfs_submit_dio_bio(bio, inode, rw, 5855 file_offset, skip_sum, 5856 csums, async_submit); 5857 if (ret) { 5858 bio_put(bio); 5859 atomic_dec(&dip->pending_bios); 5860 goto out_err; 5861 } 5862 5863 /* Write's use the ordered csums */ 5864 if (!write && !skip_sum) 5865 csums = csums + nr_pages; 5866 start_sector += submit_len >> 9; 5867 file_offset += submit_len; 5868 5869 submit_len = 0; 5870 nr_pages = 0; 5871 5872 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, 5873 start_sector, GFP_NOFS); 5874 if (!bio) 5875 goto out_err; 5876 bio->bi_private = dip; 5877 bio->bi_end_io = btrfs_end_dio_bio; 5878 5879 map_length = orig_bio->bi_size; 5880 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 5881 &map_length, NULL, 0); 5882 if (ret) { 5883 bio_put(bio); 5884 goto out_err; 5885 } 5886 } else { 5887 submit_len += bvec->bv_len; 5888 nr_pages ++; 5889 bvec++; 5890 } 5891 } 5892 5893 submit: 5894 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, 5895 csums, async_submit); 5896 if (!ret) 5897 return 0; 5898 5899 bio_put(bio); 5900 out_err: 5901 dip->errors = 1; 5902 /* 5903 * before atomic variable goto zero, we must 5904 * make sure dip->errors is perceived to be set. 
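         *
         * (this pairs with the atomic_dec_and_test() below and in
         * btrfs_end_dio_bio(): whichever bio performs the final decrement
         * is guaranteed to observe errors != 0)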
5905 */ 5906 smp_mb__before_atomic_dec(); 5907 if (atomic_dec_and_test(&dip->pending_bios)) 5908 bio_io_error(dip->orig_bio); 5909 5910 /* bio_end_io() will handle error, so we needn't return it */ 5911 return 0; 5912 } 5913 5914 static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, 5915 loff_t file_offset) 5916 { 5917 struct btrfs_root *root = BTRFS_I(inode)->root; 5918 struct btrfs_dio_private *dip; 5919 struct bio_vec *bvec = bio->bi_io_vec; 5920 int skip_sum; 5921 int write = rw & REQ_WRITE; 5922 int ret = 0; 5923 5924 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 5925 5926 dip = kmalloc(sizeof(*dip), GFP_NOFS); 5927 if (!dip) { 5928 ret = -ENOMEM; 5929 goto free_ordered; 5930 } 5931 dip->csums = NULL; 5932 5933 /* Write's use the ordered csum stuff, so we don't need dip->csums */ 5934 if (!write && !skip_sum) { 5935 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); 5936 if (!dip->csums) { 5937 kfree(dip); 5938 ret = -ENOMEM; 5939 goto free_ordered; 5940 } 5941 } 5942 5943 dip->private = bio->bi_private; 5944 dip->inode = inode; 5945 dip->logical_offset = file_offset; 5946 5947 dip->bytes = 0; 5948 do { 5949 dip->bytes += bvec->bv_len; 5950 bvec++; 5951 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); 5952 5953 dip->disk_bytenr = (u64)bio->bi_sector << 9; 5954 bio->bi_private = dip; 5955 dip->errors = 0; 5956 dip->orig_bio = bio; 5957 atomic_set(&dip->pending_bios, 0); 5958 5959 if (write) 5960 bio->bi_end_io = btrfs_endio_direct_write; 5961 else 5962 bio->bi_end_io = btrfs_endio_direct_read; 5963 5964 ret = btrfs_submit_direct_hook(rw, dip, skip_sum); 5965 if (!ret) 5966 return; 5967 free_ordered: 5968 /* 5969 * If this is a write, we need to clean up the reserved space and kill 5970 * the ordered extent. 5971 */ 5972 if (write) { 5973 struct btrfs_ordered_extent *ordered; 5974 ordered = btrfs_lookup_ordered_extent(inode, file_offset); 5975 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && 5976 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) 5977 btrfs_free_reserved_extent(root, ordered->start, 5978 ordered->disk_len); 5979 btrfs_put_ordered_extent(ordered); 5980 btrfs_put_ordered_extent(ordered); 5981 } 5982 bio_endio(bio, ret); 5983 } 5984 5985 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, 5986 const struct iovec *iov, loff_t offset, 5987 unsigned long nr_segs) 5988 { 5989 int seg; 5990 int i; 5991 size_t size; 5992 unsigned long addr; 5993 unsigned blocksize_mask = root->sectorsize - 1; 5994 ssize_t retval = -EINVAL; 5995 loff_t end = offset; 5996 5997 if (offset & blocksize_mask) 5998 goto out; 5999 6000 /* Check the memory alignment. Blocks cannot straddle pages */ 6001 for (seg = 0; seg < nr_segs; seg++) { 6002 addr = (unsigned long)iov[seg].iov_base; 6003 size = iov[seg].iov_len; 6004 end += size; 6005 if ((addr & blocksize_mask) || (size & blocksize_mask)) 6006 goto out; 6007 6008 /* If this is a write we don't need to check anymore */ 6009 if (rw & WRITE) 6010 continue; 6011 6012 /* 6013 * Check to make sure we don't have duplicate iov_base's in this 6014 * iovec, if so return EINVAL, otherwise we'll get csum errors 6015 * when reading back. 
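                 *
                 * (e.g. two iovecs sharing one buffer would read two file
                 * ranges into the same page, and the page contents can
                 * then match at most one of the expected checksums)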
6016 */ 6017 for (i = seg + 1; i < nr_segs; i++) { 6018 if (iov[seg].iov_base == iov[i].iov_base) 6019 goto out; 6020 } 6021 } 6022 retval = 0; 6023 out: 6024 return retval; 6025 } 6026 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 6027 const struct iovec *iov, loff_t offset, 6028 unsigned long nr_segs) 6029 { 6030 struct file *file = iocb->ki_filp; 6031 struct inode *inode = file->f_mapping->host; 6032 struct btrfs_ordered_extent *ordered; 6033 struct extent_state *cached_state = NULL; 6034 u64 lockstart, lockend; 6035 ssize_t ret; 6036 int writing = rw & WRITE; 6037 int write_bits = 0; 6038 size_t count = iov_length(iov, nr_segs); 6039 6040 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, 6041 offset, nr_segs)) { 6042 return 0; 6043 } 6044 6045 lockstart = offset; 6046 lockend = offset + count - 1; 6047 6048 if (writing) { 6049 ret = btrfs_delalloc_reserve_space(inode, count); 6050 if (ret) 6051 goto out; 6052 } 6053 6054 while (1) { 6055 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6056 0, &cached_state, GFP_NOFS); 6057 /* 6058 * We're concerned with the entire range that we're going to be 6059 * doing DIO to, so we need to make sure theres no ordered 6060 * extents in this range. 6061 */ 6062 ordered = btrfs_lookup_ordered_range(inode, lockstart, 6063 lockend - lockstart + 1); 6064 if (!ordered) 6065 break; 6066 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6067 &cached_state, GFP_NOFS); 6068 btrfs_start_ordered_extent(inode, ordered, 1); 6069 btrfs_put_ordered_extent(ordered); 6070 cond_resched(); 6071 } 6072 6073 /* 6074 * we don't use btrfs_set_extent_delalloc because we don't want 6075 * the dirty or uptodate bits 6076 */ 6077 if (writing) { 6078 write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; 6079 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6080 EXTENT_DELALLOC, 0, NULL, &cached_state, 6081 GFP_NOFS); 6082 if (ret) { 6083 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6084 lockend, EXTENT_LOCKED | write_bits, 6085 1, 0, &cached_state, GFP_NOFS); 6086 goto out; 6087 } 6088 } 6089 6090 free_extent_state(cached_state); 6091 cached_state = NULL; 6092 6093 ret = __blockdev_direct_IO(rw, iocb, inode, 6094 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 6095 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, 6096 btrfs_submit_direct, 0); 6097 6098 if (ret < 0 && ret != -EIOCBQUEUED) { 6099 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, 6100 offset + iov_length(iov, nr_segs) - 1, 6101 EXTENT_LOCKED | write_bits, 1, 0, 6102 &cached_state, GFP_NOFS); 6103 } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { 6104 /* 6105 * We're falling back to buffered, unlock the section we didn't 6106 * do IO on. 
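                 *
                 * (__blockdev_direct_IO() returns how many bytes it
                 * actually submitted, so only the untouched tail from
                 * offset + ret onward is unlocked here)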
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
			const struct iovec *iov, loff_t offset,
			unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	u64 lockstart, lockend;
	ssize_t ret;
	int writing = rw & WRITE;
	int write_bits = 0;
	size_t count = iov_length(iov, nr_segs);

	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
			    offset, nr_segs)) {
		return 0;
	}

	lockstart = offset;
	lockend = offset + count - 1;

	if (writing) {
		ret = btrfs_delalloc_reserve_space(inode, count);
		if (ret)
			goto out;
	}

	while (1) {
		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				 0, &cached_state, GFP_NOFS);
		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there are no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(inode, lockstart,
						     lockend - lockstart + 1);
		if (!ordered)
			break;
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				     &cached_state, GFP_NOFS);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}

	/*
	 * we don't use btrfs_set_extent_delalloc because we don't want
	 * the dirty or uptodate bits
	 */
	if (writing) {
		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				     EXTENT_DELALLOC, 0, NULL, &cached_state,
				     GFP_NOFS);
		if (ret) {
			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
					 lockend, EXTENT_LOCKED | write_bits,
					 1, 0, &cached_state, GFP_NOFS);
			goto out;
		}
	}

	free_extent_state(cached_state);
	cached_state = NULL;

	ret = __blockdev_direct_IO(rw, iocb, inode,
		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
		   btrfs_submit_direct, 0);

	if (ret < 0 && ret != -EIOCBQUEUED) {
		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
			      offset + iov_length(iov, nr_segs) - 1,
			      EXTENT_LOCKED | write_bits, 1, 0,
			      &cached_state, GFP_NOFS);
	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
		/*
		 * We're falling back to buffered, unlock the section we didn't
		 * do IO on.
		 */
		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
			      offset + iov_length(iov, nr_segs) - 1,
			      EXTENT_LOCKED | write_bits, 1, 0,
			      &cached_state, GFP_NOFS);
	}
out:
	free_extent_state(cached_state);
	return ret;
}

static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		__u64 start, __u64 len)
{
	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
}

int btrfs_readpage(struct file *file, struct page *page)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(page->mapping->host)->io_tree;
	return extent_read_full_page(tree, page, btrfs_get_extent, 0);
}

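/*
 * If we are being called from reclaim (PF_MEMALLOC is set) we can't
 * risk starting extent IO here, so just redirty the page and leave it
 * for regular writeback to deal with later.
 */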
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct extent_io_tree *tree;

	if (current->flags & PF_MEMALLOC) {
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return 0;
	}
	tree = &BTRFS_I(page->mapping->host)->io_tree;
	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
}

int btrfs_writepages(struct address_space *mapping,
		     struct writeback_control *wbc)
{
	struct extent_io_tree *tree;

	tree = &BTRFS_I(mapping->host)->io_tree;
	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
}

static int
btrfs_readpages(struct file *file, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(mapping->host)->io_tree;
	return extent_readpages(tree, mapping, pages, nr_pages,
				btrfs_get_extent);
}

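/*
 * Try to drop the extent map and extent state attached to a page so the
 * page can be released.  When try_release_extent_mapping() succeeds we
 * also drop the private reference the tree held on the page.
 */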
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
	struct extent_io_tree *tree;
	struct extent_map_tree *map;
	int ret;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	map = &BTRFS_I(page->mapping->host)->extent_tree;
	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
	if (ret == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		page_cache_release(page);
	}
	return ret;
}

static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
	if (PageWriteback(page) || PageDirty(page))
		return 0;
	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
}

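/*
 * Called when a page is being removed from the page cache.  IO on the
 * page will never be started, so any ordered extent covering it has to
 * be accounted for here, and all of the extent state we attached to the
 * range must be torn down.
 */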
static void btrfs_invalidatepage(struct page *page, unsigned long offset)
{
	struct extent_io_tree *tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	u64 page_start = page_offset(page);
	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;

	/*
	 * we have the page locked, so new writeback can't start,
	 * and the dirty bit won't be cleared while we are here.
	 *
	 * Wait for IO on this page so that we can safely clear
	 * the PagePrivate2 bit and do ordered accounting
	 */
	wait_on_page_writeback(page);

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	if (offset) {
		btrfs_releasepage(page, GFP_NOFS);
		return;
	}
	lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
			 GFP_NOFS);
	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
					   page_offset(page));
	if (ordered) {
		/*
		 * IO on this page will never be started, so we need
		 * to account for any ordered extents now
		 */
		clear_extent_bit(tree, page_start, page_end,
				 EXTENT_DIRTY | EXTENT_DELALLOC |
				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
				 &cached_state, GFP_NOFS);
		/*
		 * whoever cleared the private bit is responsible
		 * for the finish_ordered_io
		 */
		if (TestClearPagePrivate2(page)) {
			btrfs_finish_ordered_io(page->mapping->host,
						page_start, page_end);
		}
		btrfs_put_ordered_extent(ordered);
		cached_state = NULL;
		lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
				 GFP_NOFS);
	}
	clear_extent_bit(tree, page_start, page_end,
		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
		 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
	__btrfs_releasepage(page, GFP_NOFS);

	ClearPageChecked(page);
	if (PagePrivate(page)) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		page_cache_release(page);
	}
}

/*
 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied. Hence we must
 * be careful to check for EOF conditions here. We set the page up correctly
 * for a written page which means we get ENOSPC checking when writing into
 * holes and correct delalloc and unwritten extent mapping on filesystems that
 * support these features.
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.  Because
 * vmtruncate() writes the inode size before removing pages, once we have the
 * page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 */
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = fdentry(vma->vm_file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	char *kaddr;
	unsigned long zero_start;
	loff_t size;
	int ret;
	u64 page_start;
	u64 page_end;

	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
	if (ret) {
		if (ret == -ENOMEM)
			ret = VM_FAULT_OOM;
		else /* -ENOSPC, -EIO, etc */
			ret = VM_FAULT_SIGBUS;
		goto out;
	}

	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
	lock_page(page);
	size = i_size_read(inode);
	page_start = page_offset(page);
	page_end = page_start + PAGE_CACHE_SIZE - 1;

	if ((page->mapping != inode->i_mapping) ||
	    (page_start >= size)) {
		/* page got truncated out from underneath us */
		goto out_unlock;
	}
	wait_on_page_writeback(page);

	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
			 GFP_NOFS);
	set_page_extent_mapped(page);

	/*
	 * we can't set the delalloc bits if there are pending ordered
	 * extents.  Drop our locks and wait for them to finish
	 */
	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered) {
		unlock_extent_cached(io_tree, page_start, page_end,
				     &cached_state, GFP_NOFS);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	/*
	 * XXX - page_mkwrite gets called every time the page is dirtied, even
	 * if it was already dirty, so for space accounting reasons we need to
	 * clear any delalloc bits for the range we are fixing to save.  There
	 * is probably a better way to do this, but for now keep consistent with
	 * prepare_pages in the normal write path.
	 */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
			  0, 0, &cached_state, GFP_NOFS);

	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
					&cached_state);
	if (ret) {
		unlock_extent_cached(io_tree, page_start, page_end,
				     &cached_state, GFP_NOFS);
		ret = VM_FAULT_SIGBUS;
		goto out_unlock;
	}
	ret = 0;

	/* page is wholly or partially inside EOF */
	if (page_start + PAGE_CACHE_SIZE > size)
		zero_start = size & ~PAGE_CACHE_MASK;
	else
		zero_start = PAGE_CACHE_SIZE;

	if (zero_start != PAGE_CACHE_SIZE) {
		kaddr = kmap(page);
		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
		flush_dcache_page(page);
		kunmap(page);
	}
	ClearPageChecked(page);
	set_page_dirty(page);
	SetPageUptodate(page);

	BTRFS_I(inode)->last_trans = root->fs_info->generation;
	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;

	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);

out_unlock:
	if (!ret)
		return VM_FAULT_LOCKED;
	unlock_page(page);
	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
out:
	return ret;
}

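/*
 * Truncate the items in the btree back to the current i_size: zero the
 * tail block, wait for ordered IO beyond the new size, and then drop
 * extent items a transaction at a time so we never pin a huge amount of
 * metadata in a single transaction.
 */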
static int btrfs_truncate(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *rsv;
	int ret;
	int err = 0;
	struct btrfs_trans_handle *trans;
	unsigned long nr;
	u64 mask = root->sectorsize - 1;
	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);

	ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
	if (ret)
		return ret;

	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);

	/*
	 * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
	 * 3 things going on here
	 *
	 * 1) We need to reserve space for our orphan item and the space to
	 * delete our orphan item.  Lord knows we don't want to have a dangling
	 * orphan item because we didn't reserve space to remove it.
	 *
	 * 2) We need to reserve space to update our inode.
	 *
	 * 3) We need to have something to cache all the space that is going to
	 * be freed up by the truncate operation, but also have some slack
	 * space reserved in case it uses space during the truncate (thank you
	 * very much snapshotting).
	 *
	 * And we need these to all be separate.  The fact is we can use a lot
	 * of space doing the truncate, and we have no earthly idea how much
	 * space we will use, so we need the truncate reservation to be
	 * separate so it doesn't end up using space reserved for updating the
	 * inode or removing the orphan item.  We also need to be able to stop
	 * the transaction and start a new one, which means we need to be able
	 * to update the inode several times, and we have no way of knowing how
	 * many times that will be, so we can't just reserve 1 item for the
	 * entirety of the operation, so that has to be done separately as
	 * well.  Then there is the orphan item, which does indeed need to be
	 * held on to for the whole operation, and we need nobody to touch this
	 * reserved space except the orphan code.
	 *
	 * So that leaves us with
	 *
	 * 1) root->orphan_block_rsv - for the orphan deletion.
	 * 2) rsv - for the truncate reservation, which we will steal from the
	 * transaction reservation.
	 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
	 * updating the inode.
	 */
	rsv = btrfs_alloc_block_rsv(root);
	if (!rsv)
		return -ENOMEM;
	rsv->size = min_size;

	/*
	 * 1 for the truncate slack space
	 * 1 for the orphan item we're going to add
	 * 1 for the orphan item deletion
	 * 1 for updating the inode.
	 */
	trans = btrfs_start_transaction(root, 4);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out;
	}

	/* Migrate the slack space for the truncate to our reserve */
	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
				      min_size);
	BUG_ON(ret);

	ret = btrfs_orphan_add(trans, inode);
	if (ret) {
		btrfs_end_transaction(trans, root);
		goto out;
	}

	/*
	 * setattr is responsible for setting the ordered_data_close flag,
	 * but that is only tested during the last file release.  That
	 * could happen well after the next commit, leaving a great big
	 * window where new writes may get lost if someone chooses to write
	 * to this file after truncating to zero
	 *
	 * The inode doesn't have any dirty data here, and so if we commit
	 * this is a noop.  If someone immediately starts writing to the inode
	 * it is very likely we'll catch some of their writes in this
	 * transaction, and the commit will find this file on the ordered
	 * data list with good things to send down.
	 *
	 * This is a best effort solution, there is still a window where
	 * using truncate to replace the contents of the file will
	 * end up with a zero length file after a crash.
	 */
	if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
		btrfs_add_ordered_operation(trans, root, inode);

	while (1) {
		ret = btrfs_block_rsv_refill(root, rsv, min_size);
		if (ret) {
			/*
			 * This can only happen with the original transaction we
			 * started above, every other time we shouldn't have a
			 * transaction started yet.
			 */
			if (ret == -EAGAIN)
				goto end_trans;
			err = ret;
			break;
		}

		if (!trans) {
			/* Just need the 1 for updating the inode */
			trans = btrfs_start_transaction(root, 1);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto out;
			}
		}

		trans->block_rsv = rsv;

		ret = btrfs_truncate_inode_items(trans, root, inode,
						 inode->i_size,
						 BTRFS_EXTENT_DATA_KEY);
		if (ret != -EAGAIN) {
			err = ret;
			break;
		}

		trans->block_rsv = &root->fs_info->trans_block_rsv;
		ret = btrfs_update_inode(trans, root, inode);
		if (ret) {
			err = ret;
			break;
		}
end_trans:
		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		trans = NULL;
		btrfs_btree_balance_dirty(root, nr);
	}

	if (ret == 0 && inode->i_nlink > 0) {
		trans->block_rsv = root->orphan_block_rsv;
		ret = btrfs_orphan_del(trans, inode);
		if (ret)
			err = ret;
	} else if (ret && inode->i_nlink > 0) {
		/*
		 * Failed to do the truncate, remove us from the in memory
		 * orphan list.
		 */
		ret = btrfs_orphan_del(NULL, inode);
	}

	trans->block_rsv = &root->fs_info->trans_block_rsv;
	ret = btrfs_update_inode(trans, root, inode);
	if (ret && !err)
		err = ret;

	nr = trans->blocks_used;
	ret = btrfs_end_transaction_throttle(trans, root);
	btrfs_btree_balance_dirty(root, nr);

out:
	btrfs_free_block_rsv(root, rsv);

	if (ret && !err)
		err = ret;

	return err;
}

/*
 * create a new subvolume directory/inode (helper for the ioctl).
 */
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
			     struct btrfs_root *new_root, u64 new_dirid)
{
	struct inode *inode;
	int err;
	u64 index = 0;

	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
				new_dirid, S_IFDIR | 0700, &index);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	inode->i_op = &btrfs_dir_inode_operations;
	inode->i_fop = &btrfs_dir_file_operations;

	set_nlink(inode, 1);
	btrfs_i_size_write(inode, 0);

	err = btrfs_update_inode(trans, new_root, inode);
	BUG_ON(err);

	iput(inode);
	return 0;
}

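/*
 * Allocate a new in-memory btrfs inode from the inode slab and bring
 * every field to a known starting state.  The embedded VFS inode itself
 * is initialized once per slab object via init_once().
 */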
struct inode *btrfs_alloc_inode(struct super_block *sb)
{
	struct btrfs_inode *ei;
	struct inode *inode;

	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
	if (!ei)
		return NULL;

	ei->root = NULL;
	ei->space_info = NULL;
	ei->generation = 0;
	ei->sequence = 0;
	ei->last_trans = 0;
	ei->last_sub_trans = 0;
	ei->logged_trans = 0;
	ei->delalloc_bytes = 0;
	ei->disk_i_size = 0;
	ei->flags = 0;
	ei->csum_bytes = 0;
	ei->index_cnt = (u64)-1;
	ei->last_unlink_trans = 0;

	spin_lock_init(&ei->lock);
	ei->outstanding_extents = 0;
	ei->reserved_extents = 0;

	ei->ordered_data_close = 0;
	ei->orphan_meta_reserved = 0;
	ei->dummy_inode = 0;
	ei->in_defrag = 0;
	ei->force_compress = BTRFS_COMPRESS_NONE;

	ei->delayed_node = NULL;

	inode = &ei->vfs_inode;
	extent_map_tree_init(&ei->extent_tree);
	extent_io_tree_init(&ei->io_tree, &inode->i_data);
	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
	mutex_init(&ei->log_mutex);
	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
	INIT_LIST_HEAD(&ei->i_orphan);
	INIT_LIST_HEAD(&ei->delalloc_inodes);
	INIT_LIST_HEAD(&ei->ordered_operations);
	RB_CLEAR_NODE(&ei->rb_node);

	return inode;
}

static void btrfs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}

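/*
 * Called once the last reference to an inode is gone.  Warn about any
 * space accounting we would otherwise leak, drop the inode from the
 * ordered operation and orphan lists, and clean up any ordered extents
 * that never saw IO.
 */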
void btrfs_destroy_inode(struct inode *inode)
{
	struct btrfs_ordered_extent *ordered;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	WARN_ON(!list_empty(&inode->i_dentry));
	WARN_ON(inode->i_data.nrpages);
	WARN_ON(BTRFS_I(inode)->outstanding_extents);
	WARN_ON(BTRFS_I(inode)->reserved_extents);
	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
	WARN_ON(BTRFS_I(inode)->csum_bytes);

	/*
	 * This can happen where we create an inode, but somebody else also
	 * created the same inode and we need to destroy the one we already
	 * created.
	 */
	if (!root)
		goto free;

	/*
	 * Make sure we're properly removed from the ordered operation
	 * lists.
	 */
	smp_mb();
	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
		spin_lock(&root->fs_info->ordered_extent_lock);
		list_del_init(&BTRFS_I(inode)->ordered_operations);
		spin_unlock(&root->fs_info->ordered_extent_lock);
	}

	spin_lock(&root->orphan_lock);
	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
		printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
		       (unsigned long long)btrfs_ino(inode));
		list_del_init(&BTRFS_I(inode)->i_orphan);
	}
	spin_unlock(&root->orphan_lock);

	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
		if (!ordered)
			break;
		else {
			printk(KERN_ERR "btrfs found ordered "
			       "extent %llu %llu on inode cleanup\n",
			       (unsigned long long)ordered->file_offset,
			       (unsigned long long)ordered->len);
			btrfs_remove_ordered_extent(inode, ordered);
			btrfs_put_ordered_extent(ordered);
			btrfs_put_ordered_extent(ordered);
		}
	}
	inode_tree_del(inode);
	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
free:
	btrfs_remove_delayed_node(inode);
	call_rcu(&inode->i_rcu, btrfs_i_callback);
}

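/*
 * Decide whether the VFS should drop this inode from the cache when the
 * last reference is put.  Inodes of a dead root are always dropped,
 * except for the free space cache inodes, which are needed until the
 * root is fully cleaned up.
 */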
int btrfs_drop_inode(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (btrfs_root_refs(&root->root_item) == 0 &&
	    !btrfs_is_free_space_inode(root, inode))
		return 1;
	else
		return generic_drop_inode(inode);
}

static void init_once(void *foo)
{
	struct btrfs_inode *ei = (struct btrfs_inode *) foo;

	inode_init_once(&ei->vfs_inode);
}

void btrfs_destroy_cachep(void)
{
	if (btrfs_inode_cachep)
		kmem_cache_destroy(btrfs_inode_cachep);
	if (btrfs_trans_handle_cachep)
		kmem_cache_destroy(btrfs_trans_handle_cachep);
	if (btrfs_transaction_cachep)
		kmem_cache_destroy(btrfs_transaction_cachep);
	if (btrfs_path_cachep)
		kmem_cache_destroy(btrfs_path_cachep);
	if (btrfs_free_space_cachep)
		kmem_cache_destroy(btrfs_free_space_cachep);
}

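/*
 * Create all of the slab caches btrfs uses.  If any of them fail to
 * allocate, everything created so far is torn down again.
 */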
int btrfs_init_cachep(void)
{
	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
			sizeof(struct btrfs_inode), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
	if (!btrfs_inode_cachep)
		goto fail;

	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
			sizeof(struct btrfs_trans_handle), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_trans_handle_cachep)
		goto fail;

	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
			sizeof(struct btrfs_transaction), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_transaction_cachep)
		goto fail;

	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
			sizeof(struct btrfs_path), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_path_cachep)
		goto fail;

	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
			sizeof(struct btrfs_free_space), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_cachep)
		goto fail;

	return 0;
fail:
	btrfs_destroy_cachep();
	return -ENOMEM;
}

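/*
 * Delalloc bytes have not hit the disk accounting yet, so add them in
 * by hand to keep st_blocks accurate for files with dirty data still in
 * flight.
 */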
static int btrfs_getattr(struct vfsmount *mnt,
			 struct dentry *dentry, struct kstat *stat)
{
	struct inode *inode = dentry->d_inode;
	generic_fillattr(inode, stat);
	stat->dev = BTRFS_I(inode)->root->anon_dev;
	stat->blksize = PAGE_CACHE_SIZE;
	stat->blocks = (inode_get_bytes(inode) +
			BTRFS_I(inode)->delalloc_bytes) >> 9;
	return 0;
}

/*
 * If a file is moved, it will inherit the cow and compression flags of the new
 * directory.
 */
static void fixup_inode_flags(struct inode *dir, struct inode *inode)
{
	struct btrfs_inode *b_dir = BTRFS_I(dir);
	struct btrfs_inode *b_inode = BTRFS_I(inode);

	if (b_dir->flags & BTRFS_INODE_NODATACOW)
		b_inode->flags |= BTRFS_INODE_NODATACOW;
	else
		b_inode->flags &= ~BTRFS_INODE_NODATACOW;

	if (b_dir->flags & BTRFS_INODE_COMPRESS)
		b_inode->flags |= BTRFS_INODE_COMPRESS;
	else
		b_inode->flags &= ~BTRFS_INODE_COMPRESS;
}

static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
			struct inode *new_dir, struct dentry *new_dentry)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = new_dentry->d_inode;
	struct inode *old_inode = old_dentry->d_inode;
	struct timespec ctime = CURRENT_TIME;
	u64 index = 0;
	u64 root_objectid;
	int ret;
	u64 old_ino = btrfs_ino(old_inode);

	if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
		return -EPERM;

	/* we only allow rename subvolume link between subvolumes */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
		return -EXDEV;

	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
	    (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
		return -ENOTEMPTY;

	if (S_ISDIR(old_inode->i_mode) && new_inode &&
	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
		return -ENOTEMPTY;
	/*
	 * we're using rename to replace one file with another, and the
	 * replacement file is large.  Start IO on it now so
	 * we don't add too much work to the end of the transaction
	 */
	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
		filemap_flush(old_inode->i_mapping);

	/* close the racy window with snapshot create/destroy ioctl */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		down_read(&root->fs_info->subvol_sem);
	/*
	 * We want to reserve the absolute worst case amount of items.  So if
	 * both inodes are subvols and we need to unlink them then that would
	 * require 4 item modifications, but if they are both normal inodes it
	 * would require 5 item modifications, so we'll assume they're normal
	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
	 * should cover the worst case number of items we'll modify.
	 */
	trans = btrfs_start_transaction(root, 20);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root)
		btrfs_record_root_in_trans(trans, dest);

	ret = btrfs_set_inode_index(new_dir, &index);
	if (ret)
		goto out_fail;

	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		/* force full log commit if subvolume involved. */
		root->fs_info->last_trans_log_full_commit = trans->transid;
	} else {
		ret = btrfs_insert_inode_ref(trans, dest,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len,
					     old_ino,
					     btrfs_ino(new_dir), index);
		if (ret)
			goto out_fail;
		/*
		 * this is an ugly little race, but the rename is required
		 * to make sure that if we crash, the inode is either at the
		 * old name or the new one.  pinning the log transaction lets
		 * us make sure we don't allow a log commit to come in after
		 * we unlink the name but before we add the new name back in.
		 */
		btrfs_pin_log_trans(root);
	}
	/*
	 * make sure the inode gets flushed if it is replacing
	 * something.
	 */
	if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
		btrfs_add_ordered_operation(trans, root, old_inode);

	old_dir->i_ctime = old_dir->i_mtime = ctime;
	new_dir->i_ctime = new_dir->i_mtime = ctime;
	old_inode->i_ctime = ctime;

	if (old_dentry->d_parent != new_dentry->d_parent)
		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);

	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
					old_dentry->d_name.name,
					old_dentry->d_name.len);
	} else {
		ret = __btrfs_unlink_inode(trans, root, old_dir,
					old_dentry->d_inode,
					old_dentry->d_name.name,
					old_dentry->d_name.len);
		if (!ret)
			ret = btrfs_update_inode(trans, root, old_inode);
	}
	BUG_ON(ret);

	if (new_inode) {
		new_inode->i_ctime = CURRENT_TIME;
		if (unlikely(btrfs_ino(new_inode) ==
			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
			root_objectid = BTRFS_I(new_inode)->location.objectid;
			ret = btrfs_unlink_subvol(trans, dest, new_dir,
						root_objectid,
						new_dentry->d_name.name,
						new_dentry->d_name.len);
			BUG_ON(new_inode->i_nlink == 0);
		} else {
			ret = btrfs_unlink_inode(trans, dest, new_dir,
						 new_dentry->d_inode,
						 new_dentry->d_name.name,
						 new_dentry->d_name.len);
		}
		BUG_ON(ret);
		if (new_inode->i_nlink == 0) {
			ret = btrfs_orphan_add(trans, new_dentry->d_inode);
			BUG_ON(ret);
		}
	}

	fixup_inode_flags(new_dir, old_inode);

	ret = btrfs_add_link(trans, new_dir, old_inode,
			     new_dentry->d_name.name,
			     new_dentry->d_name.len, 0, index);
	BUG_ON(ret);

	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
		struct dentry *parent = new_dentry->d_parent;
		btrfs_log_new_name(trans, old_inode, old_dir, parent);
		btrfs_end_log_trans(root);
	}
out_fail:
	btrfs_end_transaction_throttle(trans, root);
out_notrans:
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&root->fs_info->subvol_sem);

	return ret;
}

/*
 * some fairly slow code that needs optimization.  This walks the list
 * of all the inodes with pending delalloc and forces them to disk.
 */
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
	struct list_head *head = &root->fs_info->delalloc_inodes;
	struct btrfs_inode *binode;
	struct inode *inode;

	if (root->fs_info->sb->s_flags & MS_RDONLY)
		return -EROFS;

	spin_lock(&root->fs_info->delalloc_lock);
	while (!list_empty(head)) {
		binode = list_entry(head->next, struct btrfs_inode,
				    delalloc_inodes);
		inode = igrab(&binode->vfs_inode);
		if (!inode)
			list_del_init(&binode->delalloc_inodes);
		spin_unlock(&root->fs_info->delalloc_lock);
		if (inode) {
			filemap_flush(inode->i_mapping);
			if (delay_iput)
				btrfs_add_delayed_iput(inode);
			else
				iput(inode);
		}
		cond_resched();
		spin_lock(&root->fs_info->delalloc_lock);
	}
	spin_unlock(&root->fs_info->delalloc_lock);

	/* the filemap_flush will queue IO into the worker threads, but
	 * we have to make sure the IO is actually started and that
	 * ordered extents get created before we return
	 */
	atomic_inc(&root->fs_info->async_submit_draining);
	while (atomic_read(&root->fs_info->nr_async_submits) ||
	       atomic_read(&root->fs_info->async_delalloc_pages)) {
		wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
	}
	atomic_dec(&root->fs_info->async_submit_draining);
	return 0;
}

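/*
 * Create a symlink.  The target is stored as an inline extent in the
 * btree, so its length is capped at what fits in one leaf; anything
 * longer gets -ENAMETOOLONG.
 */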
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
			 const char *symname)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct inode *inode = NULL;
	int err;
	int drop_inode = 0;
	u64 objectid;
	u64 index = 0;
	int name_len;
	int datasize;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	struct extent_buffer *leaf;
	unsigned long nr = 0;

	name_len = strlen(symname) + 1;
	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
		return -ENAMETOOLONG;

	/*
	 * 2 items for inode item and ref
	 * 2 items for dir items
	 * 1 item for xattr if selinux is on
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_unlock;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
				dentry->d_name.len, btrfs_ino(dir), objectid,
				S_IFLNK|S_IRWXUGO, &index);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto out_unlock;
	}

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err) {
		drop_inode = 1;
		goto out_unlock;
	}

	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
	if (err)
		drop_inode = 1;
	else {
		inode->i_mapping->a_ops = &btrfs_aops;
		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
		inode->i_fop = &btrfs_file_operations;
		inode->i_op = &btrfs_file_inode_operations;
		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
	}
	if (drop_inode)
		goto out_unlock;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		drop_inode = 1;
		goto out_unlock;
	}
	key.objectid = btrfs_ino(inode);
	key.offset = 0;
	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
	datasize = btrfs_file_extent_calc_inline_size(name_len);
	err = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (err) {
		drop_inode = 1;
		btrfs_free_path(path);
		goto out_unlock;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei,
				   BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_compression(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);

	ptr = btrfs_file_extent_inline_start(ei);
	write_extent_buffer(leaf, symname, ptr, name_len);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	inode->i_op = &btrfs_symlink_inode_operations;
	inode->i_mapping->a_ops = &btrfs_symlink_aops;
	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
	inode_set_bytes(inode, name_len);
	btrfs_i_size_write(inode, name_len - 1);
	err = btrfs_update_inode(trans, root, inode);
	if (err)
		drop_inode = 1;

out_unlock:
	nr = trans->blocks_used;
	btrfs_end_transaction_throttle(trans, root);
	if (drop_inode) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_btree_balance_dirty(root, nr);
	return err;
}

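/*
 * Preallocate extents for a file, one reserved extent per loop.  If the
 * caller supplied a transaction handle it is used for every insertion;
 * otherwise a small transaction is started and ended around each extent
 * so we never hold a transaction open across the whole range.
 */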
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
				       u64 start, u64 num_bytes, u64 min_size,
				       loff_t actual_len, u64 *alloc_hint,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key ins;
	u64 cur_offset = start;
	u64 i_size;
	int ret = 0;
	bool own_trans = true;

	if (trans)
		own_trans = false;
	while (num_bytes > 0) {
		if (own_trans) {
			trans = btrfs_start_transaction(root, 3);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
		}

		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
					   0, *alloc_hint, (u64)-1, &ins, 1);
		if (ret) {
			if (own_trans)
				btrfs_end_transaction(trans, root);
			break;
		}

		ret = insert_reserved_file_extent(trans, inode,
						  cur_offset, ins.objectid,
						  ins.offset, ins.offset,
						  ins.offset, 0, 0, 0,
						  BTRFS_FILE_EXTENT_PREALLOC);
		BUG_ON(ret);
		btrfs_drop_extent_cache(inode, cur_offset,
					cur_offset + ins.offset - 1, 0);

		num_bytes -= ins.offset;
		cur_offset += ins.offset;
		*alloc_hint = ins.objectid + ins.offset;

		inode->i_ctime = CURRENT_TIME;
		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    (actual_len > inode->i_size) &&
		    (cur_offset > inode->i_size)) {
			if (cur_offset > actual_len)
				i_size = actual_len;
			else
				i_size = cur_offset;
			i_size_write(inode, i_size);
			btrfs_ordered_update_i_size(inode, i_size, NULL);
		}

		ret = btrfs_update_inode(trans, root, inode);
		BUG_ON(ret);

		if (own_trans)
			btrfs_end_transaction(trans, root);
	}
	return ret;
}

int btrfs_prealloc_file_range(struct inode *inode, int mode,
			      u64 start, u64 num_bytes, u64 min_size,
			      loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint,
					   NULL);
}

int btrfs_prealloc_file_range_trans(struct inode *inode,
				    struct btrfs_trans_handle *trans, int mode,
				    u64 start, u64 num_bytes, u64 min_size,
				    loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint,
					   trans);
}

static int btrfs_set_page_dirty(struct page *page)
{
	return __set_page_dirty_nobuffers(page);
}

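/*
 * Permission checks.  On top of the generic checks, deny write access
 * to read-only subvolumes and to inodes carrying the btrfs readonly
 * flag.
 */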
static int btrfs_permission(struct inode *inode, int mask)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	umode_t mode = inode->i_mode;

	if (mask & MAY_WRITE &&
	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
		if (btrfs_root_readonly(root))
			return -EROFS;
		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
			return -EACCES;
	}
	return generic_permission(inode, mask);
}

static const struct inode_operations btrfs_dir_inode_operations = {
	.getattr	= btrfs_getattr,
	.lookup		= btrfs_lookup,
	.create		= btrfs_create,
	.unlink		= btrfs_unlink,
	.link		= btrfs_link,
	.mkdir		= btrfs_mkdir,
	.rmdir		= btrfs_rmdir,
	.rename		= btrfs_rename,
	.symlink	= btrfs_symlink,
	.setattr	= btrfs_setattr,
	.mknod		= btrfs_mknod,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
	.permission	= btrfs_permission,
	.get_acl	= btrfs_get_acl,
};

static const struct inode_operations btrfs_dir_ro_inode_operations = {
	.lookup		= btrfs_lookup,
	.permission	= btrfs_permission,
	.get_acl	= btrfs_get_acl,
};

static const struct file_operations btrfs_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.readdir	= btrfs_real_readdir,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_ioctl,
#endif
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
};

static struct extent_io_ops btrfs_extent_io_ops = {
	.fill_delalloc = run_delalloc_range,
	.submit_bio_hook = btrfs_submit_bio_hook,
	.merge_bio_hook = btrfs_merge_bio_hook,
	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
	.writepage_start_hook = btrfs_writepage_start_hook,
	.set_bit_hook = btrfs_set_bit_hook,
	.clear_bit_hook = btrfs_clear_bit_hook,
	.merge_extent_hook = btrfs_merge_extent_hook,
	.split_extent_hook = btrfs_split_extent_hook,
};

/*
 * btrfs doesn't support the bmap operation because swapfiles
 * use bmap to make a mapping of extents in the file.  They assume
 * these extents won't change over the life of the file and they
 * use the bmap result to do IO directly to the drive.
 *
 * the btrfs bmap call would return logical addresses that aren't
 * suitable for IO and they also will change frequently as COW
 * operations happen.  So, swapfile + btrfs == corruption.
 *
 * For now we're avoiding this by dropping bmap.
 */
static const struct address_space_operations btrfs_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.writepages	= btrfs_writepages,
	.readpages	= btrfs_readpages,
	.direct_IO	= btrfs_direct_IO,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
	.set_page_dirty	= btrfs_set_page_dirty,
	.error_remove_page = generic_error_remove_page,
};

static const struct address_space_operations btrfs_symlink_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
};

static const struct inode_operations btrfs_file_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
	.permission	= btrfs_permission,
	.fiemap		= btrfs_fiemap,
	.get_acl	= btrfs_get_acl,
};

static const struct inode_operations btrfs_special_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
	.get_acl	= btrfs_get_acl,
};

static const struct inode_operations btrfs_symlink_inode_operations = {
	.readlink	= generic_readlink,
	.follow_link	= page_follow_link_light,
	.put_link	= page_put_link,
	.getattr	= btrfs_getattr,
	.permission	= btrfs_permission,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
	.get_acl	= btrfs_get_acl,
};

const struct dentry_operations btrfs_dentry_operations = {
	.d_delete	= btrfs_dentry_delete,
	.d_release	= btrfs_dentry_release,
};