1 #include <linux/bitops.h> 2 #include <linux/slab.h> 3 #include <linux/bio.h> 4 #include <linux/mm.h> 5 #include <linux/pagemap.h> 6 #include <linux/page-flags.h> 7 #include <linux/module.h> 8 #include <linux/spinlock.h> 9 #include <linux/blkdev.h> 10 #include <linux/swap.h> 11 #include <linux/writeback.h> 12 #include <linux/pagevec.h> 13 #include "extent_io.h" 14 #include "extent_map.h" 15 #include "compat.h" 16 #include "ctree.h" 17 #include "btrfs_inode.h" 18 19 static struct kmem_cache *extent_state_cache; 20 static struct kmem_cache *extent_buffer_cache; 21 22 static LIST_HEAD(buffers); 23 static LIST_HEAD(states); 24 25 #define LEAK_DEBUG 0 26 #if LEAK_DEBUG 27 static DEFINE_SPINLOCK(leak_lock); 28 #endif 29 30 #define BUFFER_LRU_MAX 64 31 32 struct tree_entry { 33 u64 start; 34 u64 end; 35 struct rb_node rb_node; 36 }; 37 38 struct extent_page_data { 39 struct bio *bio; 40 struct extent_io_tree *tree; 41 get_extent_t *get_extent; 42 43 /* tells writepage not to lock the state bits for this range 44 * it still does the unlocking 45 */ 46 unsigned int extent_locked:1; 47 48 /* tells the submit_bio code to use a WRITE_SYNC */ 49 unsigned int sync_io:1; 50 }; 51 52 int __init extent_io_init(void) 53 { 54 extent_state_cache = kmem_cache_create("extent_state", 55 sizeof(struct extent_state), 0, 56 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 57 if (!extent_state_cache) 58 return -ENOMEM; 59 60 extent_buffer_cache = kmem_cache_create("extent_buffers", 61 sizeof(struct extent_buffer), 0, 62 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 63 if (!extent_buffer_cache) 64 goto free_state_cache; 65 return 0; 66 67 free_state_cache: 68 kmem_cache_destroy(extent_state_cache); 69 return -ENOMEM; 70 } 71 72 void extent_io_exit(void) 73 { 74 struct extent_state *state; 75 struct extent_buffer *eb; 76 77 while (!list_empty(&states)) { 78 state = list_entry(states.next, struct extent_state, leak_list); 79 printk(KERN_ERR "btrfs state leak: start %llu end %llu " 80 "state %lu 
in tree %p refs %d\n", 81 (unsigned long long)state->start, 82 (unsigned long long)state->end, 83 state->state, state->tree, atomic_read(&state->refs)); 84 list_del(&state->leak_list); 85 kmem_cache_free(extent_state_cache, state); 86 87 } 88 89 while (!list_empty(&buffers)) { 90 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 91 printk(KERN_ERR "btrfs buffer leak start %llu len %lu " 92 "refs %d\n", (unsigned long long)eb->start, 93 eb->len, atomic_read(&eb->refs)); 94 list_del(&eb->leak_list); 95 kmem_cache_free(extent_buffer_cache, eb); 96 } 97 if (extent_state_cache) 98 kmem_cache_destroy(extent_state_cache); 99 if (extent_buffer_cache) 100 kmem_cache_destroy(extent_buffer_cache); 101 } 102 103 void extent_io_tree_init(struct extent_io_tree *tree, 104 struct address_space *mapping, gfp_t mask) 105 { 106 tree->state = RB_ROOT; 107 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); 108 tree->ops = NULL; 109 tree->dirty_bytes = 0; 110 spin_lock_init(&tree->lock); 111 spin_lock_init(&tree->buffer_lock); 112 tree->mapping = mapping; 113 } 114 115 static struct extent_state *alloc_extent_state(gfp_t mask) 116 { 117 struct extent_state *state; 118 #if LEAK_DEBUG 119 unsigned long flags; 120 #endif 121 122 state = kmem_cache_alloc(extent_state_cache, mask); 123 if (!state) 124 return state; 125 state->state = 0; 126 state->private = 0; 127 state->tree = NULL; 128 #if LEAK_DEBUG 129 spin_lock_irqsave(&leak_lock, flags); 130 list_add(&state->leak_list, &states); 131 spin_unlock_irqrestore(&leak_lock, flags); 132 #endif 133 atomic_set(&state->refs, 1); 134 init_waitqueue_head(&state->wq); 135 return state; 136 } 137 138 void free_extent_state(struct extent_state *state) 139 { 140 if (!state) 141 return; 142 if (atomic_dec_and_test(&state->refs)) { 143 #if LEAK_DEBUG 144 unsigned long flags; 145 #endif 146 WARN_ON(state->tree); 147 #if LEAK_DEBUG 148 spin_lock_irqsave(&leak_lock, flags); 149 list_del(&state->leak_list); 150 spin_unlock_irqrestore(&leak_lock, 
flags); 151 #endif 152 kmem_cache_free(extent_state_cache, state); 153 } 154 } 155 156 static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 157 struct rb_node *node) 158 { 159 struct rb_node **p = &root->rb_node; 160 struct rb_node *parent = NULL; 161 struct tree_entry *entry; 162 163 while (*p) { 164 parent = *p; 165 entry = rb_entry(parent, struct tree_entry, rb_node); 166 167 if (offset < entry->start) 168 p = &(*p)->rb_left; 169 else if (offset > entry->end) 170 p = &(*p)->rb_right; 171 else 172 return parent; 173 } 174 175 entry = rb_entry(node, struct tree_entry, rb_node); 176 rb_link_node(node, parent, p); 177 rb_insert_color(node, root); 178 return NULL; 179 } 180 181 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 182 struct rb_node **prev_ret, 183 struct rb_node **next_ret) 184 { 185 struct rb_root *root = &tree->state; 186 struct rb_node *n = root->rb_node; 187 struct rb_node *prev = NULL; 188 struct rb_node *orig_prev = NULL; 189 struct tree_entry *entry; 190 struct tree_entry *prev_entry = NULL; 191 192 while (n) { 193 entry = rb_entry(n, struct tree_entry, rb_node); 194 prev = n; 195 prev_entry = entry; 196 197 if (offset < entry->start) 198 n = n->rb_left; 199 else if (offset > entry->end) 200 n = n->rb_right; 201 else 202 return n; 203 } 204 205 if (prev_ret) { 206 orig_prev = prev; 207 while (prev && offset > prev_entry->end) { 208 prev = rb_next(prev); 209 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 210 } 211 *prev_ret = prev; 212 prev = orig_prev; 213 } 214 215 if (next_ret) { 216 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 217 while (prev && offset < prev_entry->start) { 218 prev = rb_prev(prev); 219 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 220 } 221 *next_ret = prev; 222 } 223 return NULL; 224 } 225 226 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 227 u64 offset) 228 { 229 struct rb_node *prev = NULL; 230 struct rb_node *ret; 231 
232 ret = __etree_search(tree, offset, &prev, NULL); 233 if (!ret) 234 return prev; 235 return ret; 236 } 237 238 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 239 struct extent_state *other) 240 { 241 if (tree->ops && tree->ops->merge_extent_hook) 242 tree->ops->merge_extent_hook(tree->mapping->host, new, 243 other); 244 } 245 246 /* 247 * utility function to look for merge candidates inside a given range. 248 * Any extents with matching state are merged together into a single 249 * extent in the tree. Extents with EXTENT_IO in their state field 250 * are not merged because the end_io handlers need to be able to do 251 * operations on them without sleeping (or doing allocations/splits). 252 * 253 * This should be called with the tree lock held. 254 */ 255 static int merge_state(struct extent_io_tree *tree, 256 struct extent_state *state) 257 { 258 struct extent_state *other; 259 struct rb_node *other_node; 260 261 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 262 return 0; 263 264 other_node = rb_prev(&state->rb_node); 265 if (other_node) { 266 other = rb_entry(other_node, struct extent_state, rb_node); 267 if (other->end == state->start - 1 && 268 other->state == state->state) { 269 merge_cb(tree, state, other); 270 state->start = other->start; 271 other->tree = NULL; 272 rb_erase(&other->rb_node, &tree->state); 273 free_extent_state(other); 274 } 275 } 276 other_node = rb_next(&state->rb_node); 277 if (other_node) { 278 other = rb_entry(other_node, struct extent_state, rb_node); 279 if (other->start == state->end + 1 && 280 other->state == state->state) { 281 merge_cb(tree, state, other); 282 other->start = state->start; 283 state->tree = NULL; 284 rb_erase(&state->rb_node, &tree->state); 285 free_extent_state(state); 286 state = NULL; 287 } 288 } 289 290 return 0; 291 } 292 293 static int set_state_cb(struct extent_io_tree *tree, 294 struct extent_state *state, int *bits) 295 { 296 if (tree->ops && tree->ops->set_bit_hook) 
{ 297 return tree->ops->set_bit_hook(tree->mapping->host, 298 state, bits); 299 } 300 301 return 0; 302 } 303 304 static void clear_state_cb(struct extent_io_tree *tree, 305 struct extent_state *state, int *bits) 306 { 307 if (tree->ops && tree->ops->clear_bit_hook) 308 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 309 } 310 311 /* 312 * insert an extent_state struct into the tree. 'bits' are set on the 313 * struct before it is inserted. 314 * 315 * This may return -EEXIST if the extent is already there, in which case the 316 * state struct is freed. 317 * 318 * The tree lock is not taken internally. This is a utility function and 319 * probably isn't what you want to call (see set/clear_extent_bit). 320 */ 321 static int insert_state(struct extent_io_tree *tree, 322 struct extent_state *state, u64 start, u64 end, 323 int *bits) 324 { 325 struct rb_node *node; 326 int bits_to_set = *bits & ~EXTENT_CTLBITS; 327 int ret; 328 329 if (end < start) { 330 printk(KERN_ERR "btrfs end < start %llu %llu\n", 331 (unsigned long long)end, 332 (unsigned long long)start); 333 WARN_ON(1); 334 } 335 state->start = start; 336 state->end = end; 337 ret = set_state_cb(tree, state, bits); 338 if (ret) 339 return ret; 340 341 if (bits_to_set & EXTENT_DIRTY) 342 tree->dirty_bytes += end - start + 1; 343 state->state |= bits_to_set; 344 node = tree_insert(&tree->state, end, &state->rb_node); 345 if (node) { 346 struct extent_state *found; 347 found = rb_entry(node, struct extent_state, rb_node); 348 printk(KERN_ERR "btrfs found node %llu %llu on insert of " 349 "%llu %llu\n", (unsigned long long)found->start, 350 (unsigned long long)found->end, 351 (unsigned long long)start, (unsigned long long)end); 352 free_extent_state(state); 353 return -EEXIST; 354 } 355 state->tree = tree; 356 merge_state(tree, state); 357 return 0; 358 } 359 360 static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, 361 u64 split) 362 { 363 if (tree->ops && 
tree->ops->split_extent_hook) 364 return tree->ops->split_extent_hook(tree->mapping->host, 365 orig, split); 366 return 0; 367 } 368 369 /* 370 * split a given extent state struct in two, inserting the preallocated 371 * struct 'prealloc' as the newly created second half. 'split' indicates an 372 * offset inside 'orig' where it should be split. 373 * 374 * Before calling, 375 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 376 * are two extent state structs in the tree: 377 * prealloc: [orig->start, split - 1] 378 * orig: [ split, orig->end ] 379 * 380 * The tree locks are not taken by this function. They need to be held 381 * by the caller. 382 */ 383 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 384 struct extent_state *prealloc, u64 split) 385 { 386 struct rb_node *node; 387 388 split_cb(tree, orig, split); 389 390 prealloc->start = orig->start; 391 prealloc->end = split - 1; 392 prealloc->state = orig->state; 393 orig->start = split; 394 395 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 396 if (node) { 397 free_extent_state(prealloc); 398 return -EEXIST; 399 } 400 prealloc->tree = tree; 401 return 0; 402 } 403 404 /* 405 * utility function to clear some bits in an extent state struct. 406 * it will optionally wake up any one waiting on this state (wake == 1), or 407 * forcibly remove the state from the tree (delete == 1). 
408 * 409 * If no bits are set on the state struct after clearing things, the 410 * struct is freed and removed from the tree 411 */ 412 static int clear_state_bit(struct extent_io_tree *tree, 413 struct extent_state *state, 414 int *bits, int wake) 415 { 416 int bits_to_clear = *bits & ~EXTENT_CTLBITS; 417 int ret = state->state & bits_to_clear; 418 419 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 420 u64 range = state->end - state->start + 1; 421 WARN_ON(range > tree->dirty_bytes); 422 tree->dirty_bytes -= range; 423 } 424 clear_state_cb(tree, state, bits); 425 state->state &= ~bits_to_clear; 426 if (wake) 427 wake_up(&state->wq); 428 if (state->state == 0) { 429 if (state->tree) { 430 rb_erase(&state->rb_node, &tree->state); 431 state->tree = NULL; 432 free_extent_state(state); 433 } else { 434 WARN_ON(1); 435 } 436 } else { 437 merge_state(tree, state); 438 } 439 return ret; 440 } 441 442 /* 443 * clear some bits on a range in the tree. This may require splitting 444 * or inserting elements in the tree, so the gfp mask is used to 445 * indicate which allocations or sleeping are allowed. 446 * 447 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 448 * the given range from the tree regardless of state (ie for truncate). 449 * 450 * the range [start, end] is inclusive. 451 * 452 * This takes the tree lock, and returns < 0 on error, > 0 if any of the 453 * bits were already set, or zero if none of the bits were already set. 
454 */ 455 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 456 int bits, int wake, int delete, 457 struct extent_state **cached_state, 458 gfp_t mask) 459 { 460 struct extent_state *state; 461 struct extent_state *cached; 462 struct extent_state *prealloc = NULL; 463 struct rb_node *next_node; 464 struct rb_node *node; 465 u64 last_end; 466 int err; 467 int set = 0; 468 int clear = 0; 469 470 if (delete) 471 bits |= ~EXTENT_CTLBITS; 472 bits |= EXTENT_FIRST_DELALLOC; 473 474 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 475 clear = 1; 476 again: 477 if (!prealloc && (mask & __GFP_WAIT)) { 478 prealloc = alloc_extent_state(mask); 479 if (!prealloc) 480 return -ENOMEM; 481 } 482 483 spin_lock(&tree->lock); 484 if (cached_state) { 485 cached = *cached_state; 486 487 if (clear) { 488 *cached_state = NULL; 489 cached_state = NULL; 490 } 491 492 if (cached && cached->tree && cached->start == start) { 493 if (clear) 494 atomic_dec(&cached->refs); 495 state = cached; 496 goto hit_next; 497 } 498 if (clear) 499 free_extent_state(cached); 500 } 501 /* 502 * this search will find the extents that end after 503 * our range starts 504 */ 505 node = tree_search(tree, start); 506 if (!node) 507 goto out; 508 state = rb_entry(node, struct extent_state, rb_node); 509 hit_next: 510 if (state->start > end) 511 goto out; 512 WARN_ON(state->end < start); 513 last_end = state->end; 514 515 /* 516 * | ---- desired range ---- | 517 * | state | or 518 * | ------------- state -------------- | 519 * 520 * We need to split the extent we found, and may flip 521 * bits on second half. 522 * 523 * If the extent we found extends past our range, we 524 * just split and search again. It'll get split again 525 * the next time though. 526 * 527 * If the extent we found is inside our range, we clear 528 * the desired bit on it. 
529 */ 530 531 if (state->start < start) { 532 if (!prealloc) 533 prealloc = alloc_extent_state(GFP_ATOMIC); 534 err = split_state(tree, state, prealloc, start); 535 BUG_ON(err == -EEXIST); 536 prealloc = NULL; 537 if (err) 538 goto out; 539 if (state->end <= end) { 540 set |= clear_state_bit(tree, state, &bits, wake); 541 if (last_end == (u64)-1) 542 goto out; 543 start = last_end + 1; 544 } 545 goto search_again; 546 } 547 /* 548 * | ---- desired range ---- | 549 * | state | 550 * We need to split the extent, and clear the bit 551 * on the first half 552 */ 553 if (state->start <= end && state->end > end) { 554 if (!prealloc) 555 prealloc = alloc_extent_state(GFP_ATOMIC); 556 err = split_state(tree, state, prealloc, end + 1); 557 BUG_ON(err == -EEXIST); 558 if (wake) 559 wake_up(&state->wq); 560 561 set |= clear_state_bit(tree, prealloc, &bits, wake); 562 563 prealloc = NULL; 564 goto out; 565 } 566 567 if (state->end < end && prealloc && !need_resched()) 568 next_node = rb_next(&state->rb_node); 569 else 570 next_node = NULL; 571 572 set |= clear_state_bit(tree, state, &bits, wake); 573 if (last_end == (u64)-1) 574 goto out; 575 start = last_end + 1; 576 if (start <= end && next_node) { 577 state = rb_entry(next_node, struct extent_state, 578 rb_node); 579 if (state->start == start) 580 goto hit_next; 581 } 582 goto search_again; 583 584 out: 585 spin_unlock(&tree->lock); 586 if (prealloc) 587 free_extent_state(prealloc); 588 589 return set; 590 591 search_again: 592 if (start > end) 593 goto out; 594 spin_unlock(&tree->lock); 595 if (mask & __GFP_WAIT) 596 cond_resched(); 597 goto again; 598 } 599 600 static int wait_on_state(struct extent_io_tree *tree, 601 struct extent_state *state) 602 __releases(tree->lock) 603 __acquires(tree->lock) 604 { 605 DEFINE_WAIT(wait); 606 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 607 spin_unlock(&tree->lock); 608 schedule(); 609 spin_lock(&tree->lock); 610 finish_wait(&state->wq, &wait); 611 return 0; 612 } 613 
614 /* 615 * waits for one or more bits to clear on a range in the state tree. 616 * The range [start, end] is inclusive. 617 * The tree lock is taken by this function 618 */ 619 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) 620 { 621 struct extent_state *state; 622 struct rb_node *node; 623 624 spin_lock(&tree->lock); 625 again: 626 while (1) { 627 /* 628 * this search will find all the extents that end after 629 * our range starts 630 */ 631 node = tree_search(tree, start); 632 if (!node) 633 break; 634 635 state = rb_entry(node, struct extent_state, rb_node); 636 637 if (state->start > end) 638 goto out; 639 640 if (state->state & bits) { 641 start = state->start; 642 atomic_inc(&state->refs); 643 wait_on_state(tree, state); 644 free_extent_state(state); 645 goto again; 646 } 647 start = state->end + 1; 648 649 if (start > end) 650 break; 651 652 if (need_resched()) { 653 spin_unlock(&tree->lock); 654 cond_resched(); 655 spin_lock(&tree->lock); 656 } 657 } 658 out: 659 spin_unlock(&tree->lock); 660 return 0; 661 } 662 663 static int set_state_bits(struct extent_io_tree *tree, 664 struct extent_state *state, 665 int *bits) 666 { 667 int ret; 668 int bits_to_set = *bits & ~EXTENT_CTLBITS; 669 670 ret = set_state_cb(tree, state, bits); 671 if (ret) 672 return ret; 673 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 674 u64 range = state->end - state->start + 1; 675 tree->dirty_bytes += range; 676 } 677 state->state |= bits_to_set; 678 679 return 0; 680 } 681 682 static void cache_state(struct extent_state *state, 683 struct extent_state **cached_ptr) 684 { 685 if (cached_ptr && !(*cached_ptr)) { 686 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 687 *cached_ptr = state; 688 atomic_inc(&state->refs); 689 } 690 } 691 } 692 693 /* 694 * set some bits on a range in the tree. This may require allocations or 695 * sleeping, so the gfp mask is used to indicate what is allowed. 
696 * 697 * If any of the exclusive bits are set, this will fail with -EEXIST if some 698 * part of the range already has the desired bits set. The start of the 699 * existing range is returned in failed_start in this case. 700 * 701 * [start, end] is inclusive This takes the tree lock. 702 */ 703 704 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 705 int bits, int exclusive_bits, u64 *failed_start, 706 struct extent_state **cached_state, gfp_t mask) 707 { 708 struct extent_state *state; 709 struct extent_state *prealloc = NULL; 710 struct rb_node *node; 711 int err = 0; 712 u64 last_start; 713 u64 last_end; 714 715 bits |= EXTENT_FIRST_DELALLOC; 716 again: 717 if (!prealloc && (mask & __GFP_WAIT)) { 718 prealloc = alloc_extent_state(mask); 719 if (!prealloc) 720 return -ENOMEM; 721 } 722 723 spin_lock(&tree->lock); 724 if (cached_state && *cached_state) { 725 state = *cached_state; 726 if (state->start == start && state->tree) { 727 node = &state->rb_node; 728 goto hit_next; 729 } 730 } 731 /* 732 * this search will find all the extents that end after 733 * our range starts. 
734 */ 735 node = tree_search(tree, start); 736 if (!node) { 737 err = insert_state(tree, prealloc, start, end, &bits); 738 prealloc = NULL; 739 BUG_ON(err == -EEXIST); 740 goto out; 741 } 742 state = rb_entry(node, struct extent_state, rb_node); 743 hit_next: 744 last_start = state->start; 745 last_end = state->end; 746 747 /* 748 * | ---- desired range ---- | 749 * | state | 750 * 751 * Just lock what we found and keep going 752 */ 753 if (state->start == start && state->end <= end) { 754 struct rb_node *next_node; 755 if (state->state & exclusive_bits) { 756 *failed_start = state->start; 757 err = -EEXIST; 758 goto out; 759 } 760 761 err = set_state_bits(tree, state, &bits); 762 if (err) 763 goto out; 764 765 cache_state(state, cached_state); 766 merge_state(tree, state); 767 if (last_end == (u64)-1) 768 goto out; 769 770 start = last_end + 1; 771 if (start < end && prealloc && !need_resched()) { 772 next_node = rb_next(node); 773 if (next_node) { 774 state = rb_entry(next_node, struct extent_state, 775 rb_node); 776 if (state->start == start) 777 goto hit_next; 778 } 779 } 780 goto search_again; 781 } 782 783 /* 784 * | ---- desired range ---- | 785 * | state | 786 * or 787 * | ------------- state -------------- | 788 * 789 * We need to split the extent we found, and may flip bits on 790 * second half. 791 * 792 * If the extent we found extends past our 793 * range, we just split and search again. It'll get split 794 * again the next time though. 795 * 796 * If the extent we found is inside our range, we set the 797 * desired bit on it. 
798 */ 799 if (state->start < start) { 800 if (state->state & exclusive_bits) { 801 *failed_start = start; 802 err = -EEXIST; 803 goto out; 804 } 805 err = split_state(tree, state, prealloc, start); 806 BUG_ON(err == -EEXIST); 807 prealloc = NULL; 808 if (err) 809 goto out; 810 if (state->end <= end) { 811 err = set_state_bits(tree, state, &bits); 812 if (err) 813 goto out; 814 cache_state(state, cached_state); 815 merge_state(tree, state); 816 if (last_end == (u64)-1) 817 goto out; 818 start = last_end + 1; 819 } 820 goto search_again; 821 } 822 /* 823 * | ---- desired range ---- | 824 * | state | or | state | 825 * 826 * There's a hole, we need to insert something in it and 827 * ignore the extent we found. 828 */ 829 if (state->start > start) { 830 u64 this_end; 831 if (end < last_start) 832 this_end = end; 833 else 834 this_end = last_start - 1; 835 err = insert_state(tree, prealloc, start, this_end, 836 &bits); 837 BUG_ON(err == -EEXIST); 838 if (err) { 839 prealloc = NULL; 840 goto out; 841 } 842 cache_state(prealloc, cached_state); 843 prealloc = NULL; 844 start = this_end + 1; 845 goto search_again; 846 } 847 /* 848 * | ---- desired range ---- | 849 * | state | 850 * We need to split the extent, and set the bit 851 * on the first half 852 */ 853 if (state->start <= end && state->end > end) { 854 if (state->state & exclusive_bits) { 855 *failed_start = start; 856 err = -EEXIST; 857 goto out; 858 } 859 err = split_state(tree, state, prealloc, end + 1); 860 BUG_ON(err == -EEXIST); 861 862 err = set_state_bits(tree, prealloc, &bits); 863 if (err) { 864 prealloc = NULL; 865 goto out; 866 } 867 cache_state(prealloc, cached_state); 868 merge_state(tree, prealloc); 869 prealloc = NULL; 870 goto out; 871 } 872 873 goto search_again; 874 875 out: 876 spin_unlock(&tree->lock); 877 if (prealloc) 878 free_extent_state(prealloc); 879 880 return err; 881 882 search_again: 883 if (start > end) 884 goto out; 885 spin_unlock(&tree->lock); 886 if (mask & __GFP_WAIT) 887 
cond_resched(); 888 goto again; 889 } 890 891 /* wrappers around set/clear extent bit */ 892 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 893 gfp_t mask) 894 { 895 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, 896 NULL, mask); 897 } 898 899 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 900 int bits, gfp_t mask) 901 { 902 return set_extent_bit(tree, start, end, bits, 0, NULL, 903 NULL, mask); 904 } 905 906 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 907 int bits, gfp_t mask) 908 { 909 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); 910 } 911 912 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 913 struct extent_state **cached_state, gfp_t mask) 914 { 915 return set_extent_bit(tree, start, end, 916 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 917 0, NULL, cached_state, mask); 918 } 919 920 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 921 gfp_t mask) 922 { 923 return clear_extent_bit(tree, start, end, 924 EXTENT_DIRTY | EXTENT_DELALLOC | 925 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); 926 } 927 928 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 929 gfp_t mask) 930 { 931 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, 932 NULL, mask); 933 } 934 935 static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 936 gfp_t mask) 937 { 938 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, 939 NULL, mask); 940 } 941 942 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 943 gfp_t mask) 944 { 945 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, 946 NULL, mask); 947 } 948 949 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 950 u64 end, struct extent_state **cached_state, 951 gfp_t mask) 952 { 953 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 954 cached_state, mask); 955 } 956 
957 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) 958 { 959 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); 960 } 961 962 /* 963 * either insert or lock state struct between start and end use mask to tell 964 * us if waiting is desired. 965 */ 966 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 967 int bits, struct extent_state **cached_state, gfp_t mask) 968 { 969 int err; 970 u64 failed_start; 971 while (1) { 972 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, 973 EXTENT_LOCKED, &failed_start, 974 cached_state, mask); 975 if (err == -EEXIST && (mask & __GFP_WAIT)) { 976 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 977 start = failed_start; 978 } else { 979 break; 980 } 981 WARN_ON(start > end); 982 } 983 return err; 984 } 985 986 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) 987 { 988 return lock_extent_bits(tree, start, end, 0, NULL, mask); 989 } 990 991 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 992 gfp_t mask) 993 { 994 int err; 995 u64 failed_start; 996 997 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 998 &failed_start, NULL, mask); 999 if (err == -EEXIST) { 1000 if (failed_start > start) 1001 clear_extent_bit(tree, start, failed_start - 1, 1002 EXTENT_LOCKED, 1, 0, NULL, mask); 1003 return 0; 1004 } 1005 return 1; 1006 } 1007 1008 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, 1009 struct extent_state **cached, gfp_t mask) 1010 { 1011 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, 1012 mask); 1013 } 1014 1015 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1016 gfp_t mask) 1017 { 1018 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1019 mask); 1020 } 1021 1022 /* 1023 * helper function to set pages and extents in the tree dirty 1024 */ 1025 int set_range_dirty(struct extent_io_tree *tree, u64 start, 
u64 end) 1026 { 1027 unsigned long index = start >> PAGE_CACHE_SHIFT; 1028 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1029 struct page *page; 1030 1031 while (index <= end_index) { 1032 page = find_get_page(tree->mapping, index); 1033 BUG_ON(!page); 1034 __set_page_dirty_nobuffers(page); 1035 page_cache_release(page); 1036 index++; 1037 } 1038 return 0; 1039 } 1040 1041 /* 1042 * helper function to set both pages and extents in the tree writeback 1043 */ 1044 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1045 { 1046 unsigned long index = start >> PAGE_CACHE_SHIFT; 1047 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1048 struct page *page; 1049 1050 while (index <= end_index) { 1051 page = find_get_page(tree->mapping, index); 1052 BUG_ON(!page); 1053 set_page_writeback(page); 1054 page_cache_release(page); 1055 index++; 1056 } 1057 return 0; 1058 } 1059 1060 /* 1061 * find the first offset in the io tree with 'bits' set. zero is 1062 * returned if we find something, and *start_ret and *end_ret are 1063 * set to reflect the state struct that was found. 1064 * 1065 * If nothing was found, 1 is returned, < 0 on error 1066 */ 1067 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1068 u64 *start_ret, u64 *end_ret, int bits) 1069 { 1070 struct rb_node *node; 1071 struct extent_state *state; 1072 int ret = 1; 1073 1074 spin_lock(&tree->lock); 1075 /* 1076 * this search will find all the extents that end after 1077 * our range starts. 
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits)) {
			*start_ret = state->start;
			*end_ret = state->end;
			ret = 0;
			break;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

/* find the first state struct with 'bits' set after 'start', and
 * return it.  tree->lock must be held.  NULL will be returned if
 * nothing was found after 'start'
 *
 * No reference is taken on the returned state here.
 */
struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
						 u64 start, int bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		/* any one of 'bits' being set is enough to match */
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range,
 *
 * The number of contiguous delalloc extent states found is returned,
 * 0 if the first state at or after *start is not delalloc (or the tree
 * search found nothing).
 *
 * When something is found, *start is moved to the start of the first
 * state, *end to the end of the last one, and *cached_state is set to the
 * first state with an extra reference taken on it.  When nothing is found,
 * *cached_state is left untouched and *end is set to the end of the
 * non-delalloc state that stopped the search, or (u64)-1 if the search
 * returned no node at all.
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
					u64 *start, u64 *end, u64 max_bytes,
					struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 found = 0;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node) {
		if (!found)
			*end = (u64)-1;
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		/*
		 * once we have at least one state, stop at the first gap
		 * or at a state flagged EXTENT_BOUNDARY
		 */
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			/* first hit: report it and hand a reference back */
			*start = state->start;
			*cached_state = state;
			atomic_inc(&state->refs);
		}
		found++;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		if (!node)
			break;
		/* 'state' is still the one just accepted, not the next node */
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return found;
}

/*
 * unlock the page cache pages covering [start, end], looked up in batches
 * of up to 16.  locked_page is never unlocked here, but the reference
 * taken by the lookup is dropped for every page, locked_page included.
 *
 * Always returns 0.
 */
static noinline int __unlock_for_delalloc(struct inode *inode,
					  struct page *locked_page,
					  u64 start, u64 end)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;

	/* the range is just locked_page, nothing for us to unlock */
	if (index == locked_page->index && end_index == index)
		return 0;

	while (nr_pages > 0) {
		/*
		 * NOTE(review): a lookup that returns 0 pages makes no
		 * progress in this loop; presumably callers only pass
		 * ranges whose pages are still present -- confirm.
		 */
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long, nr_pages,
				     ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {
			if (pages[i] != locked_page)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
	return 0;
}

/*
 * lock every page cache page covering [delalloc_start, delalloc_end]
 * except locked_page, which the caller already holds.
 *
 * Returns 0 when all pages were locked.  Returns -EAGAIN if a lookup found
 * no pages, or if a page turned out to be clean or to belong to another
 * mapping once locked; in that case the pages locked so far are unlocked
 * again before returning.
 */
static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 delalloc_start,
					u64 delalloc_end)
{
	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
	unsigned long start_index = index;
	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
	unsigned long pages_locked = 0;
	struct page *pages[16];
	unsigned long nrpages;
	int ret;
	int i;

	/* the caller is responsible for locking the start index */
	if (index == locked_page->index && index == end_index)
		return 0;

	/*
	 * the count covers the whole range, start index included;
	 * locked_page itself is skipped inside the loop below
	 */
	nrpages = end_index - index + 1;
	while (nrpages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nrpages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			ret = -EAGAIN;
			goto done;
		}
		/* now we have an array of pages, lock them all */
		for (i = 0; i < ret; i++) {
			/*
			 * the caller is taking responsibility for
			 * locked_page
			 */
			if (pages[i] != locked_page) {
				lock_page(pages[i]);
				if (!PageDirty(pages[i]) ||
				    pages[i]->mapping != inode->i_mapping) {
					ret = -EAGAIN;
					unlock_page(pages[i]);
					page_cache_release(pages[i]);
					goto done;
				}
			}
			page_cache_release(pages[i]);
			pages_locked++;
		}
		nrpages -= ret;
		index += ret;
		cond_resched();
	}
	ret = 0;
done:
	/* on failure, undo the locks on the pages counted so far */
	if (ret && pages_locked) {
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start,
			      ((u64)(start_index + pages_locked - 1)) <<
			      PAGE_CACHE_SHIFT);
	}
	return ret;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.
start and end are used to return the range, 1286 * 1287 * 1 is returned if we find something, 0 if nothing was in the tree 1288 */ 1289 static noinline u64 find_lock_delalloc_range(struct inode *inode, 1290 struct extent_io_tree *tree, 1291 struct page *locked_page, 1292 u64 *start, u64 *end, 1293 u64 max_bytes) 1294 { 1295 u64 delalloc_start; 1296 u64 delalloc_end; 1297 u64 found; 1298 struct extent_state *cached_state = NULL; 1299 int ret; 1300 int loops = 0; 1301 1302 again: 1303 /* step one, find a bunch of delalloc bytes starting at start */ 1304 delalloc_start = *start; 1305 delalloc_end = 0; 1306 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1307 max_bytes, &cached_state); 1308 if (!found || delalloc_end <= *start) { 1309 *start = delalloc_start; 1310 *end = delalloc_end; 1311 free_extent_state(cached_state); 1312 return found; 1313 } 1314 1315 /* 1316 * start comes from the offset of locked_page. We have to lock 1317 * pages in order, so we can't process delalloc bytes before 1318 * locked_page 1319 */ 1320 if (delalloc_start < *start) 1321 delalloc_start = *start; 1322 1323 /* 1324 * make sure to limit the number of pages we try to lock down 1325 * if we're looping. 
1326 */ 1327 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) 1328 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; 1329 1330 /* step two, lock all the pages after the page that has start */ 1331 ret = lock_delalloc_pages(inode, locked_page, 1332 delalloc_start, delalloc_end); 1333 if (ret == -EAGAIN) { 1334 /* some of the pages are gone, lets avoid looping by 1335 * shortening the size of the delalloc range we're searching 1336 */ 1337 free_extent_state(cached_state); 1338 if (!loops) { 1339 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); 1340 max_bytes = PAGE_CACHE_SIZE - offset; 1341 loops = 1; 1342 goto again; 1343 } else { 1344 found = 0; 1345 goto out_failed; 1346 } 1347 } 1348 BUG_ON(ret); 1349 1350 /* step three, lock the state bits for the whole range */ 1351 lock_extent_bits(tree, delalloc_start, delalloc_end, 1352 0, &cached_state, GFP_NOFS); 1353 1354 /* then test to make sure it is all still delalloc */ 1355 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1356 EXTENT_DELALLOC, 1, cached_state); 1357 if (!ret) { 1358 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1359 &cached_state, GFP_NOFS); 1360 __unlock_for_delalloc(inode, locked_page, 1361 delalloc_start, delalloc_end); 1362 cond_resched(); 1363 goto again; 1364 } 1365 free_extent_state(cached_state); 1366 *start = delalloc_start; 1367 *end = delalloc_end; 1368 out_failed: 1369 return found; 1370 } 1371 1372 int extent_clear_unlock_delalloc(struct inode *inode, 1373 struct extent_io_tree *tree, 1374 u64 start, u64 end, struct page *locked_page, 1375 unsigned long op) 1376 { 1377 int ret; 1378 struct page *pages[16]; 1379 unsigned long index = start >> PAGE_CACHE_SHIFT; 1380 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1381 unsigned long nr_pages = end_index - index + 1; 1382 int i; 1383 int clear_bits = 0; 1384 1385 if (op & EXTENT_CLEAR_UNLOCK) 1386 clear_bits |= EXTENT_LOCKED; 1387 if (op & EXTENT_CLEAR_DIRTY) 1388 clear_bits |= EXTENT_DIRTY; 1389 
1390 if (op & EXTENT_CLEAR_DELALLOC) 1391 clear_bits |= EXTENT_DELALLOC; 1392 1393 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1394 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1395 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1396 EXTENT_SET_PRIVATE2))) 1397 return 0; 1398 1399 while (nr_pages > 0) { 1400 ret = find_get_pages_contig(inode->i_mapping, index, 1401 min_t(unsigned long, 1402 nr_pages, ARRAY_SIZE(pages)), pages); 1403 for (i = 0; i < ret; i++) { 1404 1405 if (op & EXTENT_SET_PRIVATE2) 1406 SetPagePrivate2(pages[i]); 1407 1408 if (pages[i] == locked_page) { 1409 page_cache_release(pages[i]); 1410 continue; 1411 } 1412 if (op & EXTENT_CLEAR_DIRTY) 1413 clear_page_dirty_for_io(pages[i]); 1414 if (op & EXTENT_SET_WRITEBACK) 1415 set_page_writeback(pages[i]); 1416 if (op & EXTENT_END_WRITEBACK) 1417 end_page_writeback(pages[i]); 1418 if (op & EXTENT_CLEAR_UNLOCK_PAGE) 1419 unlock_page(pages[i]); 1420 page_cache_release(pages[i]); 1421 } 1422 nr_pages -= ret; 1423 index += ret; 1424 cond_resched(); 1425 } 1426 return 0; 1427 } 1428 1429 /* 1430 * count the number of bytes in the tree that have a given bit(s) 1431 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1432 * cached. The total number found is returned. 1433 */ 1434 u64 count_range_bits(struct extent_io_tree *tree, 1435 u64 *start, u64 search_end, u64 max_bytes, 1436 unsigned long bits) 1437 { 1438 struct rb_node *node; 1439 struct extent_state *state; 1440 u64 cur_start = *start; 1441 u64 total_bytes = 0; 1442 int found = 0; 1443 1444 if (search_end <= cur_start) { 1445 WARN_ON(1); 1446 return 0; 1447 } 1448 1449 spin_lock(&tree->lock); 1450 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1451 total_bytes = tree->dirty_bytes; 1452 goto out; 1453 } 1454 /* 1455 * this search will find all the extents that end after 1456 * our range starts. 
1457 */ 1458 node = tree_search(tree, cur_start); 1459 if (!node) 1460 goto out; 1461 1462 while (1) { 1463 state = rb_entry(node, struct extent_state, rb_node); 1464 if (state->start > search_end) 1465 break; 1466 if (state->end >= cur_start && (state->state & bits)) { 1467 total_bytes += min(search_end, state->end) + 1 - 1468 max(cur_start, state->start); 1469 if (total_bytes >= max_bytes) 1470 break; 1471 if (!found) { 1472 *start = state->start; 1473 found = 1; 1474 } 1475 } 1476 node = rb_next(node); 1477 if (!node) 1478 break; 1479 } 1480 out: 1481 spin_unlock(&tree->lock); 1482 return total_bytes; 1483 } 1484 1485 /* 1486 * set the private field for a given byte offset in the tree. If there isn't 1487 * an extent_state there already, this does nothing. 1488 */ 1489 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1490 { 1491 struct rb_node *node; 1492 struct extent_state *state; 1493 int ret = 0; 1494 1495 spin_lock(&tree->lock); 1496 /* 1497 * this search will find all the extents that end after 1498 * our range starts. 1499 */ 1500 node = tree_search(tree, start); 1501 if (!node) { 1502 ret = -ENOENT; 1503 goto out; 1504 } 1505 state = rb_entry(node, struct extent_state, rb_node); 1506 if (state->start != start) { 1507 ret = -ENOENT; 1508 goto out; 1509 } 1510 state->private = private; 1511 out: 1512 spin_unlock(&tree->lock); 1513 return ret; 1514 } 1515 1516 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1517 { 1518 struct rb_node *node; 1519 struct extent_state *state; 1520 int ret = 0; 1521 1522 spin_lock(&tree->lock); 1523 /* 1524 * this search will find all the extents that end after 1525 * our range starts. 
1526 */ 1527 node = tree_search(tree, start); 1528 if (!node) { 1529 ret = -ENOENT; 1530 goto out; 1531 } 1532 state = rb_entry(node, struct extent_state, rb_node); 1533 if (state->start != start) { 1534 ret = -ENOENT; 1535 goto out; 1536 } 1537 *private = state->private; 1538 out: 1539 spin_unlock(&tree->lock); 1540 return ret; 1541 } 1542 1543 /* 1544 * searches a range in the state tree for a given mask. 1545 * If 'filled' == 1, this returns 1 only if every extent in the tree 1546 * has the bits set. Otherwise, 1 is returned if any bit in the 1547 * range is found set. 1548 */ 1549 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1550 int bits, int filled, struct extent_state *cached) 1551 { 1552 struct extent_state *state = NULL; 1553 struct rb_node *node; 1554 int bitset = 0; 1555 1556 spin_lock(&tree->lock); 1557 if (cached && cached->tree && cached->start == start) 1558 node = &cached->rb_node; 1559 else 1560 node = tree_search(tree, start); 1561 while (node && start <= end) { 1562 state = rb_entry(node, struct extent_state, rb_node); 1563 1564 if (filled && state->start > start) { 1565 bitset = 0; 1566 break; 1567 } 1568 1569 if (state->start > end) 1570 break; 1571 1572 if (state->state & bits) { 1573 bitset = 1; 1574 if (!filled) 1575 break; 1576 } else if (filled) { 1577 bitset = 0; 1578 break; 1579 } 1580 1581 if (state->end == (u64)-1) 1582 break; 1583 1584 start = state->end + 1; 1585 if (start > end) 1586 break; 1587 node = rb_next(node); 1588 if (!node) { 1589 if (filled) 1590 bitset = 0; 1591 break; 1592 } 1593 } 1594 spin_unlock(&tree->lock); 1595 return bitset; 1596 } 1597 1598 /* 1599 * helper function to set a given page up to date if all the 1600 * extents in the tree for that page are up to date 1601 */ 1602 static int check_page_uptodate(struct extent_io_tree *tree, 1603 struct page *page) 1604 { 1605 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1606 u64 end = start + PAGE_CACHE_SIZE - 1; 1607 if (test_range_bit(tree, 
start, end, EXTENT_UPTODATE, 1, NULL)) 1608 SetPageUptodate(page); 1609 return 0; 1610 } 1611 1612 /* 1613 * helper function to unlock a page if all the extents in the tree 1614 * for that page are unlocked 1615 */ 1616 static int check_page_locked(struct extent_io_tree *tree, 1617 struct page *page) 1618 { 1619 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1620 u64 end = start + PAGE_CACHE_SIZE - 1; 1621 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) 1622 unlock_page(page); 1623 return 0; 1624 } 1625 1626 /* 1627 * helper function to end page writeback if all the extents 1628 * in the tree for that page are done with writeback 1629 */ 1630 static int check_page_writeback(struct extent_io_tree *tree, 1631 struct page *page) 1632 { 1633 end_page_writeback(page); 1634 return 0; 1635 } 1636 1637 /* lots and lots of room for performance fixes in the end_bio funcs */ 1638 1639 /* 1640 * after a writepage IO is done, we need to: 1641 * clear the uptodate bits on error 1642 * clear the writeback bits in the extent tree for this IO 1643 * end_page_writeback if the page has no more pending IO 1644 * 1645 * Scheduling is not allowed, so the extent state tree is expected 1646 * to have one and only one object corresponding to this IO. 
 *
 * The bio's pages are walked from the last bio_vec back to the first,
 * and the reference on the bio is dropped at the end.
 */
static void end_bio_extent_writepage(struct bio *bio, int err)
{
	int uptodate = err == 0;
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_io_tree *tree;
	u64 start;
	u64 end;
	int whole_page;
	int ret;

	do {
		struct page *page = bvec->bv_page;
		tree = &BTRFS_I(page->mapping->host)->io_tree;

		/* byte range of the file covered by this bio_vec */
		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			 bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;

		/*
		 * step to the previous bio_vec now; 'page', 'start' and
		 * 'end' keep describing the current one
		 */
		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		if (tree->ops && tree->ops->writepage_end_io_hook) {
			ret = tree->ops->writepage_end_io_hook(page, start,
						       end, NULL, uptodate);
			if (ret)
				uptodate = 0;
		}

		if (!uptodate && tree->ops &&
		    tree->ops->writepage_io_failed_hook) {
			ret = tree->ops->writepage_io_failed_hook(bio, page,
							 start, end, NULL);
			if (ret == 0) {
				/*
				 * the hook took care of this page: reset the
				 * status from the bio error and move on to
				 * the next bio_vec without touching the page
				 */
				uptodate = (err == 0);
				continue;
			}
		}

		if (!uptodate) {
			clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
			ClearPageUptodate(page);
			SetPageError(page);
		}

		if (whole_page)
			end_page_writeback(page);
		else
			check_page_writeback(tree, page);
	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);
}

/*
 * after a readpage IO is done, we need to:
 * clear the uptodate bits on error
 * set the uptodate bits if things worked
 * set the page up to date if all extents in the tree are uptodate
 * clear the lock bit in the extent tree
 * unlock the page if there are no other extents locked for it
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 *
 * The bio's pages are walked from the first bio_vec to the last, and
 * the reference on the bio is dropped at the end.
 */
static void end_bio_extent_readpage(struct bio *bio, int err)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct bio_vec *bvec = bio->bi_io_vec;
	struct extent_io_tree *tree;
	u64 start;
	u64 end;
	int whole_page;
	int ret;

	if (err)
		uptodate = 0;

	do {
		struct page *page = bvec->bv_page;
		tree = &BTRFS_I(page->mapping->host)->io_tree;

		/* byte range of the file covered by this bio_vec */
		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;

		/*
		 * step to the next bio_vec now; 'page', 'start' and 'end'
		 * keep describing the current one
		 */
		if (++bvec <= bvec_end)
			prefetchw(&bvec->bv_page->flags);

		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
			ret = tree->ops->readpage_end_io_hook(page, start, end,
							      NULL);
			if (ret)
				uptodate = 0;
		}
		if (!uptodate && tree->ops &&
		    tree->ops->readpage_io_failed_hook) {
			ret = tree->ops->readpage_io_failed_hook(bio, page,
							 start, end, NULL);
			if (ret == 0) {
				/*
				 * the hook took care of this page: reset the
				 * status from the bio and move on to the
				 * next bio_vec without unlocking anything
				 */
				uptodate =
					test_bit(BIO_UPTODATE, &bio->bi_flags);
				if (err)
					uptodate = 0;
				continue;
			}
		}

		if (uptodate) {
			set_extent_uptodate(tree, start, end,
					    GFP_ATOMIC);
		}
		unlock_extent(tree, start, end, GFP_ATOMIC);

		if (whole_page) {
			if (uptodate) {
				SetPageUptodate(page);
			} else {
				ClearPageUptodate(page);
				SetPageError(page);
			}
			unlock_page(page);
		} else {
			/*
			 * partial page: the page flags depend on the state
			 * of the other extents covering the page
			 */
			if (uptodate) {
				check_page_uptodate(tree, page);
			} else {
				ClearPageUptodate(page);
				SetPageError(page);
			}
			check_page_locked(tree, page);
		}
	} while (bvec <= bvec_end);

	bio_put(bio);
}

/*
 * IO done from prepare_write is pretty simple, we just unlock
 * the structs in the extent tree
when done, and set the uptodate bits 1796 * as appropriate. 1797 */ 1798 static void end_bio_extent_preparewrite(struct bio *bio, int err) 1799 { 1800 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1801 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1802 struct extent_io_tree *tree; 1803 u64 start; 1804 u64 end; 1805 1806 do { 1807 struct page *page = bvec->bv_page; 1808 tree = &BTRFS_I(page->mapping->host)->io_tree; 1809 1810 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1811 bvec->bv_offset; 1812 end = start + bvec->bv_len - 1; 1813 1814 if (--bvec >= bio->bi_io_vec) 1815 prefetchw(&bvec->bv_page->flags); 1816 1817 if (uptodate) { 1818 set_extent_uptodate(tree, start, end, GFP_ATOMIC); 1819 } else { 1820 ClearPageUptodate(page); 1821 SetPageError(page); 1822 } 1823 1824 unlock_extent(tree, start, end, GFP_ATOMIC); 1825 1826 } while (bvec >= bio->bi_io_vec); 1827 1828 bio_put(bio); 1829 } 1830 1831 static struct bio * 1832 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 1833 gfp_t gfp_flags) 1834 { 1835 struct bio *bio; 1836 1837 bio = bio_alloc(gfp_flags, nr_vecs); 1838 1839 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 1840 while (!bio && (nr_vecs /= 2)) 1841 bio = bio_alloc(gfp_flags, nr_vecs); 1842 } 1843 1844 if (bio) { 1845 bio->bi_size = 0; 1846 bio->bi_bdev = bdev; 1847 bio->bi_sector = first_sector; 1848 } 1849 return bio; 1850 } 1851 1852 static int submit_one_bio(int rw, struct bio *bio, int mirror_num, 1853 unsigned long bio_flags) 1854 { 1855 int ret = 0; 1856 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1857 struct page *page = bvec->bv_page; 1858 struct extent_io_tree *tree = bio->bi_private; 1859 u64 start; 1860 1861 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1862 1863 bio->bi_private = NULL; 1864 1865 bio_get(bio); 1866 1867 if (tree->ops && tree->ops->submit_bio_hook) 1868 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1869 mirror_num, 
bio_flags, start); 1870 else 1871 submit_bio(rw, bio); 1872 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1873 ret = -EOPNOTSUPP; 1874 bio_put(bio); 1875 return ret; 1876 } 1877 1878 static int submit_extent_page(int rw, struct extent_io_tree *tree, 1879 struct page *page, sector_t sector, 1880 size_t size, unsigned long offset, 1881 struct block_device *bdev, 1882 struct bio **bio_ret, 1883 unsigned long max_pages, 1884 bio_end_io_t end_io_func, 1885 int mirror_num, 1886 unsigned long prev_bio_flags, 1887 unsigned long bio_flags) 1888 { 1889 int ret = 0; 1890 struct bio *bio; 1891 int nr; 1892 int contig = 0; 1893 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; 1894 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 1895 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); 1896 1897 if (bio_ret && *bio_ret) { 1898 bio = *bio_ret; 1899 if (old_compressed) 1900 contig = bio->bi_sector == sector; 1901 else 1902 contig = bio->bi_sector + (bio->bi_size >> 9) == 1903 sector; 1904 1905 if (prev_bio_flags != bio_flags || !contig || 1906 (tree->ops && tree->ops->merge_bio_hook && 1907 tree->ops->merge_bio_hook(page, offset, page_size, bio, 1908 bio_flags)) || 1909 bio_add_page(bio, page, page_size, offset) < page_size) { 1910 ret = submit_one_bio(rw, bio, mirror_num, 1911 prev_bio_flags); 1912 bio = NULL; 1913 } else { 1914 return 0; 1915 } 1916 } 1917 if (this_compressed) 1918 nr = BIO_MAX_PAGES; 1919 else 1920 nr = bio_get_nr_vecs(bdev); 1921 1922 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1923 1924 bio_add_page(bio, page, page_size, offset); 1925 bio->bi_end_io = end_io_func; 1926 bio->bi_private = tree; 1927 1928 if (bio_ret) 1929 *bio_ret = bio; 1930 else 1931 ret = submit_one_bio(rw, bio, mirror_num, bio_flags); 1932 1933 return ret; 1934 } 1935 1936 void set_page_extent_mapped(struct page *page) 1937 { 1938 if (!PagePrivate(page)) { 1939 SetPagePrivate(page); 1940 page_cache_get(page); 1941 set_page_private(page, 
EXTENT_PAGE_PRIVATE); 1942 } 1943 } 1944 1945 static void set_page_extent_head(struct page *page, unsigned long len) 1946 { 1947 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 1948 } 1949 1950 /* 1951 * basic readpage implementation. Locked extent state structs are inserted 1952 * into the tree that are removed when the IO is done (by the end_io 1953 * handlers) 1954 */ 1955 static int __extent_read_full_page(struct extent_io_tree *tree, 1956 struct page *page, 1957 get_extent_t *get_extent, 1958 struct bio **bio, int mirror_num, 1959 unsigned long *bio_flags) 1960 { 1961 struct inode *inode = page->mapping->host; 1962 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1963 u64 page_end = start + PAGE_CACHE_SIZE - 1; 1964 u64 end; 1965 u64 cur = start; 1966 u64 extent_offset; 1967 u64 last_byte = i_size_read(inode); 1968 u64 block_start; 1969 u64 cur_end; 1970 sector_t sector; 1971 struct extent_map *em; 1972 struct block_device *bdev; 1973 struct btrfs_ordered_extent *ordered; 1974 int ret; 1975 int nr = 0; 1976 size_t page_offset = 0; 1977 size_t iosize; 1978 size_t disk_io_size; 1979 size_t blocksize = inode->i_sb->s_blocksize; 1980 unsigned long this_bio_flag = 0; 1981 1982 set_page_extent_mapped(page); 1983 1984 end = page_end; 1985 while (1) { 1986 lock_extent(tree, start, end, GFP_NOFS); 1987 ordered = btrfs_lookup_ordered_extent(inode, start); 1988 if (!ordered) 1989 break; 1990 unlock_extent(tree, start, end, GFP_NOFS); 1991 btrfs_start_ordered_extent(inode, ordered, 1); 1992 btrfs_put_ordered_extent(ordered); 1993 } 1994 1995 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 1996 char *userpage; 1997 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); 1998 1999 if (zero_offset) { 2000 iosize = PAGE_CACHE_SIZE - zero_offset; 2001 userpage = kmap_atomic(page, KM_USER0); 2002 memset(userpage + zero_offset, 0, iosize); 2003 flush_dcache_page(page); 2004 kunmap_atomic(userpage, KM_USER0); 2005 } 2006 } 2007 while (cur <= end) { 2008 if 
(cur >= last_byte) { 2009 char *userpage; 2010 iosize = PAGE_CACHE_SIZE - page_offset; 2011 userpage = kmap_atomic(page, KM_USER0); 2012 memset(userpage + page_offset, 0, iosize); 2013 flush_dcache_page(page); 2014 kunmap_atomic(userpage, KM_USER0); 2015 set_extent_uptodate(tree, cur, cur + iosize - 1, 2016 GFP_NOFS); 2017 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2018 break; 2019 } 2020 em = get_extent(inode, page, page_offset, cur, 2021 end - cur + 1, 0); 2022 if (IS_ERR(em) || !em) { 2023 SetPageError(page); 2024 unlock_extent(tree, cur, end, GFP_NOFS); 2025 break; 2026 } 2027 extent_offset = cur - em->start; 2028 BUG_ON(extent_map_end(em) <= cur); 2029 BUG_ON(end < cur); 2030 2031 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2032 this_bio_flag = EXTENT_BIO_COMPRESSED; 2033 2034 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2035 cur_end = min(extent_map_end(em) - 1, end); 2036 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2037 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2038 disk_io_size = em->block_len; 2039 sector = em->block_start >> 9; 2040 } else { 2041 sector = (em->block_start + extent_offset) >> 9; 2042 disk_io_size = iosize; 2043 } 2044 bdev = em->bdev; 2045 block_start = em->block_start; 2046 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2047 block_start = EXTENT_MAP_HOLE; 2048 free_extent_map(em); 2049 em = NULL; 2050 2051 /* we've found a hole, just zero and go on */ 2052 if (block_start == EXTENT_MAP_HOLE) { 2053 char *userpage; 2054 userpage = kmap_atomic(page, KM_USER0); 2055 memset(userpage + page_offset, 0, iosize); 2056 flush_dcache_page(page); 2057 kunmap_atomic(userpage, KM_USER0); 2058 2059 set_extent_uptodate(tree, cur, cur + iosize - 1, 2060 GFP_NOFS); 2061 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2062 cur = cur + iosize; 2063 page_offset += iosize; 2064 continue; 2065 } 2066 /* the get_extent function already copied into the page */ 2067 if (test_range_bit(tree, cur, cur_end, 2068 
EXTENT_UPTODATE, 1, NULL)) { 2069 check_page_uptodate(tree, page); 2070 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2071 cur = cur + iosize; 2072 page_offset += iosize; 2073 continue; 2074 } 2075 /* we have an inline extent but it didn't get marked up 2076 * to date. Error out 2077 */ 2078 if (block_start == EXTENT_MAP_INLINE) { 2079 SetPageError(page); 2080 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2081 cur = cur + iosize; 2082 page_offset += iosize; 2083 continue; 2084 } 2085 2086 ret = 0; 2087 if (tree->ops && tree->ops->readpage_io_hook) { 2088 ret = tree->ops->readpage_io_hook(page, cur, 2089 cur + iosize - 1); 2090 } 2091 if (!ret) { 2092 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2093 pnr -= page->index; 2094 ret = submit_extent_page(READ, tree, page, 2095 sector, disk_io_size, page_offset, 2096 bdev, bio, pnr, 2097 end_bio_extent_readpage, mirror_num, 2098 *bio_flags, 2099 this_bio_flag); 2100 nr++; 2101 *bio_flags = this_bio_flag; 2102 } 2103 if (ret) 2104 SetPageError(page); 2105 cur = cur + iosize; 2106 page_offset += iosize; 2107 } 2108 if (!nr) { 2109 if (!PageError(page)) 2110 SetPageUptodate(page); 2111 unlock_page(page); 2112 } 2113 return 0; 2114 } 2115 2116 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2117 get_extent_t *get_extent) 2118 { 2119 struct bio *bio = NULL; 2120 unsigned long bio_flags = 0; 2121 int ret; 2122 2123 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2124 &bio_flags); 2125 if (bio) 2126 submit_one_bio(READ, bio, 0, bio_flags); 2127 return ret; 2128 } 2129 2130 static noinline void update_nr_written(struct page *page, 2131 struct writeback_control *wbc, 2132 unsigned long nr_written) 2133 { 2134 wbc->nr_to_write -= nr_written; 2135 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 2136 wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 2137 page->mapping->writeback_index = page->index + nr_written; 2138 } 2139 2140 /* 2141 * the writepage 
 * semantics are similar to regular writepage.  extent
 * records are inserted to lock ranges in the tree, and as dirty areas
 * are found, they are marked writeback.  Then the lock bits are removed
 * and the end_io handler clears the writeback ranges
 *
 * 'data' is the struct extent_page_data for this writeback pass; bios
 * are accumulated in epd->bio and left for the caller to submit.
 *
 * Always returns 0; errors are reported through SetPageError().
 */
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
			      void *data)
{
	struct inode *inode = page->mapping->host;
	struct extent_page_data *epd = data;
	struct extent_io_tree *tree = epd->tree;
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 delalloc_start;
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	u64 iosize;
	sector_t sector;
	struct extent_state *cached_state = NULL;
	struct extent_map *em;
	struct block_device *bdev;
	int ret;
	int nr = 0;
	size_t pg_offset = 0;
	size_t blocksize;
	loff_t i_size = i_size_read(inode);
	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
	u64 nr_delalloc;
	u64 delalloc_end;
	int page_started;
	int compressed;
	int write_flags;
	unsigned long nr_written = 0;

	if (wbc->sync_mode == WB_SYNC_ALL)
		write_flags = WRITE_SYNC_PLUG;
	else
		write_flags = WRITE;

	WARN_ON(!PageLocked(page));
	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
	/* the page is entirely past i_size: invalidate it, nothing to write */
	if (page->index > end_index ||
	   (page->index == end_index && !pg_offset)) {
		page->mapping->a_ops->invalidatepage(page, 0);
		unlock_page(page);
		return 0;
	}

	/* the page straddles i_size: zero the part past i_size */
	if (page->index == end_index) {
		char *userpage;

		userpage = kmap_atomic(page, KM_USER0);
		memset(userpage + pg_offset, 0,
		       PAGE_CACHE_SIZE - pg_offset);
		kunmap_atomic(userpage, KM_USER0);
		flush_dcache_page(page);
	}
	pg_offset = 0;

	set_page_extent_mapped(page);

	delalloc_start = start;
	delalloc_end = 0;
	page_started = 0;
	if (!epd->extent_locked) {
		u64 delalloc_to_write = 0;
		/*
		 * make sure the wbc mapping index is at least updated
		 * to this page.
		 */
		update_nr_written(page, wbc, 0);

		/*
		 * hand every delalloc range that touches this page to
		 * the fill_delalloc hook
		 */
		while (delalloc_end < page_end) {
			nr_delalloc = find_lock_delalloc_range(inode, tree,
						       page,
						       &delalloc_start,
						       &delalloc_end,
						       128 * 1024 * 1024);
			if (nr_delalloc == 0) {
				delalloc_start = delalloc_end + 1;
				continue;
			}
			tree->ops->fill_delalloc(inode, page, delalloc_start,
						 delalloc_end, &page_started,
						 &nr_written);
			/*
			 * delalloc_end is already one less than the total
			 * length, so we don't subtract one from
			 * PAGE_CACHE_SIZE
			 */
			delalloc_to_write += (delalloc_end - delalloc_start +
					      PAGE_CACHE_SIZE) >>
					      PAGE_CACHE_SHIFT;
			delalloc_start = delalloc_end + 1;
		}
		/*
		 * raise nr_to_write so the delalloc pages just set up can
		 * be written, limited to 8192 pages once delalloc_to_write
		 * reaches twice that
		 */
		if (wbc->nr_to_write < delalloc_to_write) {
			int thresh = 8192;

			if (delalloc_to_write < thresh * 2)
				thresh = delalloc_to_write;
			wbc->nr_to_write = min_t(u64, delalloc_to_write,
						 thresh);
		}

		/* did the fill delalloc function already unlock and start
		 * the IO?
		 */
		if (page_started) {
			ret = 0;
			/*
			 * we've unlocked the page, so we can't update
			 * the mapping's writeback index, just update
			 * nr_to_write.
			 */
			wbc->nr_to_write -= nr_written;
			goto done_unlocked;
		}
	}
	if (tree->ops && tree->ops->writepage_start_hook) {
		ret = tree->ops->writepage_start_hook(page, start,
						      page_end);
		if (ret == -EAGAIN) {
			/* the hook asked us to retry later, keep it dirty */
			redirty_page_for_writepage(wbc, page);
			update_nr_written(page, wbc, nr_written);
			unlock_page(page);
			ret = 0;
			goto done_unlocked;
		}
	}

	/*
	 * we don't want to touch the inode after unlocking the page,
	 * so we update the mapping writeback index now
	 */
	update_nr_written(page, wbc, nr_written + 1);

	end = page_end;
	if (last_byte <= start) {
		if (tree->ops && tree->ops->writepage_end_io_hook)
			tree->ops->writepage_end_io_hook(page, start,
							 page_end, NULL, 1);
		goto done;
	}

	blocksize = inode->i_sb->s_blocksize;

	while (cur <= end) {
		if (cur >= last_byte) {
			if (tree->ops && tree->ops->writepage_end_io_hook)
				tree->ops->writepage_end_io_hook(page, cur,
							 page_end, NULL, 1);
			break;
		}
		em = epd->get_extent(inode, page, pg_offset, cur,
				     end - cur + 1, 1);
		if (IS_ERR(em) || !em) {
			SetPageError(page);
			break;
		}

		extent_offset = cur - em->start;
		BUG_ON(extent_map_end(em) <= cur);
		BUG_ON(end < cur);
		/* IO size is the rest of the extent or page, block aligned */
		iosize = min(extent_map_end(em) - cur, end - cur + 1);
		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
		sector = (em->block_start + extent_offset) >> 9;
		bdev = em->bdev;
		block_start = em->block_start;
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		free_extent_map(em);
		em = NULL;

		/*
		 * compressed and inline extents are written through other
		 * paths in the FS
		 */
		if (compressed || block_start == EXTENT_MAP_HOLE ||
		    block_start == EXTENT_MAP_INLINE) {
			/*
			 * end_io notification does not happen here for
			 * compressed extents
			 */
			if (!compressed && tree->ops &&
			    tree->ops->writepage_end_io_hook)
				tree->ops->writepage_end_io_hook(page, cur,
							 cur + iosize - 1,
							 NULL, 1);
			else if (compressed) {
				/* we don't want to end_page_writeback on
				 * a compressed extent.  this happens
				 * elsewhere
				 */
				nr++;
			}

			cur += iosize;
			pg_offset += iosize;
			continue;
		}
		/* leave this out until we have a page_mkwrite call */
		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
				   EXTENT_DIRTY, 0, NULL)) {
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}

		if (tree->ops && tree->ops->writepage_io_hook) {
			ret = tree->ops->writepage_io_hook(page, cur,
						cur + iosize - 1);
		} else {
			ret = 0;
		}
		if (ret) {
			SetPageError(page);
		} else {
			unsigned long max_nr = end_index + 1;

			set_range_writeback(tree, cur, cur + iosize - 1);
			if (!PageWriteback(page)) {
				printk(KERN_ERR "btrfs warning page %lu not "
				       "writeback, cur %llu end %llu\n",
				       page->index, (unsigned long long)cur,
				       (unsigned long long)end);
			}

			ret = submit_extent_page(write_flags, tree, page,
						 sector, iosize, pg_offset,
						 bdev, &epd->bio, max_nr,
						 end_bio_extent_writepage,
						 0, 0, 0);
			if (ret)
				SetPageError(page);
		}
		cur = cur + iosize;
		pg_offset += iosize;
		nr++;
	}
done:
	if (nr == 0) {
		/* make sure the mapping tag for page dirty gets cleared */
		set_page_writeback(page);
		end_page_writeback(page);
	}
	unlock_page(page);

done_unlocked:

	/* drop our reference on any cached states */
	free_extent_state(cached_state);
	return 0;
}

/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2399 * @mapping: address space structure to write 2400 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2401 * @writepage: function called for each page 2402 * @data: data passed to writepage function 2403 * 2404 * If a page is already under I/O, write_cache_pages() skips it, even 2405 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 2406 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 2407 * and msync() need to guarantee that all the data which was dirty at the time 2408 * the call was made get new I/O started against them. If wbc->sync_mode is 2409 * WB_SYNC_ALL then we were called for data integrity and we must wait for 2410 * existing IO to complete. 2411 */ 2412 static int extent_write_cache_pages(struct extent_io_tree *tree, 2413 struct address_space *mapping, 2414 struct writeback_control *wbc, 2415 writepage_t writepage, void *data, 2416 void (*flush_fn)(void *)) 2417 { 2418 int ret = 0; 2419 int done = 0; 2420 int nr_to_write_done = 0; 2421 struct pagevec pvec; 2422 int nr_pages; 2423 pgoff_t index; 2424 pgoff_t end; /* Inclusive */ 2425 int scanned = 0; 2426 2427 pagevec_init(&pvec, 0); 2428 if (wbc->range_cyclic) { 2429 index = mapping->writeback_index; /* Start from prev offset */ 2430 end = -1; 2431 } else { 2432 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2433 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2434 scanned = 1; 2435 } 2436 retry: 2437 while (!done && !nr_to_write_done && (index <= end) && 2438 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2439 PAGECACHE_TAG_DIRTY, min(end - index, 2440 (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 2441 unsigned i; 2442 2443 scanned = 1; 2444 for (i = 0; i < nr_pages; i++) { 2445 struct page *page = pvec.pages[i]; 2446 2447 /* 2448 * At this point we hold neither mapping->tree_lock nor 2449 * lock on the page itself: the page may be truncated or 2450 * invalidated (changing page->mapping to NULL), or even 2451 * swizzled back from 
swapper_space to tmpfs file 2452 * mapping 2453 */ 2454 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2455 tree->ops->write_cache_pages_lock_hook(page); 2456 else 2457 lock_page(page); 2458 2459 if (unlikely(page->mapping != mapping)) { 2460 unlock_page(page); 2461 continue; 2462 } 2463 2464 if (!wbc->range_cyclic && page->index > end) { 2465 done = 1; 2466 unlock_page(page); 2467 continue; 2468 } 2469 2470 if (wbc->sync_mode != WB_SYNC_NONE) { 2471 if (PageWriteback(page)) 2472 flush_fn(data); 2473 wait_on_page_writeback(page); 2474 } 2475 2476 if (PageWriteback(page) || 2477 !clear_page_dirty_for_io(page)) { 2478 unlock_page(page); 2479 continue; 2480 } 2481 2482 ret = (*writepage)(page, wbc, data); 2483 2484 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 2485 unlock_page(page); 2486 ret = 0; 2487 } 2488 if (ret) 2489 done = 1; 2490 2491 /* 2492 * the filesystem may choose to bump up nr_to_write. 2493 * We have to make sure to honor the new nr_to_write 2494 * at any time 2495 */ 2496 nr_to_write_done = wbc->nr_to_write <= 0; 2497 } 2498 pagevec_release(&pvec); 2499 cond_resched(); 2500 } 2501 if (!scanned && !done) { 2502 /* 2503 * We hit the last page and there is more work to be done: wrap 2504 * back to the start of the file 2505 */ 2506 scanned = 1; 2507 index = 0; 2508 goto retry; 2509 } 2510 return ret; 2511 } 2512 2513 static void flush_epd_write_bio(struct extent_page_data *epd) 2514 { 2515 if (epd->bio) { 2516 if (epd->sync_io) 2517 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); 2518 else 2519 submit_one_bio(WRITE, epd->bio, 0, 0); 2520 epd->bio = NULL; 2521 } 2522 } 2523 2524 static noinline void flush_write_bio(void *data) 2525 { 2526 struct extent_page_data *epd = data; 2527 flush_epd_write_bio(epd); 2528 } 2529 2530 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 2531 get_extent_t *get_extent, 2532 struct writeback_control *wbc) 2533 { 2534 int ret; 2535 struct address_space *mapping = page->mapping; 2536 struct 
extent_page_data epd = { 2537 .bio = NULL, 2538 .tree = tree, 2539 .get_extent = get_extent, 2540 .extent_locked = 0, 2541 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2542 }; 2543 struct writeback_control wbc_writepages = { 2544 .sync_mode = wbc->sync_mode, 2545 .older_than_this = NULL, 2546 .nr_to_write = 64, 2547 .range_start = page_offset(page) + PAGE_CACHE_SIZE, 2548 .range_end = (loff_t)-1, 2549 }; 2550 2551 ret = __extent_writepage(page, wbc, &epd); 2552 2553 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2554 __extent_writepage, &epd, flush_write_bio); 2555 flush_epd_write_bio(&epd); 2556 return ret; 2557 } 2558 2559 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 2560 u64 start, u64 end, get_extent_t *get_extent, 2561 int mode) 2562 { 2563 int ret = 0; 2564 struct address_space *mapping = inode->i_mapping; 2565 struct page *page; 2566 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 2567 PAGE_CACHE_SHIFT; 2568 2569 struct extent_page_data epd = { 2570 .bio = NULL, 2571 .tree = tree, 2572 .get_extent = get_extent, 2573 .extent_locked = 1, 2574 .sync_io = mode == WB_SYNC_ALL, 2575 }; 2576 struct writeback_control wbc_writepages = { 2577 .sync_mode = mode, 2578 .older_than_this = NULL, 2579 .nr_to_write = nr_pages * 2, 2580 .range_start = start, 2581 .range_end = end + 1, 2582 }; 2583 2584 while (start <= end) { 2585 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 2586 if (clear_page_dirty_for_io(page)) 2587 ret = __extent_writepage(page, &wbc_writepages, &epd); 2588 else { 2589 if (tree->ops && tree->ops->writepage_end_io_hook) 2590 tree->ops->writepage_end_io_hook(page, start, 2591 start + PAGE_CACHE_SIZE - 1, 2592 NULL, 1); 2593 unlock_page(page); 2594 } 2595 page_cache_release(page); 2596 start += PAGE_CACHE_SIZE; 2597 } 2598 2599 flush_epd_write_bio(&epd); 2600 return ret; 2601 } 2602 2603 int extent_writepages(struct extent_io_tree *tree, 2604 struct address_space *mapping, 2605 get_extent_t 
*get_extent, 2606 struct writeback_control *wbc) 2607 { 2608 int ret = 0; 2609 struct extent_page_data epd = { 2610 .bio = NULL, 2611 .tree = tree, 2612 .get_extent = get_extent, 2613 .extent_locked = 0, 2614 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2615 }; 2616 2617 ret = extent_write_cache_pages(tree, mapping, wbc, 2618 __extent_writepage, &epd, 2619 flush_write_bio); 2620 flush_epd_write_bio(&epd); 2621 return ret; 2622 } 2623 2624 int extent_readpages(struct extent_io_tree *tree, 2625 struct address_space *mapping, 2626 struct list_head *pages, unsigned nr_pages, 2627 get_extent_t get_extent) 2628 { 2629 struct bio *bio = NULL; 2630 unsigned page_idx; 2631 unsigned long bio_flags = 0; 2632 2633 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2634 struct page *page = list_entry(pages->prev, struct page, lru); 2635 2636 prefetchw(&page->flags); 2637 list_del(&page->lru); 2638 if (!add_to_page_cache_lru(page, mapping, 2639 page->index, GFP_KERNEL)) { 2640 __extent_read_full_page(tree, page, get_extent, 2641 &bio, 0, &bio_flags); 2642 } 2643 page_cache_release(page); 2644 } 2645 BUG_ON(!list_empty(pages)); 2646 if (bio) 2647 submit_one_bio(READ, bio, 0, bio_flags); 2648 return 0; 2649 } 2650 2651 /* 2652 * basic invalidatepage code, this waits on any locked or writeback 2653 * ranges corresponding to the page, and then deletes any extent state 2654 * records from the tree 2655 */ 2656 int extent_invalidatepage(struct extent_io_tree *tree, 2657 struct page *page, unsigned long offset) 2658 { 2659 struct extent_state *cached_state = NULL; 2660 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 2661 u64 end = start + PAGE_CACHE_SIZE - 1; 2662 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 2663 2664 start += (offset + blocksize - 1) & ~(blocksize - 1); 2665 if (start > end) 2666 return 0; 2667 2668 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); 2669 wait_on_page_writeback(page); 2670 clear_extent_bit(tree, start, end, 2671 
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 2672 EXTENT_DO_ACCOUNTING, 2673 1, 1, &cached_state, GFP_NOFS); 2674 return 0; 2675 } 2676 2677 /* 2678 * simple commit_write call, set_range_dirty is used to mark both 2679 * the pages and the extent records as dirty 2680 */ 2681 int extent_commit_write(struct extent_io_tree *tree, 2682 struct inode *inode, struct page *page, 2683 unsigned from, unsigned to) 2684 { 2685 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; 2686 2687 set_page_extent_mapped(page); 2688 set_page_dirty(page); 2689 2690 if (pos > inode->i_size) { 2691 i_size_write(inode, pos); 2692 mark_inode_dirty(inode); 2693 } 2694 return 0; 2695 } 2696 2697 int extent_prepare_write(struct extent_io_tree *tree, 2698 struct inode *inode, struct page *page, 2699 unsigned from, unsigned to, get_extent_t *get_extent) 2700 { 2701 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 2702 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 2703 u64 block_start; 2704 u64 orig_block_start; 2705 u64 block_end; 2706 u64 cur_end; 2707 struct extent_map *em; 2708 unsigned blocksize = 1 << inode->i_blkbits; 2709 size_t page_offset = 0; 2710 size_t block_off_start; 2711 size_t block_off_end; 2712 int err = 0; 2713 int iocount = 0; 2714 int ret = 0; 2715 int isnew; 2716 2717 set_page_extent_mapped(page); 2718 2719 block_start = (page_start + from) & ~((u64)blocksize - 1); 2720 block_end = (page_start + to - 1) | (blocksize - 1); 2721 orig_block_start = block_start; 2722 2723 lock_extent(tree, page_start, page_end, GFP_NOFS); 2724 while (block_start <= block_end) { 2725 em = get_extent(inode, page, page_offset, block_start, 2726 block_end - block_start + 1, 1); 2727 if (IS_ERR(em) || !em) 2728 goto err; 2729 2730 cur_end = min(block_end, extent_map_end(em) - 1); 2731 block_off_start = block_start & (PAGE_CACHE_SIZE - 1); 2732 block_off_end = block_off_start + blocksize; 2733 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); 2734 2735 if 
(!PageUptodate(page) && isnew && 2736 (block_off_end > to || block_off_start < from)) { 2737 void *kaddr; 2738 2739 kaddr = kmap_atomic(page, KM_USER0); 2740 if (block_off_end > to) 2741 memset(kaddr + to, 0, block_off_end - to); 2742 if (block_off_start < from) 2743 memset(kaddr + block_off_start, 0, 2744 from - block_off_start); 2745 flush_dcache_page(page); 2746 kunmap_atomic(kaddr, KM_USER0); 2747 } 2748 if ((em->block_start != EXTENT_MAP_HOLE && 2749 em->block_start != EXTENT_MAP_INLINE) && 2750 !isnew && !PageUptodate(page) && 2751 (block_off_end > to || block_off_start < from) && 2752 !test_range_bit(tree, block_start, cur_end, 2753 EXTENT_UPTODATE, 1, NULL)) { 2754 u64 sector; 2755 u64 extent_offset = block_start - em->start; 2756 size_t iosize; 2757 sector = (em->block_start + extent_offset) >> 9; 2758 iosize = (cur_end - block_start + blocksize) & 2759 ~((u64)blocksize - 1); 2760 /* 2761 * we've already got the extent locked, but we 2762 * need to split the state such that our end_bio 2763 * handler can clear the lock. 
2764 */ 2765 set_extent_bit(tree, block_start, 2766 block_start + iosize - 1, 2767 EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); 2768 ret = submit_extent_page(READ, tree, page, 2769 sector, iosize, page_offset, em->bdev, 2770 NULL, 1, 2771 end_bio_extent_preparewrite, 0, 2772 0, 0); 2773 if (ret && !err) 2774 err = ret; 2775 iocount++; 2776 block_start = block_start + iosize; 2777 } else { 2778 set_extent_uptodate(tree, block_start, cur_end, 2779 GFP_NOFS); 2780 unlock_extent(tree, block_start, cur_end, GFP_NOFS); 2781 block_start = cur_end + 1; 2782 } 2783 page_offset = block_start & (PAGE_CACHE_SIZE - 1); 2784 free_extent_map(em); 2785 } 2786 if (iocount) { 2787 wait_extent_bit(tree, orig_block_start, 2788 block_end, EXTENT_LOCKED); 2789 } 2790 check_page_uptodate(tree, page); 2791 err: 2792 /* FIXME, zero out newly allocated blocks on error */ 2793 return err; 2794 } 2795 2796 /* 2797 * a helper for releasepage, this tests for areas of the page that 2798 * are locked or under IO and drops the related state bits if it is safe 2799 * to drop the page. 2800 */ 2801 int try_release_extent_state(struct extent_map_tree *map, 2802 struct extent_io_tree *tree, struct page *page, 2803 gfp_t mask) 2804 { 2805 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2806 u64 end = start + PAGE_CACHE_SIZE - 1; 2807 int ret = 1; 2808 2809 if (test_range_bit(tree, start, end, 2810 EXTENT_IOBITS, 0, NULL)) 2811 ret = 0; 2812 else { 2813 if ((mask & GFP_NOFS) == GFP_NOFS) 2814 mask = GFP_NOFS; 2815 /* 2816 * at this point we can safely clear everything except the 2817 * locked bit and the nodatasum bit 2818 */ 2819 clear_extent_bit(tree, start, end, 2820 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2821 0, 0, NULL, mask); 2822 } 2823 return ret; 2824 } 2825 2826 /* 2827 * a helper for releasepage. 
As long as there are no locked extents 2828 * in the range corresponding to the page, both state records and extent 2829 * map records are removed 2830 */ 2831 int try_release_extent_mapping(struct extent_map_tree *map, 2832 struct extent_io_tree *tree, struct page *page, 2833 gfp_t mask) 2834 { 2835 struct extent_map *em; 2836 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2837 u64 end = start + PAGE_CACHE_SIZE - 1; 2838 2839 if ((mask & __GFP_WAIT) && 2840 page->mapping->host->i_size > 16 * 1024 * 1024) { 2841 u64 len; 2842 while (start <= end) { 2843 len = end - start + 1; 2844 write_lock(&map->lock); 2845 em = lookup_extent_mapping(map, start, len); 2846 if (!em || IS_ERR(em)) { 2847 write_unlock(&map->lock); 2848 break; 2849 } 2850 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 2851 em->start != start) { 2852 write_unlock(&map->lock); 2853 free_extent_map(em); 2854 break; 2855 } 2856 if (!test_range_bit(tree, em->start, 2857 extent_map_end(em) - 1, 2858 EXTENT_LOCKED | EXTENT_WRITEBACK, 2859 0, NULL)) { 2860 remove_extent_mapping(map, em); 2861 /* once for the rb tree */ 2862 free_extent_map(em); 2863 } 2864 start = extent_map_end(em); 2865 write_unlock(&map->lock); 2866 2867 /* once for us */ 2868 free_extent_map(em); 2869 } 2870 } 2871 return try_release_extent_state(map, tree, page, mask); 2872 } 2873 2874 sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 2875 get_extent_t *get_extent) 2876 { 2877 struct inode *inode = mapping->host; 2878 struct extent_state *cached_state = NULL; 2879 u64 start = iblock << inode->i_blkbits; 2880 sector_t sector = 0; 2881 size_t blksize = (1 << inode->i_blkbits); 2882 struct extent_map *em; 2883 2884 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, 2885 0, &cached_state, GFP_NOFS); 2886 em = get_extent(inode, NULL, 0, start, blksize, 0); 2887 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, 2888 start + blksize - 1, &cached_state, GFP_NOFS); 2889 if (!em || IS_ERR(em)) 2890 
return 0; 2891 2892 if (em->block_start > EXTENT_MAP_LAST_BYTE) 2893 goto out; 2894 2895 sector = (em->block_start + start - em->start) >> inode->i_blkbits; 2896 out: 2897 free_extent_map(em); 2898 return sector; 2899 } 2900 2901 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2902 __u64 start, __u64 len, get_extent_t *get_extent) 2903 { 2904 int ret; 2905 u64 off = start; 2906 u64 max = start + len; 2907 u32 flags = 0; 2908 u64 disko = 0; 2909 struct extent_map *em = NULL; 2910 struct extent_state *cached_state = NULL; 2911 int end = 0; 2912 u64 em_start = 0, em_len = 0; 2913 unsigned long emflags; 2914 ret = 0; 2915 2916 if (len == 0) 2917 return -EINVAL; 2918 2919 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 2920 &cached_state, GFP_NOFS); 2921 em = get_extent(inode, NULL, 0, off, max - off, 0); 2922 if (!em) 2923 goto out; 2924 if (IS_ERR(em)) { 2925 ret = PTR_ERR(em); 2926 goto out; 2927 } 2928 while (!end) { 2929 off = em->start + em->len; 2930 if (off >= max) 2931 end = 1; 2932 2933 em_start = em->start; 2934 em_len = em->len; 2935 2936 disko = 0; 2937 flags = 0; 2938 2939 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 2940 end = 1; 2941 flags |= FIEMAP_EXTENT_LAST; 2942 } else if (em->block_start == EXTENT_MAP_HOLE) { 2943 flags |= FIEMAP_EXTENT_UNWRITTEN; 2944 } else if (em->block_start == EXTENT_MAP_INLINE) { 2945 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2946 FIEMAP_EXTENT_NOT_ALIGNED); 2947 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 2948 flags |= (FIEMAP_EXTENT_DELALLOC | 2949 FIEMAP_EXTENT_UNKNOWN); 2950 } else { 2951 disko = em->block_start; 2952 } 2953 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2954 flags |= FIEMAP_EXTENT_ENCODED; 2955 2956 emflags = em->flags; 2957 free_extent_map(em); 2958 em = NULL; 2959 2960 if (!end) { 2961 em = get_extent(inode, NULL, 0, off, max - off, 0); 2962 if (!em) 2963 goto out; 2964 if (IS_ERR(em)) { 2965 ret = PTR_ERR(em); 2966 goto out; 2967 } 2968 emflags = 
em->flags; 2969 } 2970 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { 2971 flags |= FIEMAP_EXTENT_LAST; 2972 end = 1; 2973 } 2974 2975 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 2976 em_len, flags); 2977 if (ret) 2978 goto out_free; 2979 } 2980 out_free: 2981 free_extent_map(em); 2982 out: 2983 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, 2984 &cached_state, GFP_NOFS); 2985 return ret; 2986 } 2987 2988 static inline struct page *extent_buffer_page(struct extent_buffer *eb, 2989 unsigned long i) 2990 { 2991 struct page *p; 2992 struct address_space *mapping; 2993 2994 if (i == 0) 2995 return eb->first_page; 2996 i += eb->start >> PAGE_CACHE_SHIFT; 2997 mapping = eb->first_page->mapping; 2998 if (!mapping) 2999 return NULL; 3000 3001 /* 3002 * extent_buffer_page is only called after pinning the page 3003 * by increasing the reference count. So we know the page must 3004 * be in the radix tree. 3005 */ 3006 rcu_read_lock(); 3007 p = radix_tree_lookup(&mapping->page_tree, i); 3008 rcu_read_unlock(); 3009 3010 return p; 3011 } 3012 3013 static inline unsigned long num_extent_pages(u64 start, u64 len) 3014 { 3015 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3016 (start >> PAGE_CACHE_SHIFT); 3017 } 3018 3019 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, 3020 u64 start, 3021 unsigned long len, 3022 gfp_t mask) 3023 { 3024 struct extent_buffer *eb = NULL; 3025 #if LEAK_DEBUG 3026 unsigned long flags; 3027 #endif 3028 3029 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 3030 eb->start = start; 3031 eb->len = len; 3032 spin_lock_init(&eb->lock); 3033 init_waitqueue_head(&eb->lock_wq); 3034 3035 #if LEAK_DEBUG 3036 spin_lock_irqsave(&leak_lock, flags); 3037 list_add(&eb->leak_list, &buffers); 3038 spin_unlock_irqrestore(&leak_lock, flags); 3039 #endif 3040 atomic_set(&eb->refs, 1); 3041 3042 return eb; 3043 } 3044 3045 static void __free_extent_buffer(struct extent_buffer *eb) 3046 { 
3047 #if LEAK_DEBUG 3048 unsigned long flags; 3049 spin_lock_irqsave(&leak_lock, flags); 3050 list_del(&eb->leak_list); 3051 spin_unlock_irqrestore(&leak_lock, flags); 3052 #endif 3053 kmem_cache_free(extent_buffer_cache, eb); 3054 } 3055 3056 /* 3057 * Helper for releasing extent buffer page. 3058 */ 3059 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, 3060 unsigned long start_idx) 3061 { 3062 unsigned long index; 3063 struct page *page; 3064 3065 if (!eb->first_page) 3066 return; 3067 3068 index = num_extent_pages(eb->start, eb->len); 3069 if (start_idx >= index) 3070 return; 3071 3072 do { 3073 index--; 3074 page = extent_buffer_page(eb, index); 3075 if (page) 3076 page_cache_release(page); 3077 } while (index != start_idx); 3078 } 3079 3080 /* 3081 * Helper for releasing the extent buffer. 3082 */ 3083 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 3084 { 3085 btrfs_release_extent_buffer_page(eb, 0); 3086 __free_extent_buffer(eb); 3087 } 3088 3089 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 3090 u64 start, unsigned long len, 3091 struct page *page0, 3092 gfp_t mask) 3093 { 3094 unsigned long num_pages = num_extent_pages(start, len); 3095 unsigned long i; 3096 unsigned long index = start >> PAGE_CACHE_SHIFT; 3097 struct extent_buffer *eb; 3098 struct extent_buffer *exists = NULL; 3099 struct page *p; 3100 struct address_space *mapping = tree->mapping; 3101 int uptodate = 1; 3102 int ret; 3103 3104 rcu_read_lock(); 3105 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3106 if (eb && atomic_inc_not_zero(&eb->refs)) { 3107 rcu_read_unlock(); 3108 mark_page_accessed(eb->first_page); 3109 return eb; 3110 } 3111 rcu_read_unlock(); 3112 3113 eb = __alloc_extent_buffer(tree, start, len, mask); 3114 if (!eb) 3115 return NULL; 3116 3117 if (page0) { 3118 eb->first_page = page0; 3119 i = 1; 3120 index++; 3121 page_cache_get(page0); 3122 mark_page_accessed(page0); 3123 
set_page_extent_mapped(page0); 3124 set_page_extent_head(page0, len); 3125 uptodate = PageUptodate(page0); 3126 } else { 3127 i = 0; 3128 } 3129 for (; i < num_pages; i++, index++) { 3130 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); 3131 if (!p) { 3132 WARN_ON(1); 3133 goto free_eb; 3134 } 3135 set_page_extent_mapped(p); 3136 mark_page_accessed(p); 3137 if (i == 0) { 3138 eb->first_page = p; 3139 set_page_extent_head(p, len); 3140 } else { 3141 set_page_private(p, EXTENT_PAGE_PRIVATE); 3142 } 3143 if (!PageUptodate(p)) 3144 uptodate = 0; 3145 unlock_page(p); 3146 } 3147 if (uptodate) 3148 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3149 3150 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 3151 if (ret) 3152 goto free_eb; 3153 3154 spin_lock(&tree->buffer_lock); 3155 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); 3156 if (ret == -EEXIST) { 3157 exists = radix_tree_lookup(&tree->buffer, 3158 start >> PAGE_CACHE_SHIFT); 3159 /* add one reference for the caller */ 3160 atomic_inc(&exists->refs); 3161 spin_unlock(&tree->buffer_lock); 3162 radix_tree_preload_end(); 3163 goto free_eb; 3164 } 3165 /* add one reference for the tree */ 3166 atomic_inc(&eb->refs); 3167 spin_unlock(&tree->buffer_lock); 3168 radix_tree_preload_end(); 3169 return eb; 3170 3171 free_eb: 3172 if (!atomic_dec_and_test(&eb->refs)) 3173 return exists; 3174 btrfs_release_extent_buffer(eb); 3175 return exists; 3176 } 3177 3178 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 3179 u64 start, unsigned long len, 3180 gfp_t mask) 3181 { 3182 struct extent_buffer *eb; 3183 3184 rcu_read_lock(); 3185 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3186 if (eb && atomic_inc_not_zero(&eb->refs)) { 3187 rcu_read_unlock(); 3188 mark_page_accessed(eb->first_page); 3189 return eb; 3190 } 3191 rcu_read_unlock(); 3192 3193 return NULL; 3194 } 3195 3196 void free_extent_buffer(struct extent_buffer *eb) 3197 { 3198 if (!eb) 3199 
return; 3200 3201 if (!atomic_dec_and_test(&eb->refs)) 3202 return; 3203 3204 WARN_ON(1); 3205 } 3206 3207 int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3208 struct extent_buffer *eb) 3209 { 3210 unsigned long i; 3211 unsigned long num_pages; 3212 struct page *page; 3213 3214 num_pages = num_extent_pages(eb->start, eb->len); 3215 3216 for (i = 0; i < num_pages; i++) { 3217 page = extent_buffer_page(eb, i); 3218 if (!PageDirty(page)) 3219 continue; 3220 3221 lock_page(page); 3222 if (i == 0) 3223 set_page_extent_head(page, eb->len); 3224 else 3225 set_page_private(page, EXTENT_PAGE_PRIVATE); 3226 3227 clear_page_dirty_for_io(page); 3228 spin_lock_irq(&page->mapping->tree_lock); 3229 if (!PageDirty(page)) { 3230 radix_tree_tag_clear(&page->mapping->page_tree, 3231 page_index(page), 3232 PAGECACHE_TAG_DIRTY); 3233 } 3234 spin_unlock_irq(&page->mapping->tree_lock); 3235 unlock_page(page); 3236 } 3237 return 0; 3238 } 3239 3240 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, 3241 struct extent_buffer *eb) 3242 { 3243 return wait_on_extent_writeback(tree, eb->start, 3244 eb->start + eb->len - 1); 3245 } 3246 3247 int set_extent_buffer_dirty(struct extent_io_tree *tree, 3248 struct extent_buffer *eb) 3249 { 3250 unsigned long i; 3251 unsigned long num_pages; 3252 int was_dirty = 0; 3253 3254 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3255 num_pages = num_extent_pages(eb->start, eb->len); 3256 for (i = 0; i < num_pages; i++) 3257 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3258 return was_dirty; 3259 } 3260 3261 int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3262 struct extent_buffer *eb, 3263 struct extent_state **cached_state) 3264 { 3265 unsigned long i; 3266 struct page *page; 3267 unsigned long num_pages; 3268 3269 num_pages = num_extent_pages(eb->start, eb->len); 3270 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3271 3272 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 
3273 cached_state, GFP_NOFS); 3274 for (i = 0; i < num_pages; i++) { 3275 page = extent_buffer_page(eb, i); 3276 if (page) 3277 ClearPageUptodate(page); 3278 } 3279 return 0; 3280 } 3281 3282 int set_extent_buffer_uptodate(struct extent_io_tree *tree, 3283 struct extent_buffer *eb) 3284 { 3285 unsigned long i; 3286 struct page *page; 3287 unsigned long num_pages; 3288 3289 num_pages = num_extent_pages(eb->start, eb->len); 3290 3291 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3292 GFP_NOFS); 3293 for (i = 0; i < num_pages; i++) { 3294 page = extent_buffer_page(eb, i); 3295 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3296 ((i == num_pages - 1) && 3297 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { 3298 check_page_uptodate(tree, page); 3299 continue; 3300 } 3301 SetPageUptodate(page); 3302 } 3303 return 0; 3304 } 3305 3306 int extent_range_uptodate(struct extent_io_tree *tree, 3307 u64 start, u64 end) 3308 { 3309 struct page *page; 3310 int ret; 3311 int pg_uptodate = 1; 3312 int uptodate; 3313 unsigned long index; 3314 3315 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); 3316 if (ret) 3317 return 1; 3318 while (start <= end) { 3319 index = start >> PAGE_CACHE_SHIFT; 3320 page = find_get_page(tree->mapping, index); 3321 uptodate = PageUptodate(page); 3322 page_cache_release(page); 3323 if (!uptodate) { 3324 pg_uptodate = 0; 3325 break; 3326 } 3327 start += PAGE_CACHE_SIZE; 3328 } 3329 return pg_uptodate; 3330 } 3331 3332 int extent_buffer_uptodate(struct extent_io_tree *tree, 3333 struct extent_buffer *eb, 3334 struct extent_state *cached_state) 3335 { 3336 int ret = 0; 3337 unsigned long num_pages; 3338 unsigned long i; 3339 struct page *page; 3340 int pg_uptodate = 1; 3341 3342 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3343 return 1; 3344 3345 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3346 EXTENT_UPTODATE, 1, cached_state); 3347 if (ret) 3348 return ret; 3349 3350 num_pages = 
num_extent_pages(eb->start, eb->len); 3351 for (i = 0; i < num_pages; i++) { 3352 page = extent_buffer_page(eb, i); 3353 if (!PageUptodate(page)) { 3354 pg_uptodate = 0; 3355 break; 3356 } 3357 } 3358 return pg_uptodate; 3359 } 3360 3361 int read_extent_buffer_pages(struct extent_io_tree *tree, 3362 struct extent_buffer *eb, 3363 u64 start, int wait, 3364 get_extent_t *get_extent, int mirror_num) 3365 { 3366 unsigned long i; 3367 unsigned long start_i; 3368 struct page *page; 3369 int err; 3370 int ret = 0; 3371 int locked_pages = 0; 3372 int all_uptodate = 1; 3373 int inc_all_pages = 0; 3374 unsigned long num_pages; 3375 struct bio *bio = NULL; 3376 unsigned long bio_flags = 0; 3377 3378 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3379 return 0; 3380 3381 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3382 EXTENT_UPTODATE, 1, NULL)) { 3383 return 0; 3384 } 3385 3386 if (start) { 3387 WARN_ON(start < eb->start); 3388 start_i = (start >> PAGE_CACHE_SHIFT) - 3389 (eb->start >> PAGE_CACHE_SHIFT); 3390 } else { 3391 start_i = 0; 3392 } 3393 3394 num_pages = num_extent_pages(eb->start, eb->len); 3395 for (i = start_i; i < num_pages; i++) { 3396 page = extent_buffer_page(eb, i); 3397 if (!wait) { 3398 if (!trylock_page(page)) 3399 goto unlock_exit; 3400 } else { 3401 lock_page(page); 3402 } 3403 locked_pages++; 3404 if (!PageUptodate(page)) 3405 all_uptodate = 0; 3406 } 3407 if (all_uptodate) { 3408 if (start_i == 0) 3409 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3410 goto unlock_exit; 3411 } 3412 3413 for (i = start_i; i < num_pages; i++) { 3414 page = extent_buffer_page(eb, i); 3415 if (inc_all_pages) 3416 page_cache_get(page); 3417 if (!PageUptodate(page)) { 3418 if (start_i == 0) 3419 inc_all_pages = 1; 3420 ClearPageError(page); 3421 err = __extent_read_full_page(tree, page, 3422 get_extent, &bio, 3423 mirror_num, &bio_flags); 3424 if (err) 3425 ret = err; 3426 } else { 3427 unlock_page(page); 3428 } 3429 } 3430 3431 if (bio) 3432 
submit_one_bio(READ, bio, mirror_num, bio_flags); 3433 3434 if (ret || !wait) 3435 return ret; 3436 3437 for (i = start_i; i < num_pages; i++) { 3438 page = extent_buffer_page(eb, i); 3439 wait_on_page_locked(page); 3440 if (!PageUptodate(page)) 3441 ret = -EIO; 3442 } 3443 3444 if (!ret) 3445 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3446 return ret; 3447 3448 unlock_exit: 3449 i = start_i; 3450 while (locked_pages > 0) { 3451 page = extent_buffer_page(eb, i); 3452 i++; 3453 unlock_page(page); 3454 locked_pages--; 3455 } 3456 return ret; 3457 } 3458 3459 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 3460 unsigned long start, 3461 unsigned long len) 3462 { 3463 size_t cur; 3464 size_t offset; 3465 struct page *page; 3466 char *kaddr; 3467 char *dst = (char *)dstv; 3468 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3469 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3470 3471 WARN_ON(start > eb->len); 3472 WARN_ON(start + len > eb->start + eb->len); 3473 3474 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3475 3476 while (len > 0) { 3477 page = extent_buffer_page(eb, i); 3478 3479 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3480 kaddr = kmap_atomic(page, KM_USER1); 3481 memcpy(dst, kaddr + offset, cur); 3482 kunmap_atomic(kaddr, KM_USER1); 3483 3484 dst += cur; 3485 len -= cur; 3486 offset = 0; 3487 i++; 3488 } 3489 } 3490 3491 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 3492 unsigned long min_len, char **token, char **map, 3493 unsigned long *map_start, 3494 unsigned long *map_len, int km) 3495 { 3496 size_t offset = start & (PAGE_CACHE_SIZE - 1); 3497 char *kaddr; 3498 struct page *p; 3499 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3500 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3501 unsigned long end_i = (start_offset + start + min_len - 1) >> 3502 PAGE_CACHE_SHIFT; 3503 3504 if (i != end_i) 3505 return -EINVAL; 3506 
3507 if (i == 0) { 3508 offset = start_offset; 3509 *map_start = 0; 3510 } else { 3511 offset = 0; 3512 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; 3513 } 3514 3515 if (start + min_len > eb->len) { 3516 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 3517 "wanted %lu %lu\n", (unsigned long long)eb->start, 3518 eb->len, start, min_len); 3519 WARN_ON(1); 3520 } 3521 3522 p = extent_buffer_page(eb, i); 3523 kaddr = kmap_atomic(p, km); 3524 *token = kaddr; 3525 *map = kaddr + offset; 3526 *map_len = PAGE_CACHE_SIZE - offset; 3527 return 0; 3528 } 3529 3530 int map_extent_buffer(struct extent_buffer *eb, unsigned long start, 3531 unsigned long min_len, 3532 char **token, char **map, 3533 unsigned long *map_start, 3534 unsigned long *map_len, int km) 3535 { 3536 int err; 3537 int save = 0; 3538 if (eb->map_token) { 3539 unmap_extent_buffer(eb, eb->map_token, km); 3540 eb->map_token = NULL; 3541 save = 1; 3542 } 3543 err = map_private_extent_buffer(eb, start, min_len, token, map, 3544 map_start, map_len, km); 3545 if (!err && save) { 3546 eb->map_token = *token; 3547 eb->kaddr = *map; 3548 eb->map_start = *map_start; 3549 eb->map_len = *map_len; 3550 } 3551 return err; 3552 } 3553 3554 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) 3555 { 3556 kunmap_atomic(token, km); 3557 } 3558 3559 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 3560 unsigned long start, 3561 unsigned long len) 3562 { 3563 size_t cur; 3564 size_t offset; 3565 struct page *page; 3566 char *kaddr; 3567 char *ptr = (char *)ptrv; 3568 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3569 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3570 int ret = 0; 3571 3572 WARN_ON(start > eb->len); 3573 WARN_ON(start + len > eb->start + eb->len); 3574 3575 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3576 3577 while (len > 0) { 3578 page = extent_buffer_page(eb, i); 3579 3580 cur = min(len, 
(PAGE_CACHE_SIZE - offset)); 3581 3582 kaddr = kmap_atomic(page, KM_USER0); 3583 ret = memcmp(ptr, kaddr + offset, cur); 3584 kunmap_atomic(kaddr, KM_USER0); 3585 if (ret) 3586 break; 3587 3588 ptr += cur; 3589 len -= cur; 3590 offset = 0; 3591 i++; 3592 } 3593 return ret; 3594 } 3595 3596 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 3597 unsigned long start, unsigned long len) 3598 { 3599 size_t cur; 3600 size_t offset; 3601 struct page *page; 3602 char *kaddr; 3603 char *src = (char *)srcv; 3604 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3605 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3606 3607 WARN_ON(start > eb->len); 3608 WARN_ON(start + len > eb->start + eb->len); 3609 3610 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3611 3612 while (len > 0) { 3613 page = extent_buffer_page(eb, i); 3614 WARN_ON(!PageUptodate(page)); 3615 3616 cur = min(len, PAGE_CACHE_SIZE - offset); 3617 kaddr = kmap_atomic(page, KM_USER1); 3618 memcpy(kaddr + offset, src, cur); 3619 kunmap_atomic(kaddr, KM_USER1); 3620 3621 src += cur; 3622 len -= cur; 3623 offset = 0; 3624 i++; 3625 } 3626 } 3627 3628 void memset_extent_buffer(struct extent_buffer *eb, char c, 3629 unsigned long start, unsigned long len) 3630 { 3631 size_t cur; 3632 size_t offset; 3633 struct page *page; 3634 char *kaddr; 3635 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3636 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3637 3638 WARN_ON(start > eb->len); 3639 WARN_ON(start + len > eb->start + eb->len); 3640 3641 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3642 3643 while (len > 0) { 3644 page = extent_buffer_page(eb, i); 3645 WARN_ON(!PageUptodate(page)); 3646 3647 cur = min(len, PAGE_CACHE_SIZE - offset); 3648 kaddr = kmap_atomic(page, KM_USER0); 3649 memset(kaddr + offset, c, cur); 3650 kunmap_atomic(kaddr, KM_USER0); 3651 3652 len -= cur; 3653 offset = 0; 3654 i++; 
3655 } 3656 } 3657 3658 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 3659 unsigned long dst_offset, unsigned long src_offset, 3660 unsigned long len) 3661 { 3662 u64 dst_len = dst->len; 3663 size_t cur; 3664 size_t offset; 3665 struct page *page; 3666 char *kaddr; 3667 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3668 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3669 3670 WARN_ON(src->len != dst_len); 3671 3672 offset = (start_offset + dst_offset) & 3673 ((unsigned long)PAGE_CACHE_SIZE - 1); 3674 3675 while (len > 0) { 3676 page = extent_buffer_page(dst, i); 3677 WARN_ON(!PageUptodate(page)); 3678 3679 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 3680 3681 kaddr = kmap_atomic(page, KM_USER0); 3682 read_extent_buffer(src, kaddr + offset, src_offset, cur); 3683 kunmap_atomic(kaddr, KM_USER0); 3684 3685 src_offset += cur; 3686 len -= cur; 3687 offset = 0; 3688 i++; 3689 } 3690 } 3691 3692 static void move_pages(struct page *dst_page, struct page *src_page, 3693 unsigned long dst_off, unsigned long src_off, 3694 unsigned long len) 3695 { 3696 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3697 if (dst_page == src_page) { 3698 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); 3699 } else { 3700 char *src_kaddr = kmap_atomic(src_page, KM_USER1); 3701 char *p = dst_kaddr + dst_off + len; 3702 char *s = src_kaddr + src_off + len; 3703 3704 while (len--) 3705 *--p = *--s; 3706 3707 kunmap_atomic(src_kaddr, KM_USER1); 3708 } 3709 kunmap_atomic(dst_kaddr, KM_USER0); 3710 } 3711 3712 static void copy_pages(struct page *dst_page, struct page *src_page, 3713 unsigned long dst_off, unsigned long src_off, 3714 unsigned long len) 3715 { 3716 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3717 char *src_kaddr; 3718 3719 if (dst_page != src_page) 3720 src_kaddr = kmap_atomic(src_page, KM_USER1); 3721 else 3722 src_kaddr = dst_kaddr; 3723 3724 memcpy(dst_kaddr + dst_off, src_kaddr + 
src_off, len); 3725 kunmap_atomic(dst_kaddr, KM_USER0); 3726 if (dst_page != src_page) 3727 kunmap_atomic(src_kaddr, KM_USER1); 3728 } 3729 3730 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3731 unsigned long src_offset, unsigned long len) 3732 { 3733 size_t cur; 3734 size_t dst_off_in_page; 3735 size_t src_off_in_page; 3736 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3737 unsigned long dst_i; 3738 unsigned long src_i; 3739 3740 if (src_offset + len > dst->len) { 3741 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3742 "len %lu dst len %lu\n", src_offset, len, dst->len); 3743 BUG_ON(1); 3744 } 3745 if (dst_offset + len > dst->len) { 3746 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3747 "len %lu dst len %lu\n", dst_offset, len, dst->len); 3748 BUG_ON(1); 3749 } 3750 3751 while (len > 0) { 3752 dst_off_in_page = (start_offset + dst_offset) & 3753 ((unsigned long)PAGE_CACHE_SIZE - 1); 3754 src_off_in_page = (start_offset + src_offset) & 3755 ((unsigned long)PAGE_CACHE_SIZE - 1); 3756 3757 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3758 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; 3759 3760 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - 3761 src_off_in_page)); 3762 cur = min_t(unsigned long, cur, 3763 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 3764 3765 copy_pages(extent_buffer_page(dst, dst_i), 3766 extent_buffer_page(dst, src_i), 3767 dst_off_in_page, src_off_in_page, cur); 3768 3769 src_offset += cur; 3770 dst_offset += cur; 3771 len -= cur; 3772 } 3773 } 3774 3775 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3776 unsigned long src_offset, unsigned long len) 3777 { 3778 size_t cur; 3779 size_t dst_off_in_page; 3780 size_t src_off_in_page; 3781 unsigned long dst_end = dst_offset + len - 1; 3782 unsigned long src_end = src_offset + len - 1; 3783 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3784 
unsigned long dst_i; 3785 unsigned long src_i; 3786 3787 if (src_offset + len > dst->len) { 3788 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3789 "len %lu len %lu\n", src_offset, len, dst->len); 3790 BUG_ON(1); 3791 } 3792 if (dst_offset + len > dst->len) { 3793 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3794 "len %lu len %lu\n", dst_offset, len, dst->len); 3795 BUG_ON(1); 3796 } 3797 if (dst_offset < src_offset) { 3798 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 3799 return; 3800 } 3801 while (len > 0) { 3802 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; 3803 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; 3804 3805 dst_off_in_page = (start_offset + dst_end) & 3806 ((unsigned long)PAGE_CACHE_SIZE - 1); 3807 src_off_in_page = (start_offset + src_end) & 3808 ((unsigned long)PAGE_CACHE_SIZE - 1); 3809 3810 cur = min_t(unsigned long, len, src_off_in_page + 1); 3811 cur = min(cur, dst_off_in_page + 1); 3812 move_pages(extent_buffer_page(dst, dst_i), 3813 extent_buffer_page(dst, src_i), 3814 dst_off_in_page - cur + 1, 3815 src_off_in_page - cur + 1, cur); 3816 3817 dst_end -= cur; 3818 src_end -= cur; 3819 len -= cur; 3820 } 3821 } 3822 3823 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 3824 { 3825 struct extent_buffer *eb = 3826 container_of(head, struct extent_buffer, rcu_head); 3827 3828 btrfs_release_extent_buffer(eb); 3829 } 3830 3831 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) 3832 { 3833 u64 start = page_offset(page); 3834 struct extent_buffer *eb; 3835 int ret = 1; 3836 3837 spin_lock(&tree->buffer_lock); 3838 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3839 if (!eb) 3840 goto out; 3841 3842 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3843 ret = 0; 3844 goto out; 3845 } 3846 3847 /* 3848 * set @eb->refs to 0 if it is already 1, and then release the @eb. 3849 * Or go back. 
3850 */ 3851 if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { 3852 ret = 0; 3853 goto out; 3854 } 3855 3856 radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3857 out: 3858 spin_unlock(&tree->buffer_lock); 3859 3860 /* at this point we can safely release the extent buffer */ 3861 if (atomic_read(&eb->refs) == 0) 3862 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 3863 return ret; 3864 } 3865