#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
#include "compat.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;

static LIST_HEAD(buffers);
static LIST_HEAD(states);

#define LEAK_DEBUG 0
#if LEAK_DEBUG
static DEFINE_SPINLOCK(leak_lock);
#endif

#define BUFFER_LRU_MAX 64

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	get_extent_t *get_extent;

	/* tells writepage not to lock the state bits for this range;
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use a WRITE_SYNC */
	unsigned int sync_io:1;
};

int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("extent_state",
			sizeof(struct extent_state), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("extent_buffers",
			sizeof(struct extent_buffer), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;
	return 0;

free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	return -ENOMEM;
}

void extent_io_exit(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
		       "state %lu in tree %p refs %d\n",
		       (unsigned long long)state->start,
		       (unsigned long long)state->end,
		       state->state, state->tree, atomic_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);

	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
		       "refs %d\n", (unsigned long long)eb->start,
		       eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
	if (extent_buffer_cache)
		kmem_cache_destroy(extent_buffer_cache);
}

void extent_io_tree_init(struct extent_io_tree *tree,
			 struct address_space *mapping)
{
	tree->state = RB_ROOT;
	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	spin_lock_init(&tree->buffer_lock);
	tree->mapping = mapping;
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;
#if LEAK_DEBUG
	unsigned long flags;
#endif

	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->private = 0;
	state->tree = NULL;
#if LEAK_DEBUG
	spin_lock_irqsave(&leak_lock, flags);
	list_add(&state->leak_list, &states);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
#if LEAK_DEBUG
		unsigned long flags;
#endif
		WARN_ON(state->tree);
#if LEAK_DEBUG
		spin_lock_irqsave(&leak_lock, flags);
		list_del(&state->leak_list);
		spin_unlock_irqrestore(&leak_lock, flags);
#endif
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

	entry = rb_entry(node, struct tree_entry, rb_node);
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **prev_ret,
				      struct rb_node **next_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset > entry->end)
			n = n->rb_right;
		else
			return n;
	}

	if (prev_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &prev, NULL);
	if (!ret)
		return prev;
	return ret;
}
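
/*
 * Usage sketch (illustrative comment, not part of the original code):
 * tree_search() returns the node whose [start, end] contains 'offset', or
 * the first node that begins after 'offset' when there is no direct hit.
 * With tree->lock held, a caller can walk forward from the result, e.g.
 * (the names 'range_end', 'wanted_bits' and 'handle_state' are hypothetical):
 *
 *	node = tree_search(tree, offset);
 *	while (node) {
 *		state = rb_entry(node, struct extent_state, rb_node);
 *		if (state->start > range_end)
 *			break;
 *		if (state->state & wanted_bits)
 *			handle_state(state);
 *		node = rb_next(node);
 *	}
 *
 * This is the pattern used by the lookup helpers further down in this file.
 */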

static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (tree->ops && tree->ops->merge_extent_hook)
		tree->ops->merge_extent_hook(tree->mapping->host, new,
					     other);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IOBITS in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
			struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->end = other->end;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
}

static void set_state_cb(struct extent_io_tree *tree,
			 struct extent_state *state, int *bits)
{
	if (tree->ops && tree->ops->set_bit_hook)
		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
}

static void clear_state_cb(struct extent_io_tree *tree,
			   struct extent_state *state, int *bits)
{
	if (tree->ops && tree->ops->clear_bit_hook)
		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, int *bits);

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			int *bits)
{
	struct rb_node *node;

	if (end < start) {
		printk(KERN_ERR "btrfs end < start %llu %llu\n",
		       (unsigned long long)end,
		       (unsigned long long)start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits);

	node = tree_insert(&tree->state, end, &state->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
		       "%llu %llu\n", (unsigned long long)found->start,
		       (unsigned long long)found->end,
		       (unsigned long long)start, (unsigned long long)end);
		return -EEXIST;
	}
	state->tree = tree;
	merge_state(tree, state);
	return 0;
}

static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
		     u64 split)
{
	if (tree->ops && tree->ops->split_extent_hook)
		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling, the tree has 'orig' at [orig->start, orig->end].
 * After calling, there are two extent state structs in the tree:
 *	prealloc:	[orig->start, split - 1]
 *	orig:		[split, orig->end]
 *
 * The tree locks are not taken by this function.  They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	split_cb(tree, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	prealloc->tree = tree;
	return 0;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1), or
 * forcibly remove the state from the tree (delete == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static int clear_state_bit(struct extent_io_tree *tree,
			   struct extent_state *state,
			   int *bits, int wake)
{
	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret = state->state & bits_to_clear;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}
	clear_state_cb(tree, state, bits);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		if (state->tree) {
			rb_erase(&state->rb_node, &tree->state);
			state->tree = NULL;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
	}
	return ret;
}

static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}
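
/*
 * Note (added for illustration, not from the original source): the
 * set/clear/convert helpers below all share one preallocation pattern so
 * they never sleep while holding tree->lock.  Roughly:
 *
 *	again:
 *		if (!prealloc && (mask & __GFP_WAIT))
 *			prealloc = alloc_extent_state(mask);	(may sleep)
 *		spin_lock(&tree->lock);
 *		...
 *		prealloc = alloc_extent_state_atomic(prealloc);	(never sleeps)
 *		...
 *		spin_unlock(&tree->lock);
 *		goto again;					(retry path)
 *
 * alloc_extent_state_atomic() only falls back to GFP_ATOMIC when no
 * preallocated state was carried in from outside the lock.
 */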

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
 * bits were already set, or zero if none of the bits were already set.
 */
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, int wake, int delete,
		     struct extent_state **cached_state,
		     gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *next_node;
	struct rb_node *node;
	u64 last_end;
	int err;
	int set = 0;
	int clear = 0;

	if (delete)
		bits |= ~EXTENT_CTLBITS;
	bits |= EXTENT_FIRST_DELALLOC;

	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && cached->tree && cached->start <= start &&
		    cached->end > start) {
			if (clear)
				atomic_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set |= clear_state_bit(tree, state, &bits, wake);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);
		if (wake)
			wake_up(&state->wq);

		set |= clear_state_bit(tree, prealloc, &bits, wake);

		prealloc = NULL;
		goto out;
	}

	if (state->end < end && prealloc && !need_resched())
		next_node = rb_next(&state->rb_node);
	else
		next_node = NULL;

	set |= clear_state_bit(tree, state, &bits, wake);
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && next_node) {
		state = rb_entry(next_node, struct extent_state,
				 rb_node);
		if (state->start == start)
			goto hit_next;
	}
	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return set;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
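
/*
 * Usage sketch (hypothetical caller, added for illustration): truncate-style
 * callers drop every bit in a range by passing delete == 1 and wake == 1:
 *
 *	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end,
 *			 EXTENT_DIRTY | EXTENT_DELALLOC,
 *			 1, 1, NULL, GFP_NOFS);
 *
 * The return value is > 0 if any of the requested bits were set somewhere
 * in [start, end], 0 if none were, and < 0 on error.
 */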

static int wait_on_state(struct extent_io_tree *tree,
			 struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
	return 0;
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
{
	struct extent_state *state;
	struct rb_node *node;

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			atomic_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		cond_resched_lock(&tree->lock);
	}
out:
	spin_unlock(&tree->lock);
	return 0;
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   int *bits)
{
	int bits_to_set = *bits & ~EXTENT_CTLBITS;

	set_state_cb(tree, state, bits);
	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	state->state |= bits_to_set;
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
			*cached_ptr = state;
			atomic_inc(&state->refs);
		}
	}
}

static void uncache_state(struct extent_state **cached_ptr)
{
	if (cached_ptr && (*cached_ptr)) {
		struct extent_state *state = *cached_ptr;
		*cached_ptr = NULL;
		free_extent_state(state);
	}
}

/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive.  This takes the tree lock.
 */

int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   int bits, int exclusive_bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	u64 last_start;
	u64 last_end;

	bits |= EXTENT_FIRST_DELALLOC;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		BUG_ON(!prealloc);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    state->tree) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end, &bits);
		prealloc = NULL;
		BUG_ON(err == -EEXIST);
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		struct rb_node *next_node;
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits);

		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;

		start = last_end + 1;
		next_node = rb_next(&state->rb_node);
		if (next_node && start < end && prealloc && !need_resched()) {
			state = rb_entry(next_node, struct extent_state,
					 rb_node);
			if (state->start == start)
				goto hit_next;
		}
		goto search_again;
	}

	/*
	 * | ---- desired range ---- |
	 *  | state |
	 *   or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid freeing 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   &bits);
		BUG_ON(err == -EEXIST);
		if (err) {
			free_extent_state(prealloc);
			prealloc = NULL;
			goto out;
		}
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);

		set_state_bits(tree, prealloc, &bits);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}

/**
 * convert_extent_bit - convert all bits in a given range from one bit to another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @mask:	the allocation mask
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       int bits, int clear_bits, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	u64 last_start;
	u64 last_end;

again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end, &bits);
		prealloc = NULL;
		BUG_ON(err == -EEXIST);
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		struct rb_node *next_node;

		set_state_bits(tree, state, &bits);
		clear_state_bit(tree, state, &clear_bits, 0);

		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;

		start = last_end + 1;
		next_node = rb_next(&state->rb_node);
		if (next_node && start < end && prealloc && !need_resched()) {
			state = rb_entry(next_node, struct extent_state,
					 rb_node);
			if (state->start == start)
				goto hit_next;
		}
		goto search_again;
	}

	/*
	 * | ---- desired range ---- |
	 *  | state |
	 *   or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits);
			clear_state_bit(tree, state, &clear_bits, 0);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid freeing 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   &bits);
		BUG_ON(err == -EEXIST);
		if (err) {
			free_extent_state(prealloc);
			prealloc = NULL;
			goto out;
		}
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);

		set_state_bits(tree, prealloc, &bits);
		clear_state_bit(tree, prealloc, &clear_bits, 0);

		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}

/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
			      NULL, mask);
}

int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		    int bits, gfp_t mask)
{
	return set_extent_bit(tree, start, end, bits, 0, NULL,
			      NULL, mask);
}

int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		      int bits, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}

int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_UPTODATE,
			      0, NULL, cached_state, mask);
}

int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		       gfp_t mask)
{
	return clear_extent_bit(tree, start, end,
				EXTENT_DIRTY | EXTENT_DELALLOC |
				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
}

int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
		   gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
			      NULL, mask);
}

int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
			      NULL, cached_state, mask);
}

static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
				 u64 end, struct extent_state **cached_state,
				 gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
				cached_state, mask);
}
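
/*
 * Usage sketch (illustrative, not part of the original file): the wrappers
 * above are thin conveniences around set/clear_extent_bit().  A write path
 * might, for example, mark a byte range delalloc and later clear it again:
 *
 *	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 *	struct extent_state *cached = NULL;
 *
 *	set_extent_delalloc(tree, start, end, &cached, GFP_NOFS);
 *	...
 *	clear_extent_dirty(tree, start, end, GFP_NOFS);
 *	free_extent_state(cached);
 */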

/*
 * either insert or lock the state struct between start and end.  Use mask
 * to tell us if waiting is desired.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, struct extent_state **cached_state, gfp_t mask)
{
	int err;
	u64 failed_start;
	while (1) {
		err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
				     EXTENT_LOCKED, &failed_start,
				     cached_state, mask);
		if (err == -EEXIST && (mask & __GFP_WAIT)) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else {
			break;
		}
		WARN_ON(start > end);
	}
	return err;
}

int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
{
	return lock_extent_bits(tree, start, end, 0, NULL, mask);
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    gfp_t mask)
{
	int err;
	u64 failed_start;

	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			     &failed_start, NULL, mask);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, 1, 0, NULL, mask);
		return 0;
	}
	return 1;
}

int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
			 struct extent_state **cached, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
				mask);
}

int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
				mask);
}

/*
 * helper function to set both pages and extents in the tree writeback
 */
static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		set_page_writeback(page);
		page_cache_release(page);
		index++;
	}
	return 0;
}

/* find the first state struct with 'bits' set after 'start', and
 * return it.  tree->lock must be held.  NULL will be returned if
 * nothing was found after 'start'
 */
struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
						 u64 start, int bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}
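
/*
 * Usage sketch: find_first_extent_bit_state() requires tree->lock, so
 * callers (clean_io_failure() later in this file is one example) wrap it
 * like this:
 *
 *	spin_lock(&tree->lock);
 *	state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
 *	spin_unlock(&tree->lock);
 *
 * find_first_extent_bit() below is the locked convenience wrapper.
 */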

/*
 * find the first offset in the io tree with 'bits' set.  zero is
 * returned if we find something, and *start_ret and *end_ret are
 * set to reflect the state struct that was found.
 *
 * If nothing was found, 1 is returned, < 0 on error
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, int bits)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	state = find_first_extent_bit_state(tree, start, bits);
	if (state) {
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
	spin_unlock(&tree->lock);
	return ret;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
					u64 *start, u64 *end, u64 max_bytes,
					struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 found = 0;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node) {
		if (!found)
			*end = (u64)-1;
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			*start = state->start;
			*cached_state = state;
			atomic_inc(&state->refs);
		}
		found++;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		if (!node)
			break;
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return found;
}

static noinline int __unlock_for_delalloc(struct inode *inode,
					  struct page *locked_page,
					  u64 start, u64 end)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;

	if (index == locked_page->index && end_index == index)
		return 0;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long, nr_pages,
				     ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {
			if (pages[i] != locked_page)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
	return 0;
}

static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 delalloc_start,
					u64 delalloc_end)
{
	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
	unsigned long start_index = index;
	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
	unsigned long pages_locked = 0;
	struct page *pages[16];
	unsigned long nrpages;
	int ret;
	int i;

	/* the caller is responsible for locking the start index */
	if (index == locked_page->index && index == end_index)
		return 0;

	/* skip the page at the start index */
	nrpages = end_index - index + 1;
	while (nrpages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nrpages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			ret = -EAGAIN;
			goto done;
		}
		/* now we have an array of pages, lock them all */
		for (i = 0; i < ret; i++) {
			/*
			 * the caller is taking responsibility for
			 * locked_page
			 */
			if (pages[i] != locked_page) {
				lock_page(pages[i]);
				if (!PageDirty(pages[i]) ||
				    pages[i]->mapping != inode->i_mapping) {
					ret = -EAGAIN;
					unlock_page(pages[i]);
					page_cache_release(pages[i]);
					goto done;
				}
			}
			page_cache_release(pages[i]);
			pages_locked++;
		}
		nrpages -= ret;
		index += ret;
		cond_resched();
	}
	ret = 0;
done:
	if (ret && pages_locked) {
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start,
			      ((u64)(start_index + pages_locked - 1)) <<
			      PAGE_CACHE_SHIFT);
	}
	return ret;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_lock_delalloc_range(struct inode *inode,
					     struct extent_io_tree *tree,
					     struct page *locked_page,
					     u64 *start, u64 *end,
					     u64 max_bytes)
{
	u64 delalloc_start;
	u64 delalloc_end;
	u64 found;
	struct extent_state *cached_state = NULL;
	int ret;
	int loops = 0;

again:
	/* step one, find a bunch of delalloc bytes starting at start */
	delalloc_start = *start;
	delalloc_end = 0;
	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
				    max_bytes, &cached_state);
	if (!found || delalloc_end <= *start) {
		*start = delalloc_start;
		*end = delalloc_end;
		free_extent_state(cached_state);
		return found;
	}

	/*
	 * start comes from the offset of locked_page.  We have to lock
	 * pages in order, so we can't process delalloc bytes before
	 * locked_page
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of pages we try to lock down
	 * if we're looping.
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;

	/* step two, lock all the pages after the page that has start */
	ret = lock_delalloc_pages(inode, locked_page,
				  delalloc_start, delalloc_end);
	if (ret == -EAGAIN) {
		/* some of the pages are gone, let's avoid looping by
		 * shortening the size of the delalloc range we're searching
		 */
		free_extent_state(cached_state);
		if (!loops) {
			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
			max_bytes = PAGE_CACHE_SIZE - offset;
			loops = 1;
			goto again;
		} else {
			found = 0;
			goto out_failed;
		}
	}
	BUG_ON(ret);

	/* step three, lock the state bits for the whole range */
	lock_extent_bits(tree, delalloc_start, delalloc_end,
			 0, &cached_state, GFP_NOFS);

	/* then test to make sure it is all still delalloc */
	ret = test_range_bit(tree, delalloc_start, delalloc_end,
			     EXTENT_DELALLOC, 1, cached_state);
	if (!ret) {
		unlock_extent_cached(tree, delalloc_start, delalloc_end,
				     &cached_state, GFP_NOFS);
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start, delalloc_end);
		cond_resched();
		goto again;
	}
	free_extent_state(cached_state);
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}

int extent_clear_unlock_delalloc(struct inode *inode,
				 struct extent_io_tree *tree,
				 u64 start, u64 end, struct page *locked_page,
				 unsigned long op)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;
	int clear_bits = 0;

	if (op & EXTENT_CLEAR_UNLOCK)
		clear_bits |= EXTENT_LOCKED;
	if (op & EXTENT_CLEAR_DIRTY)
		clear_bits |= EXTENT_DIRTY;

	if (op & EXTENT_CLEAR_DELALLOC)
		clear_bits |= EXTENT_DELALLOC;

	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
		    EXTENT_SET_PRIVATE2)))
		return 0;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {

			if (op & EXTENT_SET_PRIVATE2)
				SetPagePrivate2(pages[i]);

			if (pages[i] == locked_page) {
				page_cache_release(pages[i]);
				continue;
			}
			if (op & EXTENT_CLEAR_DIRTY)
				clear_page_dirty_for_io(pages[i]);
			if (op & EXTENT_SET_WRITEBACK)
				set_page_writeback(pages[i]);
			if (op & EXTENT_END_WRITEBACK)
				end_page_writeback(pages[i]);
			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
	return 0;
}
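
/*
 * Usage sketch (hypothetical caller, for illustration only): the 'op'
 * bitmask lets a single call clear the tree bits and fix up the pages of a
 * finished delalloc range, e.g.:
 *
 *	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 *				     start, end, locked_page,
 *				     EXTENT_CLEAR_UNLOCK_PAGE |
 *				     EXTENT_CLEAR_UNLOCK |
 *				     EXTENT_CLEAR_DELALLOC |
 *				     EXTENT_CLEAR_DIRTY);
 */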

/*
 * count the number of bytes in the tree that have a given bit(s)
 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
 * cached.  The total number found is returned.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     unsigned long bits, int contig)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	u64 last = 0;
	int found = 0;

	if (search_end <= cur_start) {
		WARN_ON(1);
		return 0;
	}

	spin_lock(&tree->lock);
	if (cur_start == 0 && bits == EXTENT_DIRTY) {
		total_bytes = tree->dirty_bytes;
		goto out;
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > search_end)
			break;
		if (contig && found && state->start > last + 1)
			break;
		if (state->end >= cur_start && (state->state & bits) == bits) {
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = max(cur_start, state->start);
				found = 1;
			}
			last = state->end;
		} else if (contig && found) {
			break;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return total_bytes;
}

/*
 * set the private field for a given byte offset in the tree.  If there isn't
 * an extent_state there already, this does nothing.
 */
int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	state->private = private;
out:
	spin_unlock(&tree->lock);
	return ret;
}

int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	*private = state->private;
out:
	spin_unlock(&tree->lock);
	return ret;
}
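
/*
 * Usage sketch: the private field is an opaque u64.  The IO failure code
 * later in this file stores a pointer to a struct io_failure_record in it:
 *
 *	set_state_private(failure_tree, start, (u64)(unsigned long)failrec);
 *	...
 *	ret = get_state_private(failure_tree, start, &private);
 *	failrec = (struct io_failure_record *)(unsigned long)private;
 */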

/*
 * searches a range in the state tree for a given mask.
 * If 'filled' == 1, this returns 1 only if every extent in the range
 * has the bits set.  Otherwise, 1 is returned if any bit in the
 * range is found set.
 */
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   int bits, int filled, struct extent_state *cached)
{
	struct extent_state *state = NULL;
	struct rb_node *node;
	int bitset = 0;

	spin_lock(&tree->lock);
	if (cached && cached->tree && cached->start <= start &&
	    cached->end > start)
		node = &cached->rb_node;
	else
		node = tree_search(tree, start);
	while (node && start <= end) {
		state = rb_entry(node, struct extent_state, rb_node);

		if (filled && state->start > start) {
			bitset = 0;
			break;
		}

		if (state->start > end)
			break;

		if (state->state & bits) {
			bitset = 1;
			if (!filled)
				break;
		} else if (filled) {
			bitset = 0;
			break;
		}

		if (state->end == (u64)-1)
			break;

		start = state->end + 1;
		if (start > end)
			break;
		node = rb_next(node);
		if (!node) {
			if (filled)
				bitset = 0;
			break;
		}
	}
	spin_unlock(&tree->lock);
	return bitset;
}

/*
 * helper function to set a given page up to date if all the
 * extents in the tree for that page are up to date
 */
static int check_page_uptodate(struct extent_io_tree *tree,
			       struct page *page)
{
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
		SetPageUptodate(page);
	return 0;
}

/*
 * helper function to unlock a page if all the extents in the tree
 * for that page are unlocked
 */
static int check_page_locked(struct extent_io_tree *tree,
			     struct page *page)
{
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
		unlock_page(page);
	return 0;
}

/*
 * helper function to end page writeback if all the extents
 * in the tree for that page are done with writeback
 */
static int check_page_writeback(struct extent_io_tree *tree,
				struct page *page)
{
	end_page_writeback(page);
	return 0;
}

/*
 * When IO fails, either with EIO or because csum verification fails, we
 * try other mirrors that might have a good copy of the data.  This
 * io_failure_record is used to record state as we go through all the
 * mirrors.  If another mirror has good data, the page is set up to date
 * and things continue.  If a good mirror can't be found, the original
 * bio end_io callback is called to indicate things have failed.
 */
struct io_failure_record {
	struct page *page;
	u64 start;
	u64 len;
	u64 logical;
	unsigned long bio_flags;
	int this_mirror;
	int failed_mirror;
	int in_validation;
};

static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
				int did_repair)
{
	int ret;
	int err = 0;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;

	set_state_private(failure_tree, rec->start, 0);
	ret = clear_extent_bits(failure_tree, rec->start,
				rec->start + rec->len - 1,
				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
	if (ret)
		err = ret;

	if (did_repair) {
		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
					rec->start + rec->len - 1,
					EXTENT_DAMAGED, GFP_NOFS);
		if (ret && !err)
			err = ret;
	}

	kfree(rec);
	return err;
}

static void repair_io_failure_callback(struct bio *bio, int err)
{
	complete(bio->bi_private);
}

/*
 * this bypasses the standard btrfs submit functions deliberately, as
 * the standard behavior is to write all copies in a raid setup.  here we only
 * want to write the one bad copy, so we do the mapping for ourselves and issue
 * submit_bio directly.
 * to avoid any synchronization issues, wait for the data after writing, which
 * actually prevents the read that triggered the error from finishing.
 * currently, there can be no more than two copies of every data bit.  thus,
 * exactly one rewrite is required.
 */
int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
			u64 length, u64 logical, struct page *page,
			int mirror_num)
{
	struct bio *bio;
	struct btrfs_device *dev;
	DECLARE_COMPLETION_ONSTACK(compl);
	u64 map_length = 0;
	u64 sector;
	struct btrfs_bio *bbio = NULL;
	int ret;

	BUG_ON(!mirror_num);

	bio = bio_alloc(GFP_NOFS, 1);
	if (!bio)
		return -EIO;
	bio->bi_private = &compl;
	bio->bi_end_io = repair_io_failure_callback;
	bio->bi_size = 0;
	map_length = length;

	ret = btrfs_map_block(map_tree, WRITE, logical,
			      &map_length, &bbio, mirror_num);
	if (ret) {
		bio_put(bio);
		return -EIO;
	}
	BUG_ON(mirror_num != bbio->mirror_num);
	sector = bbio->stripes[mirror_num-1].physical >> 9;
	bio->bi_sector = sector;
	dev = bbio->stripes[mirror_num-1].dev;
	kfree(bbio);
	if (!dev || !dev->bdev || !dev->writeable) {
		bio_put(bio);
		return -EIO;
	}
	bio->bi_bdev = dev->bdev;
	bio_add_page(bio, page, length, start - page_offset(page));
	btrfsic_submit_bio(WRITE_SYNC, bio);
	wait_for_completion(&compl);

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
		/* try to remap that extent elsewhere? */
		bio_put(bio);
		return -EIO;
	}

	printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
			 "sector %llu)\n", page->mapping->host->i_ino, start,
			 dev->name, sector);

	bio_put(bio);
	return 0;
}

/*
 * each time an IO finishes, we do a fast check in the IO failure tree
 * to see if we need to process or clean up an io_failure_record
 */
static int clean_io_failure(u64 start, struct page *page)
{
	u64 private;
	u64 private_failure;
	struct io_failure_record *failrec;
	struct btrfs_mapping_tree *map_tree;
	struct extent_state *state;
	int num_copies;
	int did_repair = 0;
	int ret;
	struct inode *inode = page->mapping->host;

	private = 0;
	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
				(u64)-1, 1, EXTENT_DIRTY, 0);
	if (!ret)
		return 0;

	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
				&private_failure);
	if (ret)
		return 0;

	failrec = (struct io_failure_record *)(unsigned long) private_failure;
	BUG_ON(!failrec->this_mirror);

	if (failrec->in_validation) {
		/* there was no real error, just free the record */
		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
			 failrec->start);
		did_repair = 1;
		goto out;
	}

	spin_lock(&BTRFS_I(inode)->io_tree.lock);
	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
					    failrec->start,
					    EXTENT_LOCKED);
	spin_unlock(&BTRFS_I(inode)->io_tree.lock);

	if (state && state->start == failrec->start) {
		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
		num_copies = btrfs_num_copies(map_tree, failrec->logical,
						failrec->len);
		if (num_copies > 1) {
			ret = repair_io_failure(map_tree, start, failrec->len,
						failrec->logical, page,
						failrec->failed_mirror);
			did_repair = !ret;
		}
	}

out:
	if (!ret)
		ret = free_io_failure(inode, failrec, did_repair);

	return ret;
}

/*
 * this is a generic handler for readpage errors (default
 * readpage_io_failed_hook).  if other copies exist, read those and write back
 * good data to the failed position.  it does not try to remap the failed
 * extent elsewhere, but hopes the device will be smart enough to do this as
 * needed
 */

static int bio_readpage_error(struct bio *failed_bio, struct page *page,
				u64 start, u64 end, int failed_mirror,
				struct extent_state *state)
{
	struct io_failure_record *failrec = NULL;
	u64 private;
	struct extent_map *em;
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct bio *bio;
	int num_copies;
	int ret;
	int read_mode;
	u64 logical;

	BUG_ON(failed_bio->bi_rw & REQ_WRITE);

	ret = get_state_private(failure_tree, start, &private);
	if (ret) {
		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
		if (!failrec)
			return -ENOMEM;
		failrec->start = start;
		failrec->len = end - start + 1;
		failrec->this_mirror = 0;
		failrec->bio_flags = 0;
		failrec->in_validation = 0;

		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, failrec->len);
		if (!em) {
			read_unlock(&em_tree->lock);
			kfree(failrec);
			return -EIO;
		}

		if (em->start > start || em->start + em->len < start) {
			free_extent_map(em);
			em = NULL;
		}
		read_unlock(&em_tree->lock);

		if (!em || IS_ERR(em)) {
			kfree(failrec);
			return -EIO;
		}
		logical = start - em->start;
		logical = em->block_start + logical;
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			logical = em->block_start;
			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
			extent_set_compress_type(&failrec->bio_flags,
						 em->compress_type);
		}
		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
			 "len=%llu\n", logical, start, failrec->len);
		failrec->logical = logical;
		free_extent_map(em);

		/* set the bits in the private failure tree */
		ret = set_extent_bits(failure_tree, start, end,
					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
		if (ret >= 0)
			ret = set_state_private(failure_tree, start,
						(u64)(unsigned long)failrec);
		/* set the bits in the inode's tree */
		if (ret >= 0)
			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
						GFP_NOFS);
		if (ret < 0) {
			kfree(failrec);
			return ret;
		}
	} else {
		failrec = (struct io_failure_record *)(unsigned long)private;
		pr_debug("bio_readpage_error: (found) logical=%llu, "
			 "start=%llu, len=%llu, validation=%d\n",
			 failrec->logical, failrec->start, failrec->len,
			 failrec->in_validation);
		/*
		 * when data can be on disk more than twice, add to failrec here
		 * (e.g. with a list for failed_mirror) to make
		 * clean_io_failure() clean all those errors at once.
		 */
	}
	num_copies = btrfs_num_copies(
			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
			      failrec->logical, failrec->len);
	if (num_copies == 1) {
		/*
		 * we only have a single copy of the data, so don't bother with
		 * all the retry and error correction code that follows.  no
		 * matter what the error is, it is very likely to persist.
		 */
		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
" 2084 "state=%p, num_copies=%d, next_mirror %d, " 2085 "failed_mirror %d\n", state, num_copies, 2086 failrec->this_mirror, failed_mirror); 2087 free_io_failure(inode, failrec, 0); 2088 return -EIO; 2089 } 2090 2091 if (!state) { 2092 spin_lock(&tree->lock); 2093 state = find_first_extent_bit_state(tree, failrec->start, 2094 EXTENT_LOCKED); 2095 if (state && state->start != failrec->start) 2096 state = NULL; 2097 spin_unlock(&tree->lock); 2098 } 2099 2100 /* 2101 * there are two premises: 2102 * a) deliver good data to the caller 2103 * b) correct the bad sectors on disk 2104 */ 2105 if (failed_bio->bi_vcnt > 1) { 2106 /* 2107 * to fulfill b), we need to know the exact failing sectors, as 2108 * we don't want to rewrite any more than the failed ones. thus, 2109 * we need separate read requests for the failed bio 2110 * 2111 * if the following BUG_ON triggers, our validation request got 2112 * merged. we need separate requests for our algorithm to work. 2113 */ 2114 BUG_ON(failrec->in_validation); 2115 failrec->in_validation = 1; 2116 failrec->this_mirror = failed_mirror; 2117 read_mode = READ_SYNC | REQ_FAILFAST_DEV; 2118 } else { 2119 /* 2120 * we're ready to fulfill a) and b) alongside. get a good copy 2121 * of the failed sector and if we succeed, we have setup 2122 * everything for repair_io_failure to do the rest for us. 2123 */ 2124 if (failrec->in_validation) { 2125 BUG_ON(failrec->this_mirror != failed_mirror); 2126 failrec->in_validation = 0; 2127 failrec->this_mirror = 0; 2128 } 2129 failrec->failed_mirror = failed_mirror; 2130 failrec->this_mirror++; 2131 if (failrec->this_mirror == failed_mirror) 2132 failrec->this_mirror++; 2133 read_mode = READ_SYNC; 2134 } 2135 2136 if (!state || failrec->this_mirror > num_copies) { 2137 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " 2138 "next_mirror %d, failed_mirror %d\n", state, 2139 num_copies, failrec->this_mirror, failed_mirror); 2140 free_io_failure(inode, failrec, 0); 2141 return -EIO; 2142 } 2143 2144 bio = bio_alloc(GFP_NOFS, 1); 2145 bio->bi_private = state; 2146 bio->bi_end_io = failed_bio->bi_end_io; 2147 bio->bi_sector = failrec->logical >> 9; 2148 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 2149 bio->bi_size = 0; 2150 2151 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 2152 2153 pr_debug("bio_readpage_error: submitting new read[%#x] to " 2154 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, 2155 failrec->this_mirror, num_copies, failrec->in_validation); 2156 2157 tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, 2158 failrec->bio_flags, 0); 2159 return 0; 2160 } 2161 2162 /* lots and lots of room for performance fixes in the end_bio funcs */ 2163 2164 /* 2165 * after a writepage IO is done, we need to: 2166 * clear the uptodate bits on error 2167 * clear the writeback bits in the extent tree for this IO 2168 * end_page_writeback if the page has no more pending IO 2169 * 2170 * Scheduling is not allowed, so the extent state tree is expected 2171 * to have one and only one object corresponding to this IO. 
2172 */ 2173 static void end_bio_extent_writepage(struct bio *bio, int err) 2174 { 2175 int uptodate = err == 0; 2176 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2177 struct extent_io_tree *tree; 2178 u64 start; 2179 u64 end; 2180 int whole_page; 2181 int ret; 2182 2183 do { 2184 struct page *page = bvec->bv_page; 2185 tree = &BTRFS_I(page->mapping->host)->io_tree; 2186 2187 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2188 bvec->bv_offset; 2189 end = start + bvec->bv_len - 1; 2190 2191 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 2192 whole_page = 1; 2193 else 2194 whole_page = 0; 2195 2196 if (--bvec >= bio->bi_io_vec) 2197 prefetchw(&bvec->bv_page->flags); 2198 if (tree->ops && tree->ops->writepage_end_io_hook) { 2199 ret = tree->ops->writepage_end_io_hook(page, start, 2200 end, NULL, uptodate); 2201 if (ret) 2202 uptodate = 0; 2203 } 2204 2205 if (!uptodate && tree->ops && 2206 tree->ops->writepage_io_failed_hook) { 2207 ret = tree->ops->writepage_io_failed_hook(bio, page, 2208 start, end, NULL); 2209 if (ret == 0) { 2210 uptodate = (err == 0); 2211 continue; 2212 } 2213 } 2214 2215 if (!uptodate) { 2216 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); 2217 ClearPageUptodate(page); 2218 SetPageError(page); 2219 } 2220 2221 if (whole_page) 2222 end_page_writeback(page); 2223 else 2224 check_page_writeback(tree, page); 2225 } while (bvec >= bio->bi_io_vec); 2226 2227 bio_put(bio); 2228 } 2229 2230 /* 2231 * after a readpage IO is done, we need to: 2232 * clear the uptodate bits on error 2233 * set the uptodate bits if things worked 2234 * set the page up to date if all extents in the tree are uptodate 2235 * clear the lock bit in the extent tree 2236 * unlock the page if there are no other extents locked for it 2237 * 2238 * Scheduling is not allowed, so the extent state tree is expected 2239 * to have one and only one object corresponding to this IO. 
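 * Note (added remark, not in the original source): bio completion handlers
 * can run from atomic context, which is why nothing in this path may sleep.
 * If a page fails verification here, the handler may resubmit a read of that
 * range from another mirror via bio_readpage_error() instead of reporting the
 * error immediately.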
2240 */ 2241 static void end_bio_extent_readpage(struct bio *bio, int err) 2242 { 2243 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 2244 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 2245 struct bio_vec *bvec = bio->bi_io_vec; 2246 struct extent_io_tree *tree; 2247 u64 start; 2248 u64 end; 2249 int whole_page; 2250 int ret; 2251 2252 if (err) 2253 uptodate = 0; 2254 2255 do { 2256 struct page *page = bvec->bv_page; 2257 struct extent_state *cached = NULL; 2258 struct extent_state *state; 2259 2260 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " 2261 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, 2262 (long int)bio->bi_bdev); 2263 tree = &BTRFS_I(page->mapping->host)->io_tree; 2264 2265 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2266 bvec->bv_offset; 2267 end = start + bvec->bv_len - 1; 2268 2269 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 2270 whole_page = 1; 2271 else 2272 whole_page = 0; 2273 2274 if (++bvec <= bvec_end) 2275 prefetchw(&bvec->bv_page->flags); 2276 2277 spin_lock(&tree->lock); 2278 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); 2279 if (state && state->start == start) { 2280 /* 2281 * take a reference on the state, unlock will drop 2282 * the ref 2283 */ 2284 cache_state(state, &cached); 2285 } 2286 spin_unlock(&tree->lock); 2287 2288 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 2289 ret = tree->ops->readpage_end_io_hook(page, start, end, 2290 state); 2291 if (ret) 2292 uptodate = 0; 2293 else 2294 clean_io_failure(start, page); 2295 } 2296 if (!uptodate) { 2297 int failed_mirror; 2298 failed_mirror = (int)(unsigned long)bio->bi_bdev; 2299 /* 2300 * The generic bio_readpage_error handles errors the 2301 * following way: If possible, new read requests are 2302 * created and submitted and will end up in 2303 * end_bio_extent_readpage as well (if we're lucky, not 2304 * in the !uptodate case). In that case it returns 0 and 2305 * we just go on with the next page in our bio. If it 2306 * can't handle the error it will return -EIO and we 2307 * remain responsible for that page. 
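			 * A readpage_io_failed_hook, if the tree provides
			 * one, then gets a final chance to recover before we
			 * mark the page as failed below.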
2308 */ 2309 ret = bio_readpage_error(bio, page, start, end, 2310 failed_mirror, NULL); 2311 if (ret == 0) { 2312 error_handled: 2313 uptodate = 2314 test_bit(BIO_UPTODATE, &bio->bi_flags); 2315 if (err) 2316 uptodate = 0; 2317 uncache_state(&cached); 2318 continue; 2319 } 2320 if (tree->ops && tree->ops->readpage_io_failed_hook) { 2321 ret = tree->ops->readpage_io_failed_hook( 2322 bio, page, start, end, 2323 failed_mirror, state); 2324 if (ret == 0) 2325 goto error_handled; 2326 } 2327 } 2328 2329 if (uptodate) { 2330 set_extent_uptodate(tree, start, end, &cached, 2331 GFP_ATOMIC); 2332 } 2333 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2334 2335 if (whole_page) { 2336 if (uptodate) { 2337 SetPageUptodate(page); 2338 } else { 2339 ClearPageUptodate(page); 2340 SetPageError(page); 2341 } 2342 unlock_page(page); 2343 } else { 2344 if (uptodate) { 2345 check_page_uptodate(tree, page); 2346 } else { 2347 ClearPageUptodate(page); 2348 SetPageError(page); 2349 } 2350 check_page_locked(tree, page); 2351 } 2352 } while (bvec <= bvec_end); 2353 2354 bio_put(bio); 2355 } 2356 2357 struct bio * 2358 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 2359 gfp_t gfp_flags) 2360 { 2361 struct bio *bio; 2362 2363 bio = bio_alloc(gfp_flags, nr_vecs); 2364 2365 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 2366 while (!bio && (nr_vecs /= 2)) 2367 bio = bio_alloc(gfp_flags, nr_vecs); 2368 } 2369 2370 if (bio) { 2371 bio->bi_size = 0; 2372 bio->bi_bdev = bdev; 2373 bio->bi_sector = first_sector; 2374 } 2375 return bio; 2376 } 2377 2378 static int submit_one_bio(int rw, struct bio *bio, int mirror_num, 2379 unsigned long bio_flags) 2380 { 2381 int ret = 0; 2382 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2383 struct page *page = bvec->bv_page; 2384 struct extent_io_tree *tree = bio->bi_private; 2385 u64 start; 2386 2387 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 2388 2389 bio->bi_private = NULL; 2390 2391 bio_get(bio); 2392 2393 if (tree->ops && tree->ops->submit_bio_hook) 2394 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 2395 mirror_num, bio_flags, start); 2396 else 2397 btrfsic_submit_bio(rw, bio); 2398 2399 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2400 ret = -EOPNOTSUPP; 2401 bio_put(bio); 2402 return ret; 2403 } 2404 2405 static int submit_extent_page(int rw, struct extent_io_tree *tree, 2406 struct page *page, sector_t sector, 2407 size_t size, unsigned long offset, 2408 struct block_device *bdev, 2409 struct bio **bio_ret, 2410 unsigned long max_pages, 2411 bio_end_io_t end_io_func, 2412 int mirror_num, 2413 unsigned long prev_bio_flags, 2414 unsigned long bio_flags) 2415 { 2416 int ret = 0; 2417 struct bio *bio; 2418 int nr; 2419 int contig = 0; 2420 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; 2421 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 2422 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); 2423 2424 if (bio_ret && *bio_ret) { 2425 bio = *bio_ret; 2426 if (old_compressed) 2427 contig = bio->bi_sector == sector; 2428 else 2429 contig = bio->bi_sector + (bio->bi_size >> 9) == 2430 sector; 2431 2432 if (prev_bio_flags != bio_flags || !contig || 2433 (tree->ops && tree->ops->merge_bio_hook && 2434 tree->ops->merge_bio_hook(page, offset, page_size, bio, 2435 bio_flags)) || 2436 bio_add_page(bio, page, page_size, offset) < page_size) { 2437 ret = submit_one_bio(rw, bio, mirror_num, 2438 prev_bio_flags); 2439 bio = NULL; 2440 } else { 2441 return 0; 2442 } 2443 } 2444 
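	/*
	 * (Added remark.) No existing bio to merge into, or the old one was
	 * just submitted above, so start a fresh one.  Compressed extents are
	 * read as one unit, so ask for the maximum number of vecs; otherwise
	 * let the device's queue limits decide how many pages the bio can
	 * carry.
	 */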
if (this_compressed) 2445 nr = BIO_MAX_PAGES; 2446 else 2447 nr = bio_get_nr_vecs(bdev); 2448 2449 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 2450 if (!bio) 2451 return -ENOMEM; 2452 2453 bio_add_page(bio, page, page_size, offset); 2454 bio->bi_end_io = end_io_func; 2455 bio->bi_private = tree; 2456 2457 if (bio_ret) 2458 *bio_ret = bio; 2459 else 2460 ret = submit_one_bio(rw, bio, mirror_num, bio_flags); 2461 2462 return ret; 2463 } 2464 2465 void set_page_extent_mapped(struct page *page) 2466 { 2467 if (!PagePrivate(page)) { 2468 SetPagePrivate(page); 2469 page_cache_get(page); 2470 set_page_private(page, EXTENT_PAGE_PRIVATE); 2471 } 2472 } 2473 2474 static void set_page_extent_head(struct page *page, unsigned long len) 2475 { 2476 WARN_ON(!PagePrivate(page)); 2477 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 2478 } 2479 2480 /* 2481 * basic readpage implementation. Locked extent state structs are inserted 2482 * into the tree that are removed when the IO is done (by the end_io 2483 * handlers) 2484 */ 2485 static int __extent_read_full_page(struct extent_io_tree *tree, 2486 struct page *page, 2487 get_extent_t *get_extent, 2488 struct bio **bio, int mirror_num, 2489 unsigned long *bio_flags) 2490 { 2491 struct inode *inode = page->mapping->host; 2492 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2493 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2494 u64 end; 2495 u64 cur = start; 2496 u64 extent_offset; 2497 u64 last_byte = i_size_read(inode); 2498 u64 block_start; 2499 u64 cur_end; 2500 sector_t sector; 2501 struct extent_map *em; 2502 struct block_device *bdev; 2503 struct btrfs_ordered_extent *ordered; 2504 int ret; 2505 int nr = 0; 2506 size_t pg_offset = 0; 2507 size_t iosize; 2508 size_t disk_io_size; 2509 size_t blocksize = inode->i_sb->s_blocksize; 2510 unsigned long this_bio_flag = 0; 2511 2512 set_page_extent_mapped(page); 2513 2514 if (!PageUptodate(page)) { 2515 if (cleancache_get_page(page) == 0) { 2516 BUG_ON(blocksize != PAGE_SIZE); 2517 goto out; 2518 } 2519 } 2520 2521 end = page_end; 2522 while (1) { 2523 lock_extent(tree, start, end, GFP_NOFS); 2524 ordered = btrfs_lookup_ordered_extent(inode, start); 2525 if (!ordered) 2526 break; 2527 unlock_extent(tree, start, end, GFP_NOFS); 2528 btrfs_start_ordered_extent(inode, ordered, 1); 2529 btrfs_put_ordered_extent(ordered); 2530 } 2531 2532 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2533 char *userpage; 2534 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); 2535 2536 if (zero_offset) { 2537 iosize = PAGE_CACHE_SIZE - zero_offset; 2538 userpage = kmap_atomic(page, KM_USER0); 2539 memset(userpage + zero_offset, 0, iosize); 2540 flush_dcache_page(page); 2541 kunmap_atomic(userpage, KM_USER0); 2542 } 2543 } 2544 while (cur <= end) { 2545 if (cur >= last_byte) { 2546 char *userpage; 2547 struct extent_state *cached = NULL; 2548 2549 iosize = PAGE_CACHE_SIZE - pg_offset; 2550 userpage = kmap_atomic(page, KM_USER0); 2551 memset(userpage + pg_offset, 0, iosize); 2552 flush_dcache_page(page); 2553 kunmap_atomic(userpage, KM_USER0); 2554 set_extent_uptodate(tree, cur, cur + iosize - 1, 2555 &cached, GFP_NOFS); 2556 unlock_extent_cached(tree, cur, cur + iosize - 1, 2557 &cached, GFP_NOFS); 2558 break; 2559 } 2560 em = get_extent(inode, page, pg_offset, cur, 2561 end - cur + 1, 0); 2562 if (IS_ERR_OR_NULL(em)) { 2563 SetPageError(page); 2564 unlock_extent(tree, cur, end, GFP_NOFS); 2565 break; 2566 } 2567 extent_offset = cur - em->start; 2568 BUG_ON(extent_map_end(em) <= 
cur); 2569 BUG_ON(end < cur); 2570 2571 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2572 this_bio_flag = EXTENT_BIO_COMPRESSED; 2573 extent_set_compress_type(&this_bio_flag, 2574 em->compress_type); 2575 } 2576 2577 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2578 cur_end = min(extent_map_end(em) - 1, end); 2579 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2580 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2581 disk_io_size = em->block_len; 2582 sector = em->block_start >> 9; 2583 } else { 2584 sector = (em->block_start + extent_offset) >> 9; 2585 disk_io_size = iosize; 2586 } 2587 bdev = em->bdev; 2588 block_start = em->block_start; 2589 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2590 block_start = EXTENT_MAP_HOLE; 2591 free_extent_map(em); 2592 em = NULL; 2593 2594 /* we've found a hole, just zero and go on */ 2595 if (block_start == EXTENT_MAP_HOLE) { 2596 char *userpage; 2597 struct extent_state *cached = NULL; 2598 2599 userpage = kmap_atomic(page, KM_USER0); 2600 memset(userpage + pg_offset, 0, iosize); 2601 flush_dcache_page(page); 2602 kunmap_atomic(userpage, KM_USER0); 2603 2604 set_extent_uptodate(tree, cur, cur + iosize - 1, 2605 &cached, GFP_NOFS); 2606 unlock_extent_cached(tree, cur, cur + iosize - 1, 2607 &cached, GFP_NOFS); 2608 cur = cur + iosize; 2609 pg_offset += iosize; 2610 continue; 2611 } 2612 /* the get_extent function already copied into the page */ 2613 if (test_range_bit(tree, cur, cur_end, 2614 EXTENT_UPTODATE, 1, NULL)) { 2615 check_page_uptodate(tree, page); 2616 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2617 cur = cur + iosize; 2618 pg_offset += iosize; 2619 continue; 2620 } 2621 /* we have an inline extent but it didn't get marked up 2622 * to date. Error out 2623 */ 2624 if (block_start == EXTENT_MAP_INLINE) { 2625 SetPageError(page); 2626 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2627 cur = cur + iosize; 2628 pg_offset += iosize; 2629 continue; 2630 } 2631 2632 ret = 0; 2633 if (tree->ops && tree->ops->readpage_io_hook) { 2634 ret = tree->ops->readpage_io_hook(page, cur, 2635 cur + iosize - 1); 2636 } 2637 if (!ret) { 2638 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2639 pnr -= page->index; 2640 ret = submit_extent_page(READ, tree, page, 2641 sector, disk_io_size, pg_offset, 2642 bdev, bio, pnr, 2643 end_bio_extent_readpage, mirror_num, 2644 *bio_flags, 2645 this_bio_flag); 2646 nr++; 2647 *bio_flags = this_bio_flag; 2648 } 2649 if (ret) 2650 SetPageError(page); 2651 cur = cur + iosize; 2652 pg_offset += iosize; 2653 } 2654 out: 2655 if (!nr) { 2656 if (!PageError(page)) 2657 SetPageUptodate(page); 2658 unlock_page(page); 2659 } 2660 return 0; 2661 } 2662 2663 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2664 get_extent_t *get_extent, int mirror_num) 2665 { 2666 struct bio *bio = NULL; 2667 unsigned long bio_flags = 0; 2668 int ret; 2669 2670 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 2671 &bio_flags); 2672 if (bio) 2673 ret = submit_one_bio(READ, bio, mirror_num, bio_flags); 2674 return ret; 2675 } 2676 2677 static noinline void update_nr_written(struct page *page, 2678 struct writeback_control *wbc, 2679 unsigned long nr_written) 2680 { 2681 wbc->nr_to_write -= nr_written; 2682 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 2683 wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 2684 page->mapping->writeback_index = page->index + nr_written; 2685 } 2686 2687 /* 2688 * the writepage semantics are similar to regular 
writepage. extent 2689 * records are inserted to lock ranges in the tree, and as dirty areas 2690 * are found, they are marked writeback. Then the lock bits are removed 2691 * and the end_io handler clears the writeback ranges 2692 */ 2693 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 2694 void *data) 2695 { 2696 struct inode *inode = page->mapping->host; 2697 struct extent_page_data *epd = data; 2698 struct extent_io_tree *tree = epd->tree; 2699 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2700 u64 delalloc_start; 2701 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2702 u64 end; 2703 u64 cur = start; 2704 u64 extent_offset; 2705 u64 last_byte = i_size_read(inode); 2706 u64 block_start; 2707 u64 iosize; 2708 sector_t sector; 2709 struct extent_state *cached_state = NULL; 2710 struct extent_map *em; 2711 struct block_device *bdev; 2712 int ret; 2713 int nr = 0; 2714 size_t pg_offset = 0; 2715 size_t blocksize; 2716 loff_t i_size = i_size_read(inode); 2717 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2718 u64 nr_delalloc; 2719 u64 delalloc_end; 2720 int page_started; 2721 int compressed; 2722 int write_flags; 2723 unsigned long nr_written = 0; 2724 bool fill_delalloc = true; 2725 2726 if (wbc->sync_mode == WB_SYNC_ALL) 2727 write_flags = WRITE_SYNC; 2728 else 2729 write_flags = WRITE; 2730 2731 trace___extent_writepage(page, inode, wbc); 2732 2733 WARN_ON(!PageLocked(page)); 2734 2735 ClearPageError(page); 2736 2737 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2738 if (page->index > end_index || 2739 (page->index == end_index && !pg_offset)) { 2740 page->mapping->a_ops->invalidatepage(page, 0); 2741 unlock_page(page); 2742 return 0; 2743 } 2744 2745 if (page->index == end_index) { 2746 char *userpage; 2747 2748 userpage = kmap_atomic(page, KM_USER0); 2749 memset(userpage + pg_offset, 0, 2750 PAGE_CACHE_SIZE - pg_offset); 2751 kunmap_atomic(userpage, KM_USER0); 2752 flush_dcache_page(page); 2753 } 2754 pg_offset = 0; 2755 2756 set_page_extent_mapped(page); 2757 2758 if (!tree->ops || !tree->ops->fill_delalloc) 2759 fill_delalloc = false; 2760 2761 delalloc_start = start; 2762 delalloc_end = 0; 2763 page_started = 0; 2764 if (!epd->extent_locked && fill_delalloc) { 2765 u64 delalloc_to_write = 0; 2766 /* 2767 * make sure the wbc mapping index is at least updated 2768 * to this page. 2769 */ 2770 update_nr_written(page, wbc, 0); 2771 2772 while (delalloc_end < page_end) { 2773 nr_delalloc = find_lock_delalloc_range(inode, tree, 2774 page, 2775 &delalloc_start, 2776 &delalloc_end, 2777 128 * 1024 * 1024); 2778 if (nr_delalloc == 0) { 2779 delalloc_start = delalloc_end + 1; 2780 continue; 2781 } 2782 tree->ops->fill_delalloc(inode, page, delalloc_start, 2783 delalloc_end, &page_started, 2784 &nr_written); 2785 /* 2786 * delalloc_end is already one less than the total 2787 * length, so we don't subtract one from 2788 * PAGE_CACHE_SIZE 2789 */ 2790 delalloc_to_write += (delalloc_end - delalloc_start + 2791 PAGE_CACHE_SIZE) >> 2792 PAGE_CACHE_SHIFT; 2793 delalloc_start = delalloc_end + 1; 2794 } 2795 if (wbc->nr_to_write < delalloc_to_write) { 2796 int thresh = 8192; 2797 2798 if (delalloc_to_write < thresh * 2) 2799 thresh = delalloc_to_write; 2800 wbc->nr_to_write = min_t(u64, delalloc_to_write, 2801 thresh); 2802 } 2803 2804 /* did the fill delalloc function already unlock and start 2805 * the IO? 
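	 * If it did, the page has already been unlocked and is no longer ours
	 * to touch; just account for what was written and bail out.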
2806 */ 2807 if (page_started) { 2808 ret = 0; 2809 /* 2810 * we've unlocked the page, so we can't update 2811 * the mapping's writeback index, just update 2812 * nr_to_write. 2813 */ 2814 wbc->nr_to_write -= nr_written; 2815 goto done_unlocked; 2816 } 2817 } 2818 if (tree->ops && tree->ops->writepage_start_hook) { 2819 ret = tree->ops->writepage_start_hook(page, start, 2820 page_end); 2821 if (ret == -EAGAIN) { 2822 redirty_page_for_writepage(wbc, page); 2823 update_nr_written(page, wbc, nr_written); 2824 unlock_page(page); 2825 ret = 0; 2826 goto done_unlocked; 2827 } 2828 } 2829 2830 /* 2831 * we don't want to touch the inode after unlocking the page, 2832 * so we update the mapping writeback index now 2833 */ 2834 update_nr_written(page, wbc, nr_written + 1); 2835 2836 end = page_end; 2837 if (last_byte <= start) { 2838 if (tree->ops && tree->ops->writepage_end_io_hook) 2839 tree->ops->writepage_end_io_hook(page, start, 2840 page_end, NULL, 1); 2841 goto done; 2842 } 2843 2844 blocksize = inode->i_sb->s_blocksize; 2845 2846 while (cur <= end) { 2847 if (cur >= last_byte) { 2848 if (tree->ops && tree->ops->writepage_end_io_hook) 2849 tree->ops->writepage_end_io_hook(page, cur, 2850 page_end, NULL, 1); 2851 break; 2852 } 2853 em = epd->get_extent(inode, page, pg_offset, cur, 2854 end - cur + 1, 1); 2855 if (IS_ERR_OR_NULL(em)) { 2856 SetPageError(page); 2857 break; 2858 } 2859 2860 extent_offset = cur - em->start; 2861 BUG_ON(extent_map_end(em) <= cur); 2862 BUG_ON(end < cur); 2863 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2864 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2865 sector = (em->block_start + extent_offset) >> 9; 2866 bdev = em->bdev; 2867 block_start = em->block_start; 2868 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 2869 free_extent_map(em); 2870 em = NULL; 2871 2872 /* 2873 * compressed and inline extents are written through other 2874 * paths in the FS 2875 */ 2876 if (compressed || block_start == EXTENT_MAP_HOLE || 2877 block_start == EXTENT_MAP_INLINE) { 2878 /* 2879 * end_io notification does not happen here for 2880 * compressed extents 2881 */ 2882 if (!compressed && tree->ops && 2883 tree->ops->writepage_end_io_hook) 2884 tree->ops->writepage_end_io_hook(page, cur, 2885 cur + iosize - 1, 2886 NULL, 1); 2887 else if (compressed) { 2888 /* we don't want to end_page_writeback on 2889 * a compressed extent. 
this happens 2890 * elsewhere 2891 */ 2892 nr++; 2893 } 2894 2895 cur += iosize; 2896 pg_offset += iosize; 2897 continue; 2898 } 2899 /* leave this out until we have a page_mkwrite call */ 2900 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 2901 EXTENT_DIRTY, 0, NULL)) { 2902 cur = cur + iosize; 2903 pg_offset += iosize; 2904 continue; 2905 } 2906 2907 if (tree->ops && tree->ops->writepage_io_hook) { 2908 ret = tree->ops->writepage_io_hook(page, cur, 2909 cur + iosize - 1); 2910 } else { 2911 ret = 0; 2912 } 2913 if (ret) { 2914 SetPageError(page); 2915 } else { 2916 unsigned long max_nr = end_index + 1; 2917 2918 set_range_writeback(tree, cur, cur + iosize - 1); 2919 if (!PageWriteback(page)) { 2920 printk(KERN_ERR "btrfs warning page %lu not " 2921 "writeback, cur %llu end %llu\n", 2922 page->index, (unsigned long long)cur, 2923 (unsigned long long)end); 2924 } 2925 2926 ret = submit_extent_page(write_flags, tree, page, 2927 sector, iosize, pg_offset, 2928 bdev, &epd->bio, max_nr, 2929 end_bio_extent_writepage, 2930 0, 0, 0); 2931 if (ret) 2932 SetPageError(page); 2933 } 2934 cur = cur + iosize; 2935 pg_offset += iosize; 2936 nr++; 2937 } 2938 done: 2939 if (nr == 0) { 2940 /* make sure the mapping tag for page dirty gets cleared */ 2941 set_page_writeback(page); 2942 end_page_writeback(page); 2943 } 2944 unlock_page(page); 2945 2946 done_unlocked: 2947 2948 /* drop our reference on any cached states */ 2949 free_extent_state(cached_state); 2950 return 0; 2951 } 2952 2953 /** 2954 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 2955 * @mapping: address space structure to write 2956 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2957 * @writepage: function called for each page 2958 * @data: data passed to writepage function 2959 * 2960 * If a page is already under I/O, write_cache_pages() skips it, even 2961 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 2962 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 2963 * and msync() need to guarantee that all the data which was dirty at the time 2964 * the call was made get new I/O started against them. If wbc->sync_mode is 2965 * WB_SYNC_ALL then we were called for data integrity and we must wait for 2966 * existing IO to complete. 
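 * (Added remark.) The body below is essentially the generic
 * write_cache_pages() walk, adapted so that btrfs can flush its pending bio
 * via flush_fn and take the page lock through its own hook.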
2967 */ 2968 static int extent_write_cache_pages(struct extent_io_tree *tree, 2969 struct address_space *mapping, 2970 struct writeback_control *wbc, 2971 writepage_t writepage, void *data, 2972 void (*flush_fn)(void *)) 2973 { 2974 int ret = 0; 2975 int done = 0; 2976 int nr_to_write_done = 0; 2977 struct pagevec pvec; 2978 int nr_pages; 2979 pgoff_t index; 2980 pgoff_t end; /* Inclusive */ 2981 int scanned = 0; 2982 int tag; 2983 2984 pagevec_init(&pvec, 0); 2985 if (wbc->range_cyclic) { 2986 index = mapping->writeback_index; /* Start from prev offset */ 2987 end = -1; 2988 } else { 2989 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2990 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2991 scanned = 1; 2992 } 2993 if (wbc->sync_mode == WB_SYNC_ALL) 2994 tag = PAGECACHE_TAG_TOWRITE; 2995 else 2996 tag = PAGECACHE_TAG_DIRTY; 2997 retry: 2998 if (wbc->sync_mode == WB_SYNC_ALL) 2999 tag_pages_for_writeback(mapping, index, end); 3000 while (!done && !nr_to_write_done && (index <= end) && 3001 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3002 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3003 unsigned i; 3004 3005 scanned = 1; 3006 for (i = 0; i < nr_pages; i++) { 3007 struct page *page = pvec.pages[i]; 3008 3009 /* 3010 * At this point we hold neither mapping->tree_lock nor 3011 * lock on the page itself: the page may be truncated or 3012 * invalidated (changing page->mapping to NULL), or even 3013 * swizzled back from swapper_space to tmpfs file 3014 * mapping 3015 */ 3016 if (tree->ops && 3017 tree->ops->write_cache_pages_lock_hook) { 3018 tree->ops->write_cache_pages_lock_hook(page, 3019 data, flush_fn); 3020 } else { 3021 if (!trylock_page(page)) { 3022 flush_fn(data); 3023 lock_page(page); 3024 } 3025 } 3026 3027 if (unlikely(page->mapping != mapping)) { 3028 unlock_page(page); 3029 continue; 3030 } 3031 3032 if (!wbc->range_cyclic && page->index > end) { 3033 done = 1; 3034 unlock_page(page); 3035 continue; 3036 } 3037 3038 if (wbc->sync_mode != WB_SYNC_NONE) { 3039 if (PageWriteback(page)) 3040 flush_fn(data); 3041 wait_on_page_writeback(page); 3042 } 3043 3044 if (PageWriteback(page) || 3045 !clear_page_dirty_for_io(page)) { 3046 unlock_page(page); 3047 continue; 3048 } 3049 3050 ret = (*writepage)(page, wbc, data); 3051 3052 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 3053 unlock_page(page); 3054 ret = 0; 3055 } 3056 if (ret) 3057 done = 1; 3058 3059 /* 3060 * the filesystem may choose to bump up nr_to_write. 
3061 * We have to make sure to honor the new nr_to_write 3062 * at any time 3063 */ 3064 nr_to_write_done = wbc->nr_to_write <= 0; 3065 } 3066 pagevec_release(&pvec); 3067 cond_resched(); 3068 } 3069 if (!scanned && !done) { 3070 /* 3071 * We hit the last page and there is more work to be done: wrap 3072 * back to the start of the file 3073 */ 3074 scanned = 1; 3075 index = 0; 3076 goto retry; 3077 } 3078 return ret; 3079 } 3080 3081 static void flush_epd_write_bio(struct extent_page_data *epd) 3082 { 3083 if (epd->bio) { 3084 if (epd->sync_io) 3085 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); 3086 else 3087 submit_one_bio(WRITE, epd->bio, 0, 0); 3088 epd->bio = NULL; 3089 } 3090 } 3091 3092 static noinline void flush_write_bio(void *data) 3093 { 3094 struct extent_page_data *epd = data; 3095 flush_epd_write_bio(epd); 3096 } 3097 3098 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 3099 get_extent_t *get_extent, 3100 struct writeback_control *wbc) 3101 { 3102 int ret; 3103 struct extent_page_data epd = { 3104 .bio = NULL, 3105 .tree = tree, 3106 .get_extent = get_extent, 3107 .extent_locked = 0, 3108 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3109 }; 3110 3111 ret = __extent_writepage(page, wbc, &epd); 3112 3113 flush_epd_write_bio(&epd); 3114 return ret; 3115 } 3116 3117 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 3118 u64 start, u64 end, get_extent_t *get_extent, 3119 int mode) 3120 { 3121 int ret = 0; 3122 struct address_space *mapping = inode->i_mapping; 3123 struct page *page; 3124 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 3125 PAGE_CACHE_SHIFT; 3126 3127 struct extent_page_data epd = { 3128 .bio = NULL, 3129 .tree = tree, 3130 .get_extent = get_extent, 3131 .extent_locked = 1, 3132 .sync_io = mode == WB_SYNC_ALL, 3133 }; 3134 struct writeback_control wbc_writepages = { 3135 .sync_mode = mode, 3136 .nr_to_write = nr_pages * 2, 3137 .range_start = start, 3138 .range_end = end + 1, 3139 }; 3140 3141 while (start <= end) { 3142 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 3143 if (clear_page_dirty_for_io(page)) 3144 ret = __extent_writepage(page, &wbc_writepages, &epd); 3145 else { 3146 if (tree->ops && tree->ops->writepage_end_io_hook) 3147 tree->ops->writepage_end_io_hook(page, start, 3148 start + PAGE_CACHE_SIZE - 1, 3149 NULL, 1); 3150 unlock_page(page); 3151 } 3152 page_cache_release(page); 3153 start += PAGE_CACHE_SIZE; 3154 } 3155 3156 flush_epd_write_bio(&epd); 3157 return ret; 3158 } 3159 3160 int extent_writepages(struct extent_io_tree *tree, 3161 struct address_space *mapping, 3162 get_extent_t *get_extent, 3163 struct writeback_control *wbc) 3164 { 3165 int ret = 0; 3166 struct extent_page_data epd = { 3167 .bio = NULL, 3168 .tree = tree, 3169 .get_extent = get_extent, 3170 .extent_locked = 0, 3171 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3172 }; 3173 3174 ret = extent_write_cache_pages(tree, mapping, wbc, 3175 __extent_writepage, &epd, 3176 flush_write_bio); 3177 flush_epd_write_bio(&epd); 3178 return ret; 3179 } 3180 3181 int extent_readpages(struct extent_io_tree *tree, 3182 struct address_space *mapping, 3183 struct list_head *pages, unsigned nr_pages, 3184 get_extent_t get_extent) 3185 { 3186 struct bio *bio = NULL; 3187 unsigned page_idx; 3188 unsigned long bio_flags = 0; 3189 3190 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 3191 struct page *page = list_entry(pages->prev, struct page, lru); 3192 3193 prefetchw(&page->flags); 3194 list_del(&page->lru); 3195 if 
(!add_to_page_cache_lru(page, mapping, 3196 page->index, GFP_NOFS)) { 3197 __extent_read_full_page(tree, page, get_extent, 3198 &bio, 0, &bio_flags); 3199 } 3200 page_cache_release(page); 3201 } 3202 BUG_ON(!list_empty(pages)); 3203 if (bio) 3204 submit_one_bio(READ, bio, 0, bio_flags); 3205 return 0; 3206 } 3207 3208 /* 3209 * basic invalidatepage code, this waits on any locked or writeback 3210 * ranges corresponding to the page, and then deletes any extent state 3211 * records from the tree 3212 */ 3213 int extent_invalidatepage(struct extent_io_tree *tree, 3214 struct page *page, unsigned long offset) 3215 { 3216 struct extent_state *cached_state = NULL; 3217 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 3218 u64 end = start + PAGE_CACHE_SIZE - 1; 3219 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 3220 3221 start += (offset + blocksize - 1) & ~(blocksize - 1); 3222 if (start > end) 3223 return 0; 3224 3225 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); 3226 wait_on_page_writeback(page); 3227 clear_extent_bit(tree, start, end, 3228 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 3229 EXTENT_DO_ACCOUNTING, 3230 1, 1, &cached_state, GFP_NOFS); 3231 return 0; 3232 } 3233 3234 /* 3235 * a helper for releasepage, this tests for areas of the page that 3236 * are locked or under IO and drops the related state bits if it is safe 3237 * to drop the page. 3238 */ 3239 int try_release_extent_state(struct extent_map_tree *map, 3240 struct extent_io_tree *tree, struct page *page, 3241 gfp_t mask) 3242 { 3243 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 3244 u64 end = start + PAGE_CACHE_SIZE - 1; 3245 int ret = 1; 3246 3247 if (test_range_bit(tree, start, end, 3248 EXTENT_IOBITS, 0, NULL)) 3249 ret = 0; 3250 else { 3251 if ((mask & GFP_NOFS) == GFP_NOFS) 3252 mask = GFP_NOFS; 3253 /* 3254 * at this point we can safely clear everything except the 3255 * locked bit and the nodatasum bit 3256 */ 3257 ret = clear_extent_bit(tree, start, end, 3258 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 3259 0, 0, NULL, mask); 3260 3261 /* if clear_extent_bit failed for enomem reasons, 3262 * we can't allow the release to continue. 3263 */ 3264 if (ret < 0) 3265 ret = 0; 3266 else 3267 ret = 1; 3268 } 3269 return ret; 3270 } 3271 3272 /* 3273 * a helper for releasepage. 
As long as there are no locked extents 3274 * in the range corresponding to the page, both state records and extent 3275 * map records are removed 3276 */ 3277 int try_release_extent_mapping(struct extent_map_tree *map, 3278 struct extent_io_tree *tree, struct page *page, 3279 gfp_t mask) 3280 { 3281 struct extent_map *em; 3282 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 3283 u64 end = start + PAGE_CACHE_SIZE - 1; 3284 3285 if ((mask & __GFP_WAIT) && 3286 page->mapping->host->i_size > 16 * 1024 * 1024) { 3287 u64 len; 3288 while (start <= end) { 3289 len = end - start + 1; 3290 write_lock(&map->lock); 3291 em = lookup_extent_mapping(map, start, len); 3292 if (IS_ERR_OR_NULL(em)) { 3293 write_unlock(&map->lock); 3294 break; 3295 } 3296 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 3297 em->start != start) { 3298 write_unlock(&map->lock); 3299 free_extent_map(em); 3300 break; 3301 } 3302 if (!test_range_bit(tree, em->start, 3303 extent_map_end(em) - 1, 3304 EXTENT_LOCKED | EXTENT_WRITEBACK, 3305 0, NULL)) { 3306 remove_extent_mapping(map, em); 3307 /* once for the rb tree */ 3308 free_extent_map(em); 3309 } 3310 start = extent_map_end(em); 3311 write_unlock(&map->lock); 3312 3313 /* once for us */ 3314 free_extent_map(em); 3315 } 3316 } 3317 return try_release_extent_state(map, tree, page, mask); 3318 } 3319 3320 /* 3321 * helper function for fiemap, which doesn't want to see any holes. 3322 * This maps until we find something past 'last' 3323 */ 3324 static struct extent_map *get_extent_skip_holes(struct inode *inode, 3325 u64 offset, 3326 u64 last, 3327 get_extent_t *get_extent) 3328 { 3329 u64 sectorsize = BTRFS_I(inode)->root->sectorsize; 3330 struct extent_map *em; 3331 u64 len; 3332 3333 if (offset >= last) 3334 return NULL; 3335 3336 while(1) { 3337 len = last - offset; 3338 if (len == 0) 3339 break; 3340 len = (len + sectorsize - 1) & ~(sectorsize - 1); 3341 em = get_extent(inode, NULL, 0, offset, len, 0); 3342 if (IS_ERR_OR_NULL(em)) 3343 return em; 3344 3345 /* if this isn't a hole return it */ 3346 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 3347 em->block_start != EXTENT_MAP_HOLE) { 3348 return em; 3349 } 3350 3351 /* this is a hole, advance to the next extent */ 3352 offset = extent_map_end(em); 3353 free_extent_map(em); 3354 if (offset >= last) 3355 break; 3356 } 3357 return NULL; 3358 } 3359 3360 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 3361 __u64 start, __u64 len, get_extent_t *get_extent) 3362 { 3363 int ret = 0; 3364 u64 off = start; 3365 u64 max = start + len; 3366 u32 flags = 0; 3367 u32 found_type; 3368 u64 last; 3369 u64 last_for_get_extent = 0; 3370 u64 disko = 0; 3371 u64 isize = i_size_read(inode); 3372 struct btrfs_key found_key; 3373 struct extent_map *em = NULL; 3374 struct extent_state *cached_state = NULL; 3375 struct btrfs_path *path; 3376 struct btrfs_file_extent_item *item; 3377 int end = 0; 3378 u64 em_start = 0; 3379 u64 em_len = 0; 3380 u64 em_end = 0; 3381 unsigned long emflags; 3382 3383 if (len == 0) 3384 return -EINVAL; 3385 3386 path = btrfs_alloc_path(); 3387 if (!path) 3388 return -ENOMEM; 3389 path->leave_spinning = 1; 3390 3391 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); 3392 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); 3393 3394 /* 3395 * lookup the last file extent. 
We're not using i_size here 3396 * because there might be preallocation past i_size 3397 */ 3398 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 3399 path, btrfs_ino(inode), -1, 0); 3400 if (ret < 0) { 3401 btrfs_free_path(path); 3402 return ret; 3403 } 3404 WARN_ON(!ret); 3405 path->slots[0]--; 3406 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3407 struct btrfs_file_extent_item); 3408 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 3409 found_type = btrfs_key_type(&found_key); 3410 3411 /* No extents, but there might be delalloc bits */ 3412 if (found_key.objectid != btrfs_ino(inode) || 3413 found_type != BTRFS_EXTENT_DATA_KEY) { 3414 /* have to trust i_size as the end */ 3415 last = (u64)-1; 3416 last_for_get_extent = isize; 3417 } else { 3418 /* 3419 * remember the start of the last extent. There are a 3420 * bunch of different factors that go into the length of the 3421 * extent, so its much less complex to remember where it started 3422 */ 3423 last = found_key.offset; 3424 last_for_get_extent = last + 1; 3425 } 3426 btrfs_free_path(path); 3427 3428 /* 3429 * we might have some extents allocated but more delalloc past those 3430 * extents. so, we trust isize unless the start of the last extent is 3431 * beyond isize 3432 */ 3433 if (last < isize) { 3434 last = (u64)-1; 3435 last_for_get_extent = isize; 3436 } 3437 3438 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3439 &cached_state, GFP_NOFS); 3440 3441 em = get_extent_skip_holes(inode, start, last_for_get_extent, 3442 get_extent); 3443 if (!em) 3444 goto out; 3445 if (IS_ERR(em)) { 3446 ret = PTR_ERR(em); 3447 goto out; 3448 } 3449 3450 while (!end) { 3451 u64 offset_in_extent; 3452 3453 /* break if the extent we found is outside the range */ 3454 if (em->start >= max || extent_map_end(em) < off) 3455 break; 3456 3457 /* 3458 * get_extent may return an extent that starts before our 3459 * requested range. We have to make sure the ranges 3460 * we return to fiemap always move forward and don't 3461 * overlap, so adjust the offsets here 3462 */ 3463 em_start = max(em->start, off); 3464 3465 /* 3466 * record the offset from the start of the extent 3467 * for adjusting the disk offset below 3468 */ 3469 offset_in_extent = em_start - em->start; 3470 em_end = extent_map_end(em); 3471 em_len = em_end - em_start; 3472 emflags = em->flags; 3473 disko = 0; 3474 flags = 0; 3475 3476 /* 3477 * bump off for our next call to get_extent 3478 */ 3479 off = extent_map_end(em); 3480 if (off >= max) 3481 end = 1; 3482 3483 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 3484 end = 1; 3485 flags |= FIEMAP_EXTENT_LAST; 3486 } else if (em->block_start == EXTENT_MAP_INLINE) { 3487 flags |= (FIEMAP_EXTENT_DATA_INLINE | 3488 FIEMAP_EXTENT_NOT_ALIGNED); 3489 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 3490 flags |= (FIEMAP_EXTENT_DELALLOC | 3491 FIEMAP_EXTENT_UNKNOWN); 3492 } else { 3493 disko = em->block_start + offset_in_extent; 3494 } 3495 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3496 flags |= FIEMAP_EXTENT_ENCODED; 3497 3498 free_extent_map(em); 3499 em = NULL; 3500 if ((em_start >= last) || em_len == (u64)-1 || 3501 (last == (u64)-1 && isize <= em_end)) { 3502 flags |= FIEMAP_EXTENT_LAST; 3503 end = 1; 3504 } 3505 3506 /* now scan forward to see if this is really the last extent. 
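		 * If the lookup below comes back empty, it was, and the entry
		 * we are about to emit is flagged FIEMAP_EXTENT_LAST.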
*/ 3507 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3508 get_extent); 3509 if (IS_ERR(em)) { 3510 ret = PTR_ERR(em); 3511 goto out; 3512 } 3513 if (!em) { 3514 flags |= FIEMAP_EXTENT_LAST; 3515 end = 1; 3516 } 3517 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3518 em_len, flags); 3519 if (ret) 3520 goto out_free; 3521 } 3522 out_free: 3523 free_extent_map(em); 3524 out: 3525 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, 3526 &cached_state, GFP_NOFS); 3527 return ret; 3528 } 3529 3530 inline struct page *extent_buffer_page(struct extent_buffer *eb, 3531 unsigned long i) 3532 { 3533 struct page *p; 3534 struct address_space *mapping; 3535 3536 if (i == 0) 3537 return eb->first_page; 3538 i += eb->start >> PAGE_CACHE_SHIFT; 3539 mapping = eb->first_page->mapping; 3540 if (!mapping) 3541 return NULL; 3542 3543 /* 3544 * extent_buffer_page is only called after pinning the page 3545 * by increasing the reference count. So we know the page must 3546 * be in the radix tree. 3547 */ 3548 rcu_read_lock(); 3549 p = radix_tree_lookup(&mapping->page_tree, i); 3550 rcu_read_unlock(); 3551 3552 return p; 3553 } 3554 3555 inline unsigned long num_extent_pages(u64 start, u64 len) 3556 { 3557 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3558 (start >> PAGE_CACHE_SHIFT); 3559 } 3560 3561 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, 3562 u64 start, 3563 unsigned long len, 3564 gfp_t mask) 3565 { 3566 struct extent_buffer *eb = NULL; 3567 #if LEAK_DEBUG 3568 unsigned long flags; 3569 #endif 3570 3571 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 3572 if (eb == NULL) 3573 return NULL; 3574 eb->start = start; 3575 eb->len = len; 3576 rwlock_init(&eb->lock); 3577 atomic_set(&eb->write_locks, 0); 3578 atomic_set(&eb->read_locks, 0); 3579 atomic_set(&eb->blocking_readers, 0); 3580 atomic_set(&eb->blocking_writers, 0); 3581 atomic_set(&eb->spinning_readers, 0); 3582 atomic_set(&eb->spinning_writers, 0); 3583 eb->lock_nested = 0; 3584 init_waitqueue_head(&eb->write_lock_wq); 3585 init_waitqueue_head(&eb->read_lock_wq); 3586 3587 #if LEAK_DEBUG 3588 spin_lock_irqsave(&leak_lock, flags); 3589 list_add(&eb->leak_list, &buffers); 3590 spin_unlock_irqrestore(&leak_lock, flags); 3591 #endif 3592 atomic_set(&eb->refs, 1); 3593 3594 return eb; 3595 } 3596 3597 static void __free_extent_buffer(struct extent_buffer *eb) 3598 { 3599 #if LEAK_DEBUG 3600 unsigned long flags; 3601 spin_lock_irqsave(&leak_lock, flags); 3602 list_del(&eb->leak_list); 3603 spin_unlock_irqrestore(&leak_lock, flags); 3604 #endif 3605 kmem_cache_free(extent_buffer_cache, eb); 3606 } 3607 3608 /* 3609 * Helper for releasing extent buffer page. 3610 */ 3611 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, 3612 unsigned long start_idx) 3613 { 3614 unsigned long index; 3615 struct page *page; 3616 3617 if (!eb->first_page) 3618 return; 3619 3620 index = num_extent_pages(eb->start, eb->len); 3621 if (start_idx >= index) 3622 return; 3623 3624 do { 3625 index--; 3626 page = extent_buffer_page(eb, index); 3627 if (page) 3628 page_cache_release(page); 3629 } while (index != start_idx); 3630 } 3631 3632 /* 3633 * Helper for releasing the extent buffer. 
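 * Drops the page references taken when the buffer's pages were attached and
 * then frees the extent_buffer structure itself.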
3634 */ 3635 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 3636 { 3637 btrfs_release_extent_buffer_page(eb, 0); 3638 __free_extent_buffer(eb); 3639 } 3640 3641 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 3642 u64 start, unsigned long len, 3643 struct page *page0) 3644 { 3645 unsigned long num_pages = num_extent_pages(start, len); 3646 unsigned long i; 3647 unsigned long index = start >> PAGE_CACHE_SHIFT; 3648 struct extent_buffer *eb; 3649 struct extent_buffer *exists = NULL; 3650 struct page *p; 3651 struct address_space *mapping = tree->mapping; 3652 int uptodate = 1; 3653 int ret; 3654 3655 rcu_read_lock(); 3656 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3657 if (eb && atomic_inc_not_zero(&eb->refs)) { 3658 rcu_read_unlock(); 3659 mark_page_accessed(eb->first_page); 3660 return eb; 3661 } 3662 rcu_read_unlock(); 3663 3664 eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); 3665 if (!eb) 3666 return NULL; 3667 3668 if (page0) { 3669 eb->first_page = page0; 3670 i = 1; 3671 index++; 3672 page_cache_get(page0); 3673 mark_page_accessed(page0); 3674 set_page_extent_mapped(page0); 3675 set_page_extent_head(page0, len); 3676 uptodate = PageUptodate(page0); 3677 } else { 3678 i = 0; 3679 } 3680 for (; i < num_pages; i++, index++) { 3681 p = find_or_create_page(mapping, index, GFP_NOFS); 3682 if (!p) { 3683 WARN_ON(1); 3684 goto free_eb; 3685 } 3686 set_page_extent_mapped(p); 3687 mark_page_accessed(p); 3688 if (i == 0) { 3689 eb->first_page = p; 3690 set_page_extent_head(p, len); 3691 } else { 3692 set_page_private(p, EXTENT_PAGE_PRIVATE); 3693 } 3694 if (!PageUptodate(p)) 3695 uptodate = 0; 3696 3697 /* 3698 * see below about how we avoid a nasty race with release page 3699 * and why we unlock later 3700 */ 3701 if (i != 0) 3702 unlock_page(p); 3703 } 3704 if (uptodate) 3705 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3706 3707 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 3708 if (ret) 3709 goto free_eb; 3710 3711 spin_lock(&tree->buffer_lock); 3712 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); 3713 if (ret == -EEXIST) { 3714 exists = radix_tree_lookup(&tree->buffer, 3715 start >> PAGE_CACHE_SHIFT); 3716 /* add one reference for the caller */ 3717 atomic_inc(&exists->refs); 3718 spin_unlock(&tree->buffer_lock); 3719 radix_tree_preload_end(); 3720 goto free_eb; 3721 } 3722 /* add one reference for the tree */ 3723 atomic_inc(&eb->refs); 3724 spin_unlock(&tree->buffer_lock); 3725 radix_tree_preload_end(); 3726 3727 /* 3728 * there is a race where release page may have 3729 * tried to find this extent buffer in the radix 3730 * but failed. It will tell the VM it is safe to 3731 * reclaim the, and it will clear the page private bit. 
3732 * We must make sure to set the page private bit properly 3733 * after the extent buffer is in the radix tree so 3734 * it doesn't get lost 3735 */ 3736 set_page_extent_mapped(eb->first_page); 3737 set_page_extent_head(eb->first_page, eb->len); 3738 if (!page0) 3739 unlock_page(eb->first_page); 3740 return eb; 3741 3742 free_eb: 3743 if (eb->first_page && !page0) 3744 unlock_page(eb->first_page); 3745 3746 if (!atomic_dec_and_test(&eb->refs)) 3747 return exists; 3748 btrfs_release_extent_buffer(eb); 3749 return exists; 3750 } 3751 3752 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 3753 u64 start, unsigned long len) 3754 { 3755 struct extent_buffer *eb; 3756 3757 rcu_read_lock(); 3758 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3759 if (eb && atomic_inc_not_zero(&eb->refs)) { 3760 rcu_read_unlock(); 3761 mark_page_accessed(eb->first_page); 3762 return eb; 3763 } 3764 rcu_read_unlock(); 3765 3766 return NULL; 3767 } 3768 3769 void free_extent_buffer(struct extent_buffer *eb) 3770 { 3771 if (!eb) 3772 return; 3773 3774 if (!atomic_dec_and_test(&eb->refs)) 3775 return; 3776 3777 WARN_ON(1); 3778 } 3779 3780 int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3781 struct extent_buffer *eb) 3782 { 3783 unsigned long i; 3784 unsigned long num_pages; 3785 struct page *page; 3786 3787 num_pages = num_extent_pages(eb->start, eb->len); 3788 3789 for (i = 0; i < num_pages; i++) { 3790 page = extent_buffer_page(eb, i); 3791 if (!PageDirty(page)) 3792 continue; 3793 3794 lock_page(page); 3795 WARN_ON(!PagePrivate(page)); 3796 3797 set_page_extent_mapped(page); 3798 if (i == 0) 3799 set_page_extent_head(page, eb->len); 3800 3801 clear_page_dirty_for_io(page); 3802 spin_lock_irq(&page->mapping->tree_lock); 3803 if (!PageDirty(page)) { 3804 radix_tree_tag_clear(&page->mapping->page_tree, 3805 page_index(page), 3806 PAGECACHE_TAG_DIRTY); 3807 } 3808 spin_unlock_irq(&page->mapping->tree_lock); 3809 ClearPageError(page); 3810 unlock_page(page); 3811 } 3812 return 0; 3813 } 3814 3815 int set_extent_buffer_dirty(struct extent_io_tree *tree, 3816 struct extent_buffer *eb) 3817 { 3818 unsigned long i; 3819 unsigned long num_pages; 3820 int was_dirty = 0; 3821 3822 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3823 num_pages = num_extent_pages(eb->start, eb->len); 3824 for (i = 0; i < num_pages; i++) 3825 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3826 return was_dirty; 3827 } 3828 3829 static int __eb_straddles_pages(u64 start, u64 len) 3830 { 3831 if (len < PAGE_CACHE_SIZE) 3832 return 1; 3833 if (start & (PAGE_CACHE_SIZE - 1)) 3834 return 1; 3835 if ((start + len) & (PAGE_CACHE_SIZE - 1)) 3836 return 1; 3837 return 0; 3838 } 3839 3840 static int eb_straddles_pages(struct extent_buffer *eb) 3841 { 3842 return __eb_straddles_pages(eb->start, eb->len); 3843 } 3844 3845 int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3846 struct extent_buffer *eb, 3847 struct extent_state **cached_state) 3848 { 3849 unsigned long i; 3850 struct page *page; 3851 unsigned long num_pages; 3852 3853 num_pages = num_extent_pages(eb->start, eb->len); 3854 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3855 3856 if (eb_straddles_pages(eb)) { 3857 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3858 cached_state, GFP_NOFS); 3859 } 3860 for (i = 0; i < num_pages; i++) { 3861 page = extent_buffer_page(eb, i); 3862 if (page) 3863 ClearPageUptodate(page); 3864 } 3865 return 0; 3866 } 3867 3868 int 
set_extent_buffer_uptodate(struct extent_io_tree *tree, 3869 struct extent_buffer *eb) 3870 { 3871 unsigned long i; 3872 struct page *page; 3873 unsigned long num_pages; 3874 3875 num_pages = num_extent_pages(eb->start, eb->len); 3876 3877 if (eb_straddles_pages(eb)) { 3878 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3879 NULL, GFP_NOFS); 3880 } 3881 for (i = 0; i < num_pages; i++) { 3882 page = extent_buffer_page(eb, i); 3883 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3884 ((i == num_pages - 1) && 3885 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { 3886 check_page_uptodate(tree, page); 3887 continue; 3888 } 3889 SetPageUptodate(page); 3890 } 3891 return 0; 3892 } 3893 3894 int extent_range_uptodate(struct extent_io_tree *tree, 3895 u64 start, u64 end) 3896 { 3897 struct page *page; 3898 int ret; 3899 int pg_uptodate = 1; 3900 int uptodate; 3901 unsigned long index; 3902 3903 if (__eb_straddles_pages(start, end - start + 1)) { 3904 ret = test_range_bit(tree, start, end, 3905 EXTENT_UPTODATE, 1, NULL); 3906 if (ret) 3907 return 1; 3908 } 3909 while (start <= end) { 3910 index = start >> PAGE_CACHE_SHIFT; 3911 page = find_get_page(tree->mapping, index); 3912 uptodate = PageUptodate(page); 3913 page_cache_release(page); 3914 if (!uptodate) { 3915 pg_uptodate = 0; 3916 break; 3917 } 3918 start += PAGE_CACHE_SIZE; 3919 } 3920 return pg_uptodate; 3921 } 3922 3923 int extent_buffer_uptodate(struct extent_io_tree *tree, 3924 struct extent_buffer *eb, 3925 struct extent_state *cached_state) 3926 { 3927 int ret = 0; 3928 unsigned long num_pages; 3929 unsigned long i; 3930 struct page *page; 3931 int pg_uptodate = 1; 3932 3933 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3934 return 1; 3935 3936 if (eb_straddles_pages(eb)) { 3937 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3938 EXTENT_UPTODATE, 1, cached_state); 3939 if (ret) 3940 return ret; 3941 } 3942 3943 num_pages = num_extent_pages(eb->start, eb->len); 3944 for (i = 0; i < num_pages; i++) { 3945 page = extent_buffer_page(eb, i); 3946 if (!PageUptodate(page)) { 3947 pg_uptodate = 0; 3948 break; 3949 } 3950 } 3951 return pg_uptodate; 3952 } 3953 3954 int read_extent_buffer_pages(struct extent_io_tree *tree, 3955 struct extent_buffer *eb, u64 start, int wait, 3956 get_extent_t *get_extent, int mirror_num) 3957 { 3958 unsigned long i; 3959 unsigned long start_i; 3960 struct page *page; 3961 int err; 3962 int ret = 0; 3963 int locked_pages = 0; 3964 int all_uptodate = 1; 3965 int inc_all_pages = 0; 3966 unsigned long num_pages; 3967 struct bio *bio = NULL; 3968 unsigned long bio_flags = 0; 3969 3970 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3971 return 0; 3972 3973 if (eb_straddles_pages(eb)) { 3974 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3975 EXTENT_UPTODATE, 1, NULL)) { 3976 return 0; 3977 } 3978 } 3979 3980 if (start) { 3981 WARN_ON(start < eb->start); 3982 start_i = (start >> PAGE_CACHE_SHIFT) - 3983 (eb->start >> PAGE_CACHE_SHIFT); 3984 } else { 3985 start_i = 0; 3986 } 3987 3988 num_pages = num_extent_pages(eb->start, eb->len); 3989 for (i = start_i; i < num_pages; i++) { 3990 page = extent_buffer_page(eb, i); 3991 if (wait == WAIT_NONE) { 3992 if (!trylock_page(page)) 3993 goto unlock_exit; 3994 } else { 3995 lock_page(page); 3996 } 3997 locked_pages++; 3998 if (!PageUptodate(page)) 3999 all_uptodate = 0; 4000 } 4001 if (all_uptodate) { 4002 if (start_i == 0) 4003 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4004 goto unlock_exit; 4005 } 4006 4007 for 
(i = start_i; i < num_pages; i++) { 4008 page = extent_buffer_page(eb, i); 4009 4010 WARN_ON(!PagePrivate(page)); 4011 4012 set_page_extent_mapped(page); 4013 if (i == 0) 4014 set_page_extent_head(page, eb->len); 4015 4016 if (inc_all_pages) 4017 page_cache_get(page); 4018 if (!PageUptodate(page)) { 4019 if (start_i == 0) 4020 inc_all_pages = 1; 4021 ClearPageError(page); 4022 err = __extent_read_full_page(tree, page, 4023 get_extent, &bio, 4024 mirror_num, &bio_flags); 4025 if (err) 4026 ret = err; 4027 } else { 4028 unlock_page(page); 4029 } 4030 } 4031 4032 if (bio) 4033 submit_one_bio(READ, bio, mirror_num, bio_flags); 4034 4035 if (ret || wait != WAIT_COMPLETE) 4036 return ret; 4037 4038 for (i = start_i; i < num_pages; i++) { 4039 page = extent_buffer_page(eb, i); 4040 wait_on_page_locked(page); 4041 if (!PageUptodate(page)) 4042 ret = -EIO; 4043 } 4044 4045 if (!ret) 4046 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4047 return ret; 4048 4049 unlock_exit: 4050 i = start_i; 4051 while (locked_pages > 0) { 4052 page = extent_buffer_page(eb, i); 4053 i++; 4054 unlock_page(page); 4055 locked_pages--; 4056 } 4057 return ret; 4058 } 4059 4060 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 4061 unsigned long start, 4062 unsigned long len) 4063 { 4064 size_t cur; 4065 size_t offset; 4066 struct page *page; 4067 char *kaddr; 4068 char *dst = (char *)dstv; 4069 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 4070 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 4071 4072 WARN_ON(start > eb->len); 4073 WARN_ON(start + len > eb->start + eb->len); 4074 4075 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 4076 4077 while (len > 0) { 4078 page = extent_buffer_page(eb, i); 4079 4080 cur = min(len, (PAGE_CACHE_SIZE - offset)); 4081 kaddr = page_address(page); 4082 memcpy(dst, kaddr + offset, cur); 4083 4084 dst += cur; 4085 len -= cur; 4086 offset = 0; 4087 i++; 4088 } 4089 } 4090 4091 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 4092 unsigned long min_len, char **map, 4093 unsigned long *map_start, 4094 unsigned long *map_len) 4095 { 4096 size_t offset = start & (PAGE_CACHE_SIZE - 1); 4097 char *kaddr; 4098 struct page *p; 4099 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 4100 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 4101 unsigned long end_i = (start_offset + start + min_len - 1) >> 4102 PAGE_CACHE_SHIFT; 4103 4104 if (i != end_i) 4105 return -EINVAL; 4106 4107 if (i == 0) { 4108 offset = start_offset; 4109 *map_start = 0; 4110 } else { 4111 offset = 0; 4112 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; 4113 } 4114 4115 if (start + min_len > eb->len) { 4116 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 4117 "wanted %lu %lu\n", (unsigned long long)eb->start, 4118 eb->len, start, min_len); 4119 WARN_ON(1); 4120 return -EINVAL; 4121 } 4122 4123 p = extent_buffer_page(eb, i); 4124 kaddr = page_address(p); 4125 *map = kaddr + offset; 4126 *map_len = PAGE_CACHE_SIZE - offset; 4127 return 0; 4128 } 4129 4130 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 4131 unsigned long start, 4132 unsigned long len) 4133 { 4134 size_t cur; 4135 size_t offset; 4136 struct page *page; 4137 char *kaddr; 4138 char *ptr = (char *)ptrv; 4139 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 4140 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 4141 int ret = 0; 4142 4143 WARN_ON(start > eb->len); 4144 WARN_ON(start + 
int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
			  unsigned long start,
			  unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *ptr = (char *)ptrv;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
	int ret = 0;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);

		cur = min(len, (PAGE_CACHE_SIZE - offset));

		kaddr = page_address(page);
		ret = memcmp(ptr, kaddr + offset, cur);
		if (ret)
			break;

		ptr += cur;
		len -= cur;
		offset = 0;
		i++;
	}
	return ret;
}

void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
			 unsigned long start, unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *src = (char *)srcv;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);
		WARN_ON(!PageUptodate(page));

		cur = min(len, PAGE_CACHE_SIZE - offset);
		kaddr = page_address(page);
		memcpy(kaddr + offset, src, cur);

		src += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

void memset_extent_buffer(struct extent_buffer *eb, char c,
			  unsigned long start, unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);
		WARN_ON(!PageUptodate(page));

		cur = min(len, PAGE_CACHE_SIZE - offset);
		kaddr = page_address(page);
		memset(kaddr + offset, c, cur);

		len -= cur;
		offset = 0;
		i++;
	}
}

void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
			unsigned long dst_offset, unsigned long src_offset,
			unsigned long len)
{
	u64 dst_len = dst->len;
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;

	WARN_ON(src->len != dst_len);

	offset = (start_offset + dst_offset) &
		((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(dst, i);
		WARN_ON(!PageUptodate(page));

		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));

		kaddr = page_address(page);
		read_extent_buffer(src, kaddr + offset, src_offset, cur);

		src_offset += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

static void move_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *dst_kaddr = page_address(dst_page);
	if (dst_page == src_page) {
		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
	} else {
		char *src_kaddr = page_address(src_page);
		char *p = dst_kaddr + dst_off + len;
		char *s = src_kaddr + src_off + len;

		while (len--)
			*--p = *--s;
	}
}
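/*
 * Helpers for moving bytes between (possibly identical) pages of an extent
 * buffer: areas_overlap() reports whether the ranges [src, src + len) and
 * [dst, dst + len) intersect, copy_pages() is the fast path for
 * non-overlapping ranges, and move_pages() above copies back to front so
 * that overlapping ranges within the same page are handled safely.
 */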
static inline bool areas_overlap(unsigned long src, unsigned long dst,
				 unsigned long len)
{
	unsigned long distance = (src > dst) ? src - dst : dst - src;
	return distance < len;
}

static void copy_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *dst_kaddr = page_address(dst_page);
	char *src_kaddr;

	if (dst_page != src_page) {
		src_kaddr = page_address(src_page);
	} else {
		src_kaddr = dst_kaddr;
		BUG_ON(areas_overlap(src_off, dst_off, len));
	}

	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}

void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			  unsigned long src_offset, unsigned long len)
{
	size_t cur;
	size_t dst_off_in_page;
	size_t src_off_in_page;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long dst_i;
	unsigned long src_i;

	if (src_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memcpy bogus src_offset %lu copy "
		       "len %lu dst len %lu\n", src_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memcpy bogus dst_offset %lu copy "
		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
		BUG_ON(1);
	}

	while (len > 0) {
		dst_off_in_page = (start_offset + dst_offset) &
			((unsigned long)PAGE_CACHE_SIZE - 1);
		src_off_in_page = (start_offset + src_offset) &
			((unsigned long)PAGE_CACHE_SIZE - 1);

		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;

		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
					       src_off_in_page));
		cur = min_t(unsigned long, cur,
			    (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));

		copy_pages(extent_buffer_page(dst, dst_i),
			   extent_buffer_page(dst, src_i),
			   dst_off_in_page, src_off_in_page, cur);

		src_offset += cur;
		dst_offset += cur;
		len -= cur;
	}
}
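/*
 * memmove_extent_buffer() is the overlap-safe variant of
 * memcpy_extent_buffer(): when the source and destination ranges overlap it
 * walks the region from the end back to the start, using move_pages() so
 * that bytes are never read after they have been overwritten.
 * Non-overlapping requests are simply forwarded to memcpy_extent_buffer().
 */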
void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			   unsigned long src_offset, unsigned long len)
{
	size_t cur;
	size_t dst_off_in_page;
	size_t src_off_in_page;
	unsigned long dst_end = dst_offset + len - 1;
	unsigned long src_end = src_offset + len - 1;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long dst_i;
	unsigned long src_i;

	if (src_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
		       "len %lu dst len %lu\n", src_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
		BUG_ON(1);
	}
	if (!areas_overlap(src_offset, dst_offset, len)) {
		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
		return;
	}
	while (len > 0) {
		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;

		dst_off_in_page = (start_offset + dst_end) &
			((unsigned long)PAGE_CACHE_SIZE - 1);
		src_off_in_page = (start_offset + src_end) &
			((unsigned long)PAGE_CACHE_SIZE - 1);

		cur = min_t(unsigned long, len, src_off_in_page + 1);
		cur = min(cur, dst_off_in_page + 1);
		move_pages(extent_buffer_page(dst, dst_i),
			   extent_buffer_page(dst, src_i),
			   dst_off_in_page - cur + 1,
			   src_off_in_page - cur + 1, cur);

		dst_end -= cur;
		src_end -= cur;
		len -= cur;
	}
}

static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
	struct extent_buffer *eb =
			container_of(head, struct extent_buffer, rcu_head);

	btrfs_release_extent_buffer(eb);
}

int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
{
	u64 start = page_offset(page);
	struct extent_buffer *eb;
	int ret = 1;

	spin_lock(&tree->buffer_lock);
	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
	if (!eb) {
		spin_unlock(&tree->buffer_lock);
		return ret;
	}

	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
		ret = 0;
		goto out;
	}

	/*
	 * Set @eb->refs to 0 if it is currently 1; otherwise another holder
	 * still has a reference and the buffer must be left alone.
	 */
	if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
		ret = 0;
		goto out;
	}

	radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
out:
	spin_unlock(&tree->buffer_lock);

	/* at this point we can safely release the extent buffer */
	if (atomic_read(&eb->refs) == 0)
		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
	return ret;
}
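/*
 * Illustrative sketch only -- a hypothetical helper, not part of the file.
 * It shows how the data-movement helpers above compose: an overlap-safe
 * memmove_extent_buffer() opens a gap by shifting the tail of the buffer,
 * then write_extent_buffer() fills the gap.  This is broadly the pattern
 * used when making room inside a btrfs leaf.  @used is the number of valid
 * bytes currently in @eb; the caller must ensure that pos <= used and
 * used + len <= eb->len.
 */
static inline void example_insert_bytes(struct extent_buffer *eb,
					unsigned long pos, unsigned long used,
					const void *src, unsigned long len)
{
	/* shift [pos, used) up by len bytes; the ranges overlap, so memmove */
	memmove_extent_buffer(eb, pos + len, pos, used - pos);
	/* copy the new bytes into the gap that was just opened */
	write_extent_buffer(eb, src, pos, len);
}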