#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "transaction.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set *btrfs_bioset;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);

static DEFINE_SPINLOCK(leak_lock);

static inline
void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_del(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_check(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       atomic_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		pr_err("BTRFS: buffer leak start %llu len %lu refs %d\n",
		       eb->start, eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode;
	u64 isize;

	if (!tree->mapping)
		return;

	inode = tree->mapping->host;
	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
			"%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(inode), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(new, head)	do {} while (0)
#define btrfs_leak_debug_del(entry)	do {} while (0)
#define btrfs_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

#define BUFFER_LRU_MAX 64

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	get_extent_t *get_extent;
	unsigned long bio_flags;

	/* tells writepage not to lock the state bits for this range
* it still does the unlocking 127 */ 128 unsigned int extent_locked:1; 129 130 /* tells the submit_bio code to use REQ_SYNC */ 131 unsigned int sync_io:1; 132 }; 133 134 static void add_extent_changeset(struct extent_state *state, unsigned bits, 135 struct extent_changeset *changeset, 136 int set) 137 { 138 int ret; 139 140 if (!changeset) 141 return; 142 if (set && (state->state & bits) == bits) 143 return; 144 if (!set && (state->state & bits) == 0) 145 return; 146 changeset->bytes_changed += state->end - state->start + 1; 147 ret = ulist_add(changeset->range_changed, state->start, state->end, 148 GFP_ATOMIC); 149 /* ENOMEM */ 150 BUG_ON(ret < 0); 151 } 152 153 static noinline void flush_write_bio(void *data); 154 static inline struct btrfs_fs_info * 155 tree_fs_info(struct extent_io_tree *tree) 156 { 157 if (!tree->mapping) 158 return NULL; 159 return btrfs_sb(tree->mapping->host->i_sb); 160 } 161 162 int __init extent_io_init(void) 163 { 164 extent_state_cache = kmem_cache_create("btrfs_extent_state", 165 sizeof(struct extent_state), 0, 166 SLAB_MEM_SPREAD, NULL); 167 if (!extent_state_cache) 168 return -ENOMEM; 169 170 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 171 sizeof(struct extent_buffer), 0, 172 SLAB_MEM_SPREAD, NULL); 173 if (!extent_buffer_cache) 174 goto free_state_cache; 175 176 btrfs_bioset = bioset_create(BIO_POOL_SIZE, 177 offsetof(struct btrfs_io_bio, bio)); 178 if (!btrfs_bioset) 179 goto free_buffer_cache; 180 181 if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE)) 182 goto free_bioset; 183 184 return 0; 185 186 free_bioset: 187 bioset_free(btrfs_bioset); 188 btrfs_bioset = NULL; 189 190 free_buffer_cache: 191 kmem_cache_destroy(extent_buffer_cache); 192 extent_buffer_cache = NULL; 193 194 free_state_cache: 195 kmem_cache_destroy(extent_state_cache); 196 extent_state_cache = NULL; 197 return -ENOMEM; 198 } 199 200 void extent_io_exit(void) 201 { 202 btrfs_leak_debug_check(); 203 204 /* 205 * Make sure all delayed rcu free are flushed before we 206 * destroy caches. 
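 * rcu_barrier() waits for every pending call_rcu() callback to finish,
 * so a free that was deferred through RCU can no longer touch objects
 * from these caches once kmem_cache_destroy() runs below.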
207 */ 208 rcu_barrier(); 209 kmem_cache_destroy(extent_state_cache); 210 kmem_cache_destroy(extent_buffer_cache); 211 if (btrfs_bioset) 212 bioset_free(btrfs_bioset); 213 } 214 215 void extent_io_tree_init(struct extent_io_tree *tree, 216 struct address_space *mapping) 217 { 218 tree->state = RB_ROOT; 219 tree->ops = NULL; 220 tree->dirty_bytes = 0; 221 spin_lock_init(&tree->lock); 222 tree->mapping = mapping; 223 } 224 225 static struct extent_state *alloc_extent_state(gfp_t mask) 226 { 227 struct extent_state *state; 228 229 state = kmem_cache_alloc(extent_state_cache, mask); 230 if (!state) 231 return state; 232 state->state = 0; 233 state->failrec = NULL; 234 RB_CLEAR_NODE(&state->rb_node); 235 btrfs_leak_debug_add(&state->leak_list, &states); 236 atomic_set(&state->refs, 1); 237 init_waitqueue_head(&state->wq); 238 trace_alloc_extent_state(state, mask, _RET_IP_); 239 return state; 240 } 241 242 void free_extent_state(struct extent_state *state) 243 { 244 if (!state) 245 return; 246 if (atomic_dec_and_test(&state->refs)) { 247 WARN_ON(extent_state_in_tree(state)); 248 btrfs_leak_debug_del(&state->leak_list); 249 trace_free_extent_state(state, _RET_IP_); 250 kmem_cache_free(extent_state_cache, state); 251 } 252 } 253 254 static struct rb_node *tree_insert(struct rb_root *root, 255 struct rb_node *search_start, 256 u64 offset, 257 struct rb_node *node, 258 struct rb_node ***p_in, 259 struct rb_node **parent_in) 260 { 261 struct rb_node **p; 262 struct rb_node *parent = NULL; 263 struct tree_entry *entry; 264 265 if (p_in && parent_in) { 266 p = *p_in; 267 parent = *parent_in; 268 goto do_insert; 269 } 270 271 p = search_start ? &search_start : &root->rb_node; 272 while (*p) { 273 parent = *p; 274 entry = rb_entry(parent, struct tree_entry, rb_node); 275 276 if (offset < entry->start) 277 p = &(*p)->rb_left; 278 else if (offset > entry->end) 279 p = &(*p)->rb_right; 280 else 281 return parent; 282 } 283 284 do_insert: 285 rb_link_node(node, parent, p); 286 rb_insert_color(node, root); 287 return NULL; 288 } 289 290 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 291 struct rb_node **prev_ret, 292 struct rb_node **next_ret, 293 struct rb_node ***p_ret, 294 struct rb_node **parent_ret) 295 { 296 struct rb_root *root = &tree->state; 297 struct rb_node **n = &root->rb_node; 298 struct rb_node *prev = NULL; 299 struct rb_node *orig_prev = NULL; 300 struct tree_entry *entry; 301 struct tree_entry *prev_entry = NULL; 302 303 while (*n) { 304 prev = *n; 305 entry = rb_entry(prev, struct tree_entry, rb_node); 306 prev_entry = entry; 307 308 if (offset < entry->start) 309 n = &(*n)->rb_left; 310 else if (offset > entry->end) 311 n = &(*n)->rb_right; 312 else 313 return *n; 314 } 315 316 if (p_ret) 317 *p_ret = n; 318 if (parent_ret) 319 *parent_ret = prev; 320 321 if (prev_ret) { 322 orig_prev = prev; 323 while (prev && offset > prev_entry->end) { 324 prev = rb_next(prev); 325 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 326 } 327 *prev_ret = prev; 328 prev = orig_prev; 329 } 330 331 if (next_ret) { 332 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 333 while (prev && offset < prev_entry->start) { 334 prev = rb_prev(prev); 335 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 336 } 337 *next_ret = prev; 338 } 339 return NULL; 340 } 341 342 static inline struct rb_node * 343 tree_search_for_insert(struct extent_io_tree *tree, 344 u64 offset, 345 struct rb_node ***p_ret, 346 struct rb_node **parent_ret) 347 { 348 struct rb_node *prev = 
NULL; 349 struct rb_node *ret; 350 351 ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); 352 if (!ret) 353 return prev; 354 return ret; 355 } 356 357 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 358 u64 offset) 359 { 360 return tree_search_for_insert(tree, offset, NULL, NULL); 361 } 362 363 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 364 struct extent_state *other) 365 { 366 if (tree->ops && tree->ops->merge_extent_hook) 367 tree->ops->merge_extent_hook(tree->mapping->host, new, 368 other); 369 } 370 371 /* 372 * utility function to look for merge candidates inside a given range. 373 * Any extents with matching state are merged together into a single 374 * extent in the tree. Extents with EXTENT_IO in their state field 375 * are not merged because the end_io handlers need to be able to do 376 * operations on them without sleeping (or doing allocations/splits). 377 * 378 * This should be called with the tree lock held. 379 */ 380 static void merge_state(struct extent_io_tree *tree, 381 struct extent_state *state) 382 { 383 struct extent_state *other; 384 struct rb_node *other_node; 385 386 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 387 return; 388 389 other_node = rb_prev(&state->rb_node); 390 if (other_node) { 391 other = rb_entry(other_node, struct extent_state, rb_node); 392 if (other->end == state->start - 1 && 393 other->state == state->state) { 394 merge_cb(tree, state, other); 395 state->start = other->start; 396 rb_erase(&other->rb_node, &tree->state); 397 RB_CLEAR_NODE(&other->rb_node); 398 free_extent_state(other); 399 } 400 } 401 other_node = rb_next(&state->rb_node); 402 if (other_node) { 403 other = rb_entry(other_node, struct extent_state, rb_node); 404 if (other->start == state->end + 1 && 405 other->state == state->state) { 406 merge_cb(tree, state, other); 407 state->end = other->end; 408 rb_erase(&other->rb_node, &tree->state); 409 RB_CLEAR_NODE(&other->rb_node); 410 free_extent_state(other); 411 } 412 } 413 } 414 415 static void set_state_cb(struct extent_io_tree *tree, 416 struct extent_state *state, unsigned *bits) 417 { 418 if (tree->ops && tree->ops->set_bit_hook) 419 tree->ops->set_bit_hook(tree->mapping->host, state, bits); 420 } 421 422 static void clear_state_cb(struct extent_io_tree *tree, 423 struct extent_state *state, unsigned *bits) 424 { 425 if (tree->ops && tree->ops->clear_bit_hook) 426 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 427 } 428 429 static void set_state_bits(struct extent_io_tree *tree, 430 struct extent_state *state, unsigned *bits, 431 struct extent_changeset *changeset); 432 433 /* 434 * insert an extent_state struct into the tree. 'bits' are set on the 435 * struct before it is inserted. 436 * 437 * This may return -EEXIST if the extent is already there, in which case the 438 * state struct is freed. 439 * 440 * The tree lock is not taken internally. This is a utility function and 441 * probably isn't what you want to call (see set/clear_extent_bit). 
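 *
 * Illustrative only: external callers normally go through the wrappers
 * further down in this file instead, e.g.
 *
 *   set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, &cached,
 *                  GFP_NOFS);
 *   clear_extent_bit(tree, start, end, EXTENT_DIRTY, 0, 0, &cached,
 *                    GFP_NOFS);
 *
 * which handle the locking, splitting and merging for the caller.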
442 */ 443 static int insert_state(struct extent_io_tree *tree, 444 struct extent_state *state, u64 start, u64 end, 445 struct rb_node ***p, 446 struct rb_node **parent, 447 unsigned *bits, struct extent_changeset *changeset) 448 { 449 struct rb_node *node; 450 451 if (end < start) 452 WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", 453 end, start); 454 state->start = start; 455 state->end = end; 456 457 set_state_bits(tree, state, bits, changeset); 458 459 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); 460 if (node) { 461 struct extent_state *found; 462 found = rb_entry(node, struct extent_state, rb_node); 463 pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n", 464 found->start, found->end, start, end); 465 return -EEXIST; 466 } 467 merge_state(tree, state); 468 return 0; 469 } 470 471 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, 472 u64 split) 473 { 474 if (tree->ops && tree->ops->split_extent_hook) 475 tree->ops->split_extent_hook(tree->mapping->host, orig, split); 476 } 477 478 /* 479 * split a given extent state struct in two, inserting the preallocated 480 * struct 'prealloc' as the newly created second half. 'split' indicates an 481 * offset inside 'orig' where it should be split. 482 * 483 * Before calling, 484 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 485 * are two extent state structs in the tree: 486 * prealloc: [orig->start, split - 1] 487 * orig: [ split, orig->end ] 488 * 489 * The tree locks are not taken by this function. They need to be held 490 * by the caller. 491 */ 492 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 493 struct extent_state *prealloc, u64 split) 494 { 495 struct rb_node *node; 496 497 split_cb(tree, orig, split); 498 499 prealloc->start = orig->start; 500 prealloc->end = split - 1; 501 prealloc->state = orig->state; 502 orig->start = split; 503 504 node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, 505 &prealloc->rb_node, NULL, NULL); 506 if (node) { 507 free_extent_state(prealloc); 508 return -EEXIST; 509 } 510 return 0; 511 } 512 513 static struct extent_state *next_state(struct extent_state *state) 514 { 515 struct rb_node *next = rb_next(&state->rb_node); 516 if (next) 517 return rb_entry(next, struct extent_state, rb_node); 518 else 519 return NULL; 520 } 521 522 /* 523 * utility function to clear some bits in an extent state struct. 524 * it will optionally wake up any one waiting on this state (wake == 1). 
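 * For example, try_lock_extent() below clears EXTENT_LOCKED with
 * wake == 1 so that anyone sleeping in wait_extent_bit() on that range
 * is woken up.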
525 * 526 * If no bits are set on the state struct after clearing things, the 527 * struct is freed and removed from the tree 528 */ 529 static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 530 struct extent_state *state, 531 unsigned *bits, int wake, 532 struct extent_changeset *changeset) 533 { 534 struct extent_state *next; 535 unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; 536 537 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 538 u64 range = state->end - state->start + 1; 539 WARN_ON(range > tree->dirty_bytes); 540 tree->dirty_bytes -= range; 541 } 542 clear_state_cb(tree, state, bits); 543 add_extent_changeset(state, bits_to_clear, changeset, 0); 544 state->state &= ~bits_to_clear; 545 if (wake) 546 wake_up(&state->wq); 547 if (state->state == 0) { 548 next = next_state(state); 549 if (extent_state_in_tree(state)) { 550 rb_erase(&state->rb_node, &tree->state); 551 RB_CLEAR_NODE(&state->rb_node); 552 free_extent_state(state); 553 } else { 554 WARN_ON(1); 555 } 556 } else { 557 merge_state(tree, state); 558 next = next_state(state); 559 } 560 return next; 561 } 562 563 static struct extent_state * 564 alloc_extent_state_atomic(struct extent_state *prealloc) 565 { 566 if (!prealloc) 567 prealloc = alloc_extent_state(GFP_ATOMIC); 568 569 return prealloc; 570 } 571 572 static void extent_io_tree_panic(struct extent_io_tree *tree, int err) 573 { 574 btrfs_panic(tree_fs_info(tree), err, 575 "Locking error: Extent tree was modified by another thread while locked."); 576 } 577 578 /* 579 * clear some bits on a range in the tree. This may require splitting 580 * or inserting elements in the tree, so the gfp mask is used to 581 * indicate which allocations or sleeping are allowed. 582 * 583 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 584 * the given range from the tree regardless of state (ie for truncate). 585 * 586 * the range [start, end] is inclusive. 587 * 588 * This takes the tree lock, and returns 0 on success and < 0 on error. 589 */ 590 static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 591 unsigned bits, int wake, int delete, 592 struct extent_state **cached_state, 593 gfp_t mask, struct extent_changeset *changeset) 594 { 595 struct extent_state *state; 596 struct extent_state *cached; 597 struct extent_state *prealloc = NULL; 598 struct rb_node *node; 599 u64 last_end; 600 int err; 601 int clear = 0; 602 603 btrfs_debug_check_extent_io_range(tree, start, end); 604 605 if (bits & EXTENT_DELALLOC) 606 bits |= EXTENT_NORESERVE; 607 608 if (delete) 609 bits |= ~EXTENT_CTLBITS; 610 bits |= EXTENT_FIRST_DELALLOC; 611 612 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 613 clear = 1; 614 again: 615 if (!prealloc && gfpflags_allow_blocking(mask)) { 616 /* 617 * Don't care for allocation failure here because we might end 618 * up not needing the pre-allocated extent state at all, which 619 * is the case if we only have in the tree extent states that 620 * cover our input range and don't cover too any other range. 621 * If we end up needing a new extent state we allocate it later. 
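 * If a split does turn out to be necessary, the code below falls back
 * to alloc_extent_state_atomic(), i.e. a GFP_ATOMIC allocation under
 * the tree lock, and the BUG_ON(!prealloc) sites assume it succeeded.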
622 */ 623 prealloc = alloc_extent_state(mask); 624 } 625 626 spin_lock(&tree->lock); 627 if (cached_state) { 628 cached = *cached_state; 629 630 if (clear) { 631 *cached_state = NULL; 632 cached_state = NULL; 633 } 634 635 if (cached && extent_state_in_tree(cached) && 636 cached->start <= start && cached->end > start) { 637 if (clear) 638 atomic_dec(&cached->refs); 639 state = cached; 640 goto hit_next; 641 } 642 if (clear) 643 free_extent_state(cached); 644 } 645 /* 646 * this search will find the extents that end after 647 * our range starts 648 */ 649 node = tree_search(tree, start); 650 if (!node) 651 goto out; 652 state = rb_entry(node, struct extent_state, rb_node); 653 hit_next: 654 if (state->start > end) 655 goto out; 656 WARN_ON(state->end < start); 657 last_end = state->end; 658 659 /* the state doesn't have the wanted bits, go ahead */ 660 if (!(state->state & bits)) { 661 state = next_state(state); 662 goto next; 663 } 664 665 /* 666 * | ---- desired range ---- | 667 * | state | or 668 * | ------------- state -------------- | 669 * 670 * We need to split the extent we found, and may flip 671 * bits on second half. 672 * 673 * If the extent we found extends past our range, we 674 * just split and search again. It'll get split again 675 * the next time though. 676 * 677 * If the extent we found is inside our range, we clear 678 * the desired bit on it. 679 */ 680 681 if (state->start < start) { 682 prealloc = alloc_extent_state_atomic(prealloc); 683 BUG_ON(!prealloc); 684 err = split_state(tree, state, prealloc, start); 685 if (err) 686 extent_io_tree_panic(tree, err); 687 688 prealloc = NULL; 689 if (err) 690 goto out; 691 if (state->end <= end) { 692 state = clear_state_bit(tree, state, &bits, wake, 693 changeset); 694 goto next; 695 } 696 goto search_again; 697 } 698 /* 699 * | ---- desired range ---- | 700 * | state | 701 * We need to split the extent, and clear the bit 702 * on the first half 703 */ 704 if (state->start <= end && state->end > end) { 705 prealloc = alloc_extent_state_atomic(prealloc); 706 BUG_ON(!prealloc); 707 err = split_state(tree, state, prealloc, end + 1); 708 if (err) 709 extent_io_tree_panic(tree, err); 710 711 if (wake) 712 wake_up(&state->wq); 713 714 clear_state_bit(tree, prealloc, &bits, wake, changeset); 715 716 prealloc = NULL; 717 goto out; 718 } 719 720 state = clear_state_bit(tree, state, &bits, wake, changeset); 721 next: 722 if (last_end == (u64)-1) 723 goto out; 724 start = last_end + 1; 725 if (start <= end && state && !need_resched()) 726 goto hit_next; 727 728 search_again: 729 if (start > end) 730 goto out; 731 spin_unlock(&tree->lock); 732 if (gfpflags_allow_blocking(mask)) 733 cond_resched(); 734 goto again; 735 736 out: 737 spin_unlock(&tree->lock); 738 if (prealloc) 739 free_extent_state(prealloc); 740 741 return 0; 742 743 } 744 745 static void wait_on_state(struct extent_io_tree *tree, 746 struct extent_state *state) 747 __releases(tree->lock) 748 __acquires(tree->lock) 749 { 750 DEFINE_WAIT(wait); 751 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 752 spin_unlock(&tree->lock); 753 schedule(); 754 spin_lock(&tree->lock); 755 finish_wait(&state->wq, &wait); 756 } 757 758 /* 759 * waits for one or more bits to clear on a range in the state tree. 760 * The range [start, end] is inclusive. 
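 * For example, lock_extent_bits() below calls
 * wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED) whenever
 * __set_extent_bit() fails with -EEXIST on an already locked range.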
761 * The tree lock is taken by this function 762 */ 763 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 764 unsigned long bits) 765 { 766 struct extent_state *state; 767 struct rb_node *node; 768 769 btrfs_debug_check_extent_io_range(tree, start, end); 770 771 spin_lock(&tree->lock); 772 again: 773 while (1) { 774 /* 775 * this search will find all the extents that end after 776 * our range starts 777 */ 778 node = tree_search(tree, start); 779 process_node: 780 if (!node) 781 break; 782 783 state = rb_entry(node, struct extent_state, rb_node); 784 785 if (state->start > end) 786 goto out; 787 788 if (state->state & bits) { 789 start = state->start; 790 atomic_inc(&state->refs); 791 wait_on_state(tree, state); 792 free_extent_state(state); 793 goto again; 794 } 795 start = state->end + 1; 796 797 if (start > end) 798 break; 799 800 if (!cond_resched_lock(&tree->lock)) { 801 node = rb_next(node); 802 goto process_node; 803 } 804 } 805 out: 806 spin_unlock(&tree->lock); 807 } 808 809 static void set_state_bits(struct extent_io_tree *tree, 810 struct extent_state *state, 811 unsigned *bits, struct extent_changeset *changeset) 812 { 813 unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; 814 815 set_state_cb(tree, state, bits); 816 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 817 u64 range = state->end - state->start + 1; 818 tree->dirty_bytes += range; 819 } 820 add_extent_changeset(state, bits_to_set, changeset, 1); 821 state->state |= bits_to_set; 822 } 823 824 static void cache_state_if_flags(struct extent_state *state, 825 struct extent_state **cached_ptr, 826 unsigned flags) 827 { 828 if (cached_ptr && !(*cached_ptr)) { 829 if (!flags || (state->state & flags)) { 830 *cached_ptr = state; 831 atomic_inc(&state->refs); 832 } 833 } 834 } 835 836 static void cache_state(struct extent_state *state, 837 struct extent_state **cached_ptr) 838 { 839 return cache_state_if_flags(state, cached_ptr, 840 EXTENT_IOBITS | EXTENT_BOUNDARY); 841 } 842 843 /* 844 * set some bits on a range in the tree. This may require allocations or 845 * sleeping, so the gfp mask is used to indicate what is allowed. 846 * 847 * If any of the exclusive bits are set, this will fail with -EEXIST if some 848 * part of the range already has the desired bits set. The start of the 849 * existing range is returned in failed_start in this case. 850 * 851 * [start, end] is inclusive This takes the tree lock. 852 */ 853 854 static int __must_check 855 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 856 unsigned bits, unsigned exclusive_bits, 857 u64 *failed_start, struct extent_state **cached_state, 858 gfp_t mask, struct extent_changeset *changeset) 859 { 860 struct extent_state *state; 861 struct extent_state *prealloc = NULL; 862 struct rb_node *node; 863 struct rb_node **p; 864 struct rb_node *parent; 865 int err = 0; 866 u64 last_start; 867 u64 last_end; 868 869 btrfs_debug_check_extent_io_range(tree, start, end); 870 871 bits |= EXTENT_FIRST_DELALLOC; 872 again: 873 if (!prealloc && gfpflags_allow_blocking(mask)) { 874 /* 875 * Don't care for allocation failure here because we might end 876 * up not needing the pre-allocated extent state at all, which 877 * is the case if we only have in the tree extent states that 878 * cover our input range and don't cover too any other range. 879 * If we end up needing a new extent state we allocate it later. 
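 * The same best-effort pattern as in __clear_extent_bit() applies: a
 * later split or insert, if needed, falls back to
 * alloc_extent_state_atomic().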
880 */ 881 prealloc = alloc_extent_state(mask); 882 } 883 884 spin_lock(&tree->lock); 885 if (cached_state && *cached_state) { 886 state = *cached_state; 887 if (state->start <= start && state->end > start && 888 extent_state_in_tree(state)) { 889 node = &state->rb_node; 890 goto hit_next; 891 } 892 } 893 /* 894 * this search will find all the extents that end after 895 * our range starts. 896 */ 897 node = tree_search_for_insert(tree, start, &p, &parent); 898 if (!node) { 899 prealloc = alloc_extent_state_atomic(prealloc); 900 BUG_ON(!prealloc); 901 err = insert_state(tree, prealloc, start, end, 902 &p, &parent, &bits, changeset); 903 if (err) 904 extent_io_tree_panic(tree, err); 905 906 cache_state(prealloc, cached_state); 907 prealloc = NULL; 908 goto out; 909 } 910 state = rb_entry(node, struct extent_state, rb_node); 911 hit_next: 912 last_start = state->start; 913 last_end = state->end; 914 915 /* 916 * | ---- desired range ---- | 917 * | state | 918 * 919 * Just lock what we found and keep going 920 */ 921 if (state->start == start && state->end <= end) { 922 if (state->state & exclusive_bits) { 923 *failed_start = state->start; 924 err = -EEXIST; 925 goto out; 926 } 927 928 set_state_bits(tree, state, &bits, changeset); 929 cache_state(state, cached_state); 930 merge_state(tree, state); 931 if (last_end == (u64)-1) 932 goto out; 933 start = last_end + 1; 934 state = next_state(state); 935 if (start < end && state && state->start == start && 936 !need_resched()) 937 goto hit_next; 938 goto search_again; 939 } 940 941 /* 942 * | ---- desired range ---- | 943 * | state | 944 * or 945 * | ------------- state -------------- | 946 * 947 * We need to split the extent we found, and may flip bits on 948 * second half. 949 * 950 * If the extent we found extends past our 951 * range, we just split and search again. It'll get split 952 * again the next time though. 953 * 954 * If the extent we found is inside our range, we set the 955 * desired bit on it. 956 */ 957 if (state->start < start) { 958 if (state->state & exclusive_bits) { 959 *failed_start = start; 960 err = -EEXIST; 961 goto out; 962 } 963 964 prealloc = alloc_extent_state_atomic(prealloc); 965 BUG_ON(!prealloc); 966 err = split_state(tree, state, prealloc, start); 967 if (err) 968 extent_io_tree_panic(tree, err); 969 970 prealloc = NULL; 971 if (err) 972 goto out; 973 if (state->end <= end) { 974 set_state_bits(tree, state, &bits, changeset); 975 cache_state(state, cached_state); 976 merge_state(tree, state); 977 if (last_end == (u64)-1) 978 goto out; 979 start = last_end + 1; 980 state = next_state(state); 981 if (start < end && state && state->start == start && 982 !need_resched()) 983 goto hit_next; 984 } 985 goto search_again; 986 } 987 /* 988 * | ---- desired range ---- | 989 * | state | or | state | 990 * 991 * There's a hole, we need to insert something in it and 992 * ignore the extent we found. 993 */ 994 if (state->start > start) { 995 u64 this_end; 996 if (end < last_start) 997 this_end = end; 998 else 999 this_end = last_start - 1; 1000 1001 prealloc = alloc_extent_state_atomic(prealloc); 1002 BUG_ON(!prealloc); 1003 1004 /* 1005 * Avoid to free 'prealloc' if it can be merged with 1006 * the later extent. 
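 * Concrete example with hypothetical offsets: for a desired range of
 * [0, 8191] and an existing state at [4096, 8191], this inserts
 * [0, 4095] and lets merge_state() fold the two together afterwards if
 * their bits end up identical.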
1007 */ 1008 err = insert_state(tree, prealloc, start, this_end, 1009 NULL, NULL, &bits, changeset); 1010 if (err) 1011 extent_io_tree_panic(tree, err); 1012 1013 cache_state(prealloc, cached_state); 1014 prealloc = NULL; 1015 start = this_end + 1; 1016 goto search_again; 1017 } 1018 /* 1019 * | ---- desired range ---- | 1020 * | state | 1021 * We need to split the extent, and set the bit 1022 * on the first half 1023 */ 1024 if (state->start <= end && state->end > end) { 1025 if (state->state & exclusive_bits) { 1026 *failed_start = start; 1027 err = -EEXIST; 1028 goto out; 1029 } 1030 1031 prealloc = alloc_extent_state_atomic(prealloc); 1032 BUG_ON(!prealloc); 1033 err = split_state(tree, state, prealloc, end + 1); 1034 if (err) 1035 extent_io_tree_panic(tree, err); 1036 1037 set_state_bits(tree, prealloc, &bits, changeset); 1038 cache_state(prealloc, cached_state); 1039 merge_state(tree, prealloc); 1040 prealloc = NULL; 1041 goto out; 1042 } 1043 1044 search_again: 1045 if (start > end) 1046 goto out; 1047 spin_unlock(&tree->lock); 1048 if (gfpflags_allow_blocking(mask)) 1049 cond_resched(); 1050 goto again; 1051 1052 out: 1053 spin_unlock(&tree->lock); 1054 if (prealloc) 1055 free_extent_state(prealloc); 1056 1057 return err; 1058 1059 } 1060 1061 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1062 unsigned bits, u64 * failed_start, 1063 struct extent_state **cached_state, gfp_t mask) 1064 { 1065 return __set_extent_bit(tree, start, end, bits, 0, failed_start, 1066 cached_state, mask, NULL); 1067 } 1068 1069 1070 /** 1071 * convert_extent_bit - convert all bits in a given range from one bit to 1072 * another 1073 * @tree: the io tree to search 1074 * @start: the start offset in bytes 1075 * @end: the end offset in bytes (inclusive) 1076 * @bits: the bits to set in this range 1077 * @clear_bits: the bits to clear in this range 1078 * @cached_state: state that we're going to cache 1079 * 1080 * This will go through and set bits for the given range. If any states exist 1081 * already in this range they are set with the given bit and cleared of the 1082 * clear_bits. This is only meant to be used by things that are mergeable, ie 1083 * converting from say DELALLOC to DIRTY. This is not meant to be used with 1084 * boundary bits like LOCK. 1085 * 1086 * All allocations are done with GFP_NOFS. 1087 */ 1088 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1089 unsigned bits, unsigned clear_bits, 1090 struct extent_state **cached_state) 1091 { 1092 struct extent_state *state; 1093 struct extent_state *prealloc = NULL; 1094 struct rb_node *node; 1095 struct rb_node **p; 1096 struct rb_node *parent; 1097 int err = 0; 1098 u64 last_start; 1099 u64 last_end; 1100 bool first_iteration = true; 1101 1102 btrfs_debug_check_extent_io_range(tree, start, end); 1103 1104 again: 1105 if (!prealloc) { 1106 /* 1107 * Best effort, don't worry if extent state allocation fails 1108 * here for the first iteration. We might have a cached state 1109 * that matches exactly the target range, in which case no 1110 * extent state allocations are needed. We'll only know this 1111 * after locking the tree. 
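 * Unlike __set_extent_bit(), an allocation failure later in this
 * function is reported back as -ENOMEM instead of hitting a BUG_ON().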
1112 */ 1113 prealloc = alloc_extent_state(GFP_NOFS); 1114 if (!prealloc && !first_iteration) 1115 return -ENOMEM; 1116 } 1117 1118 spin_lock(&tree->lock); 1119 if (cached_state && *cached_state) { 1120 state = *cached_state; 1121 if (state->start <= start && state->end > start && 1122 extent_state_in_tree(state)) { 1123 node = &state->rb_node; 1124 goto hit_next; 1125 } 1126 } 1127 1128 /* 1129 * this search will find all the extents that end after 1130 * our range starts. 1131 */ 1132 node = tree_search_for_insert(tree, start, &p, &parent); 1133 if (!node) { 1134 prealloc = alloc_extent_state_atomic(prealloc); 1135 if (!prealloc) { 1136 err = -ENOMEM; 1137 goto out; 1138 } 1139 err = insert_state(tree, prealloc, start, end, 1140 &p, &parent, &bits, NULL); 1141 if (err) 1142 extent_io_tree_panic(tree, err); 1143 cache_state(prealloc, cached_state); 1144 prealloc = NULL; 1145 goto out; 1146 } 1147 state = rb_entry(node, struct extent_state, rb_node); 1148 hit_next: 1149 last_start = state->start; 1150 last_end = state->end; 1151 1152 /* 1153 * | ---- desired range ---- | 1154 * | state | 1155 * 1156 * Just lock what we found and keep going 1157 */ 1158 if (state->start == start && state->end <= end) { 1159 set_state_bits(tree, state, &bits, NULL); 1160 cache_state(state, cached_state); 1161 state = clear_state_bit(tree, state, &clear_bits, 0, NULL); 1162 if (last_end == (u64)-1) 1163 goto out; 1164 start = last_end + 1; 1165 if (start < end && state && state->start == start && 1166 !need_resched()) 1167 goto hit_next; 1168 goto search_again; 1169 } 1170 1171 /* 1172 * | ---- desired range ---- | 1173 * | state | 1174 * or 1175 * | ------------- state -------------- | 1176 * 1177 * We need to split the extent we found, and may flip bits on 1178 * second half. 1179 * 1180 * If the extent we found extends past our 1181 * range, we just split and search again. It'll get split 1182 * again the next time though. 1183 * 1184 * If the extent we found is inside our range, we set the 1185 * desired bit on it. 1186 */ 1187 if (state->start < start) { 1188 prealloc = alloc_extent_state_atomic(prealloc); 1189 if (!prealloc) { 1190 err = -ENOMEM; 1191 goto out; 1192 } 1193 err = split_state(tree, state, prealloc, start); 1194 if (err) 1195 extent_io_tree_panic(tree, err); 1196 prealloc = NULL; 1197 if (err) 1198 goto out; 1199 if (state->end <= end) { 1200 set_state_bits(tree, state, &bits, NULL); 1201 cache_state(state, cached_state); 1202 state = clear_state_bit(tree, state, &clear_bits, 0, 1203 NULL); 1204 if (last_end == (u64)-1) 1205 goto out; 1206 start = last_end + 1; 1207 if (start < end && state && state->start == start && 1208 !need_resched()) 1209 goto hit_next; 1210 } 1211 goto search_again; 1212 } 1213 /* 1214 * | ---- desired range ---- | 1215 * | state | or | state | 1216 * 1217 * There's a hole, we need to insert something in it and 1218 * ignore the extent we found. 1219 */ 1220 if (state->start > start) { 1221 u64 this_end; 1222 if (end < last_start) 1223 this_end = end; 1224 else 1225 this_end = last_start - 1; 1226 1227 prealloc = alloc_extent_state_atomic(prealloc); 1228 if (!prealloc) { 1229 err = -ENOMEM; 1230 goto out; 1231 } 1232 1233 /* 1234 * Avoid to free 'prealloc' if it can be merged with 1235 * the later extent. 
1236 */ 1237 err = insert_state(tree, prealloc, start, this_end, 1238 NULL, NULL, &bits, NULL); 1239 if (err) 1240 extent_io_tree_panic(tree, err); 1241 cache_state(prealloc, cached_state); 1242 prealloc = NULL; 1243 start = this_end + 1; 1244 goto search_again; 1245 } 1246 /* 1247 * | ---- desired range ---- | 1248 * | state | 1249 * We need to split the extent, and set the bit 1250 * on the first half 1251 */ 1252 if (state->start <= end && state->end > end) { 1253 prealloc = alloc_extent_state_atomic(prealloc); 1254 if (!prealloc) { 1255 err = -ENOMEM; 1256 goto out; 1257 } 1258 1259 err = split_state(tree, state, prealloc, end + 1); 1260 if (err) 1261 extent_io_tree_panic(tree, err); 1262 1263 set_state_bits(tree, prealloc, &bits, NULL); 1264 cache_state(prealloc, cached_state); 1265 clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); 1266 prealloc = NULL; 1267 goto out; 1268 } 1269 1270 search_again: 1271 if (start > end) 1272 goto out; 1273 spin_unlock(&tree->lock); 1274 cond_resched(); 1275 first_iteration = false; 1276 goto again; 1277 1278 out: 1279 spin_unlock(&tree->lock); 1280 if (prealloc) 1281 free_extent_state(prealloc); 1282 1283 return err; 1284 } 1285 1286 /* wrappers around set/clear extent bit */ 1287 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1288 unsigned bits, struct extent_changeset *changeset) 1289 { 1290 /* 1291 * We don't support EXTENT_LOCKED yet, as current changeset will 1292 * record any bits changed, so for EXTENT_LOCKED case, it will 1293 * either fail with -EEXIST or changeset will record the whole 1294 * range. 1295 */ 1296 BUG_ON(bits & EXTENT_LOCKED); 1297 1298 return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, 1299 changeset); 1300 } 1301 1302 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1303 unsigned bits, int wake, int delete, 1304 struct extent_state **cached, gfp_t mask) 1305 { 1306 return __clear_extent_bit(tree, start, end, bits, wake, delete, 1307 cached, mask, NULL); 1308 } 1309 1310 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1311 unsigned bits, struct extent_changeset *changeset) 1312 { 1313 /* 1314 * Don't support EXTENT_LOCKED case, same reason as 1315 * set_record_extent_bits(). 1316 */ 1317 BUG_ON(bits & EXTENT_LOCKED); 1318 1319 return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, 1320 changeset); 1321 } 1322 1323 /* 1324 * either insert or lock state struct between start and end use mask to tell 1325 * us if waiting is desired. 
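 *
 * A typical calling pattern (illustrative):
 *
 *   struct extent_state *cached = NULL;
 *
 *   lock_extent_bits(tree, start, end, &cached);
 *   ... do the I/O on [start, end] ...
 *   unlock_extent_cached(tree, start, end, &cached, GFP_NOFS);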
1326 */ 1327 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1328 struct extent_state **cached_state) 1329 { 1330 int err; 1331 u64 failed_start; 1332 1333 while (1) { 1334 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, 1335 EXTENT_LOCKED, &failed_start, 1336 cached_state, GFP_NOFS, NULL); 1337 if (err == -EEXIST) { 1338 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1339 start = failed_start; 1340 } else 1341 break; 1342 WARN_ON(start > end); 1343 } 1344 return err; 1345 } 1346 1347 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1348 { 1349 int err; 1350 u64 failed_start; 1351 1352 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1353 &failed_start, NULL, GFP_NOFS, NULL); 1354 if (err == -EEXIST) { 1355 if (failed_start > start) 1356 clear_extent_bit(tree, start, failed_start - 1, 1357 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 1358 return 0; 1359 } 1360 return 1; 1361 } 1362 1363 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) 1364 { 1365 unsigned long index = start >> PAGE_SHIFT; 1366 unsigned long end_index = end >> PAGE_SHIFT; 1367 struct page *page; 1368 1369 while (index <= end_index) { 1370 page = find_get_page(inode->i_mapping, index); 1371 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1372 clear_page_dirty_for_io(page); 1373 put_page(page); 1374 index++; 1375 } 1376 } 1377 1378 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) 1379 { 1380 unsigned long index = start >> PAGE_SHIFT; 1381 unsigned long end_index = end >> PAGE_SHIFT; 1382 struct page *page; 1383 1384 while (index <= end_index) { 1385 page = find_get_page(inode->i_mapping, index); 1386 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1387 __set_page_dirty_nobuffers(page); 1388 account_page_redirty(page); 1389 put_page(page); 1390 index++; 1391 } 1392 } 1393 1394 /* 1395 * helper function to set both pages and extents in the tree writeback 1396 */ 1397 static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1398 { 1399 unsigned long index = start >> PAGE_SHIFT; 1400 unsigned long end_index = end >> PAGE_SHIFT; 1401 struct page *page; 1402 1403 while (index <= end_index) { 1404 page = find_get_page(tree->mapping, index); 1405 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1406 set_page_writeback(page); 1407 put_page(page); 1408 index++; 1409 } 1410 } 1411 1412 /* find the first state struct with 'bits' set after 'start', and 1413 * return it. tree->lock must be held. NULL will returned if 1414 * nothing was found after 'start' 1415 */ 1416 static struct extent_state * 1417 find_first_extent_bit_state(struct extent_io_tree *tree, 1418 u64 start, unsigned bits) 1419 { 1420 struct rb_node *node; 1421 struct extent_state *state; 1422 1423 /* 1424 * this search will find all the extents that end after 1425 * our range starts. 1426 */ 1427 node = tree_search(tree, start); 1428 if (!node) 1429 goto out; 1430 1431 while (1) { 1432 state = rb_entry(node, struct extent_state, rb_node); 1433 if (state->end >= start && (state->state & bits)) 1434 return state; 1435 1436 node = rb_next(node); 1437 if (!node) 1438 break; 1439 } 1440 out: 1441 return NULL; 1442 } 1443 1444 /* 1445 * find the first offset in the io tree with 'bits' set. zero is 1446 * returned if we find something, and *start_ret and *end_ret are 1447 * set to reflect the state struct that was found. 1448 * 1449 * If nothing was found, 1 is returned. 
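 * Illustrative example:
 *
 *   u64 found_start, found_end;
 *   int ret = find_first_extent_bit(tree, 0, &found_start, &found_end,
 *                                   EXTENT_DIRTY, NULL);
 *
 * ret == 0 means [found_start, found_end] is the first dirty range.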
If found something, return 0. 1450 */ 1451 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1452 u64 *start_ret, u64 *end_ret, unsigned bits, 1453 struct extent_state **cached_state) 1454 { 1455 struct extent_state *state; 1456 struct rb_node *n; 1457 int ret = 1; 1458 1459 spin_lock(&tree->lock); 1460 if (cached_state && *cached_state) { 1461 state = *cached_state; 1462 if (state->end == start - 1 && extent_state_in_tree(state)) { 1463 n = rb_next(&state->rb_node); 1464 while (n) { 1465 state = rb_entry(n, struct extent_state, 1466 rb_node); 1467 if (state->state & bits) 1468 goto got_it; 1469 n = rb_next(n); 1470 } 1471 free_extent_state(*cached_state); 1472 *cached_state = NULL; 1473 goto out; 1474 } 1475 free_extent_state(*cached_state); 1476 *cached_state = NULL; 1477 } 1478 1479 state = find_first_extent_bit_state(tree, start, bits); 1480 got_it: 1481 if (state) { 1482 cache_state_if_flags(state, cached_state, 0); 1483 *start_ret = state->start; 1484 *end_ret = state->end; 1485 ret = 0; 1486 } 1487 out: 1488 spin_unlock(&tree->lock); 1489 return ret; 1490 } 1491 1492 /* 1493 * find a contiguous range of bytes in the file marked as delalloc, not 1494 * more than 'max_bytes'. start and end are used to return the range, 1495 * 1496 * 1 is returned if we find something, 0 if nothing was in the tree 1497 */ 1498 static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1499 u64 *start, u64 *end, u64 max_bytes, 1500 struct extent_state **cached_state) 1501 { 1502 struct rb_node *node; 1503 struct extent_state *state; 1504 u64 cur_start = *start; 1505 u64 found = 0; 1506 u64 total_bytes = 0; 1507 1508 spin_lock(&tree->lock); 1509 1510 /* 1511 * this search will find all the extents that end after 1512 * our range starts. 1513 */ 1514 node = tree_search(tree, cur_start); 1515 if (!node) { 1516 if (!found) 1517 *end = (u64)-1; 1518 goto out; 1519 } 1520 1521 while (1) { 1522 state = rb_entry(node, struct extent_state, rb_node); 1523 if (found && (state->start != cur_start || 1524 (state->state & EXTENT_BOUNDARY))) { 1525 goto out; 1526 } 1527 if (!(state->state & EXTENT_DELALLOC)) { 1528 if (!found) 1529 *end = state->end; 1530 goto out; 1531 } 1532 if (!found) { 1533 *start = state->start; 1534 *cached_state = state; 1535 atomic_inc(&state->refs); 1536 } 1537 found++; 1538 *end = state->end; 1539 cur_start = state->end + 1; 1540 node = rb_next(node); 1541 total_bytes += state->end - state->start + 1; 1542 if (total_bytes >= max_bytes) 1543 break; 1544 if (!node) 1545 break; 1546 } 1547 out: 1548 spin_unlock(&tree->lock); 1549 return found; 1550 } 1551 1552 static noinline void __unlock_for_delalloc(struct inode *inode, 1553 struct page *locked_page, 1554 u64 start, u64 end) 1555 { 1556 int ret; 1557 struct page *pages[16]; 1558 unsigned long index = start >> PAGE_SHIFT; 1559 unsigned long end_index = end >> PAGE_SHIFT; 1560 unsigned long nr_pages = end_index - index + 1; 1561 int i; 1562 1563 if (index == locked_page->index && end_index == index) 1564 return; 1565 1566 while (nr_pages > 0) { 1567 ret = find_get_pages_contig(inode->i_mapping, index, 1568 min_t(unsigned long, nr_pages, 1569 ARRAY_SIZE(pages)), pages); 1570 for (i = 0; i < ret; i++) { 1571 if (pages[i] != locked_page) 1572 unlock_page(pages[i]); 1573 put_page(pages[i]); 1574 } 1575 nr_pages -= ret; 1576 index += ret; 1577 cond_resched(); 1578 } 1579 } 1580 1581 static noinline int lock_delalloc_pages(struct inode *inode, 1582 struct page *locked_page, 1583 u64 delalloc_start, 1584 u64 delalloc_end) 
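/*
 * Lock every page cache page in [delalloc_start, delalloc_end] except
 * locked_page, which the caller already holds.  Returns 0 on success,
 * or -EAGAIN if a page disappeared or was cleaned in the meantime, in
 * which case the pages locked so far are released again via
 * __unlock_for_delalloc().
 */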
1585 { 1586 unsigned long index = delalloc_start >> PAGE_SHIFT; 1587 unsigned long start_index = index; 1588 unsigned long end_index = delalloc_end >> PAGE_SHIFT; 1589 unsigned long pages_locked = 0; 1590 struct page *pages[16]; 1591 unsigned long nrpages; 1592 int ret; 1593 int i; 1594 1595 /* the caller is responsible for locking the start index */ 1596 if (index == locked_page->index && index == end_index) 1597 return 0; 1598 1599 /* skip the page at the start index */ 1600 nrpages = end_index - index + 1; 1601 while (nrpages > 0) { 1602 ret = find_get_pages_contig(inode->i_mapping, index, 1603 min_t(unsigned long, 1604 nrpages, ARRAY_SIZE(pages)), pages); 1605 if (ret == 0) { 1606 ret = -EAGAIN; 1607 goto done; 1608 } 1609 /* now we have an array of pages, lock them all */ 1610 for (i = 0; i < ret; i++) { 1611 /* 1612 * the caller is taking responsibility for 1613 * locked_page 1614 */ 1615 if (pages[i] != locked_page) { 1616 lock_page(pages[i]); 1617 if (!PageDirty(pages[i]) || 1618 pages[i]->mapping != inode->i_mapping) { 1619 ret = -EAGAIN; 1620 unlock_page(pages[i]); 1621 put_page(pages[i]); 1622 goto done; 1623 } 1624 } 1625 put_page(pages[i]); 1626 pages_locked++; 1627 } 1628 nrpages -= ret; 1629 index += ret; 1630 cond_resched(); 1631 } 1632 ret = 0; 1633 done: 1634 if (ret && pages_locked) { 1635 __unlock_for_delalloc(inode, locked_page, 1636 delalloc_start, 1637 ((u64)(start_index + pages_locked - 1)) << 1638 PAGE_SHIFT); 1639 } 1640 return ret; 1641 } 1642 1643 /* 1644 * find a contiguous range of bytes in the file marked as delalloc, not 1645 * more than 'max_bytes'. start and end are used to return the range, 1646 * 1647 * 1 is returned if we find something, 0 if nothing was in the tree 1648 */ 1649 STATIC u64 find_lock_delalloc_range(struct inode *inode, 1650 struct extent_io_tree *tree, 1651 struct page *locked_page, u64 *start, 1652 u64 *end, u64 max_bytes) 1653 { 1654 u64 delalloc_start; 1655 u64 delalloc_end; 1656 u64 found; 1657 struct extent_state *cached_state = NULL; 1658 int ret; 1659 int loops = 0; 1660 1661 again: 1662 /* step one, find a bunch of delalloc bytes starting at start */ 1663 delalloc_start = *start; 1664 delalloc_end = 0; 1665 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1666 max_bytes, &cached_state); 1667 if (!found || delalloc_end <= *start) { 1668 *start = delalloc_start; 1669 *end = delalloc_end; 1670 free_extent_state(cached_state); 1671 return 0; 1672 } 1673 1674 /* 1675 * start comes from the offset of locked_page. 
We have to lock 1676 * pages in order, so we can't process delalloc bytes before 1677 * locked_page 1678 */ 1679 if (delalloc_start < *start) 1680 delalloc_start = *start; 1681 1682 /* 1683 * make sure to limit the number of pages we try to lock down 1684 */ 1685 if (delalloc_end + 1 - delalloc_start > max_bytes) 1686 delalloc_end = delalloc_start + max_bytes - 1; 1687 1688 /* step two, lock all the pages after the page that has start */ 1689 ret = lock_delalloc_pages(inode, locked_page, 1690 delalloc_start, delalloc_end); 1691 if (ret == -EAGAIN) { 1692 /* some of the pages are gone, lets avoid looping by 1693 * shortening the size of the delalloc range we're searching 1694 */ 1695 free_extent_state(cached_state); 1696 cached_state = NULL; 1697 if (!loops) { 1698 max_bytes = PAGE_SIZE; 1699 loops = 1; 1700 goto again; 1701 } else { 1702 found = 0; 1703 goto out_failed; 1704 } 1705 } 1706 BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ 1707 1708 /* step three, lock the state bits for the whole range */ 1709 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); 1710 1711 /* then test to make sure it is all still delalloc */ 1712 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1713 EXTENT_DELALLOC, 1, cached_state); 1714 if (!ret) { 1715 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1716 &cached_state, GFP_NOFS); 1717 __unlock_for_delalloc(inode, locked_page, 1718 delalloc_start, delalloc_end); 1719 cond_resched(); 1720 goto again; 1721 } 1722 free_extent_state(cached_state); 1723 *start = delalloc_start; 1724 *end = delalloc_end; 1725 out_failed: 1726 return found; 1727 } 1728 1729 void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, 1730 u64 delalloc_end, struct page *locked_page, 1731 unsigned clear_bits, 1732 unsigned long page_ops) 1733 { 1734 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 1735 int ret; 1736 struct page *pages[16]; 1737 unsigned long index = start >> PAGE_SHIFT; 1738 unsigned long end_index = end >> PAGE_SHIFT; 1739 unsigned long nr_pages = end_index - index + 1; 1740 int i; 1741 1742 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1743 if (page_ops == 0) 1744 return; 1745 1746 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) 1747 mapping_set_error(inode->i_mapping, -EIO); 1748 1749 while (nr_pages > 0) { 1750 ret = find_get_pages_contig(inode->i_mapping, index, 1751 min_t(unsigned long, 1752 nr_pages, ARRAY_SIZE(pages)), pages); 1753 for (i = 0; i < ret; i++) { 1754 1755 if (page_ops & PAGE_SET_PRIVATE2) 1756 SetPagePrivate2(pages[i]); 1757 1758 if (pages[i] == locked_page) { 1759 put_page(pages[i]); 1760 continue; 1761 } 1762 if (page_ops & PAGE_CLEAR_DIRTY) 1763 clear_page_dirty_for_io(pages[i]); 1764 if (page_ops & PAGE_SET_WRITEBACK) 1765 set_page_writeback(pages[i]); 1766 if (page_ops & PAGE_SET_ERROR) 1767 SetPageError(pages[i]); 1768 if (page_ops & PAGE_END_WRITEBACK) 1769 end_page_writeback(pages[i]); 1770 if (page_ops & PAGE_UNLOCK) 1771 unlock_page(pages[i]); 1772 put_page(pages[i]); 1773 } 1774 nr_pages -= ret; 1775 index += ret; 1776 cond_resched(); 1777 } 1778 } 1779 1780 /* 1781 * count the number of bytes in the tree that have a given bit(s) 1782 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1783 * cached. The total number found is returned. 
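 * Illustrative example:
 *
 *   u64 off = 0;
 *   u64 bytes = count_range_bits(tree, &off, (u64)-1, (u64)-1,
 *                                EXTENT_DELALLOC, 0);
 *
 * counts the delalloc bytes in the whole tree and moves 'off' to the
 * start of the first delalloc range it finds.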
1784 */ 1785 u64 count_range_bits(struct extent_io_tree *tree, 1786 u64 *start, u64 search_end, u64 max_bytes, 1787 unsigned bits, int contig) 1788 { 1789 struct rb_node *node; 1790 struct extent_state *state; 1791 u64 cur_start = *start; 1792 u64 total_bytes = 0; 1793 u64 last = 0; 1794 int found = 0; 1795 1796 if (WARN_ON(search_end <= cur_start)) 1797 return 0; 1798 1799 spin_lock(&tree->lock); 1800 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1801 total_bytes = tree->dirty_bytes; 1802 goto out; 1803 } 1804 /* 1805 * this search will find all the extents that end after 1806 * our range starts. 1807 */ 1808 node = tree_search(tree, cur_start); 1809 if (!node) 1810 goto out; 1811 1812 while (1) { 1813 state = rb_entry(node, struct extent_state, rb_node); 1814 if (state->start > search_end) 1815 break; 1816 if (contig && found && state->start > last + 1) 1817 break; 1818 if (state->end >= cur_start && (state->state & bits) == bits) { 1819 total_bytes += min(search_end, state->end) + 1 - 1820 max(cur_start, state->start); 1821 if (total_bytes >= max_bytes) 1822 break; 1823 if (!found) { 1824 *start = max(cur_start, state->start); 1825 found = 1; 1826 } 1827 last = state->end; 1828 } else if (contig && found) { 1829 break; 1830 } 1831 node = rb_next(node); 1832 if (!node) 1833 break; 1834 } 1835 out: 1836 spin_unlock(&tree->lock); 1837 return total_bytes; 1838 } 1839 1840 /* 1841 * set the private field for a given byte offset in the tree. If there isn't 1842 * an extent_state there already, this does nothing. 1843 */ 1844 static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start, 1845 struct io_failure_record *failrec) 1846 { 1847 struct rb_node *node; 1848 struct extent_state *state; 1849 int ret = 0; 1850 1851 spin_lock(&tree->lock); 1852 /* 1853 * this search will find all the extents that end after 1854 * our range starts. 1855 */ 1856 node = tree_search(tree, start); 1857 if (!node) { 1858 ret = -ENOENT; 1859 goto out; 1860 } 1861 state = rb_entry(node, struct extent_state, rb_node); 1862 if (state->start != start) { 1863 ret = -ENOENT; 1864 goto out; 1865 } 1866 state->failrec = failrec; 1867 out: 1868 spin_unlock(&tree->lock); 1869 return ret; 1870 } 1871 1872 static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start, 1873 struct io_failure_record **failrec) 1874 { 1875 struct rb_node *node; 1876 struct extent_state *state; 1877 int ret = 0; 1878 1879 spin_lock(&tree->lock); 1880 /* 1881 * this search will find all the extents that end after 1882 * our range starts. 1883 */ 1884 node = tree_search(tree, start); 1885 if (!node) { 1886 ret = -ENOENT; 1887 goto out; 1888 } 1889 state = rb_entry(node, struct extent_state, rb_node); 1890 if (state->start != start) { 1891 ret = -ENOENT; 1892 goto out; 1893 } 1894 *failrec = state->failrec; 1895 out: 1896 spin_unlock(&tree->lock); 1897 return ret; 1898 } 1899 1900 /* 1901 * searches a range in the state tree for a given mask. 1902 * If 'filled' == 1, this returns 1 only if every extent in the tree 1903 * has the bits set. Otherwise, 1 is returned if any bit in the 1904 * range is found set. 
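 * For example, check_page_uptodate() below asks whether a page is
 * entirely covered by uptodate extents:
 *
 *   test_range_bit(tree, start, start + PAGE_SIZE - 1,
 *                  EXTENT_UPTODATE, 1, NULL);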
1905 */ 1906 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1907 unsigned bits, int filled, struct extent_state *cached) 1908 { 1909 struct extent_state *state = NULL; 1910 struct rb_node *node; 1911 int bitset = 0; 1912 1913 spin_lock(&tree->lock); 1914 if (cached && extent_state_in_tree(cached) && cached->start <= start && 1915 cached->end > start) 1916 node = &cached->rb_node; 1917 else 1918 node = tree_search(tree, start); 1919 while (node && start <= end) { 1920 state = rb_entry(node, struct extent_state, rb_node); 1921 1922 if (filled && state->start > start) { 1923 bitset = 0; 1924 break; 1925 } 1926 1927 if (state->start > end) 1928 break; 1929 1930 if (state->state & bits) { 1931 bitset = 1; 1932 if (!filled) 1933 break; 1934 } else if (filled) { 1935 bitset = 0; 1936 break; 1937 } 1938 1939 if (state->end == (u64)-1) 1940 break; 1941 1942 start = state->end + 1; 1943 if (start > end) 1944 break; 1945 node = rb_next(node); 1946 if (!node) { 1947 if (filled) 1948 bitset = 0; 1949 break; 1950 } 1951 } 1952 spin_unlock(&tree->lock); 1953 return bitset; 1954 } 1955 1956 /* 1957 * helper function to set a given page up to date if all the 1958 * extents in the tree for that page are up to date 1959 */ 1960 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 1961 { 1962 u64 start = page_offset(page); 1963 u64 end = start + PAGE_SIZE - 1; 1964 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1965 SetPageUptodate(page); 1966 } 1967 1968 int free_io_failure(struct inode *inode, struct io_failure_record *rec) 1969 { 1970 int ret; 1971 int err = 0; 1972 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 1973 1974 set_state_failrec(failure_tree, rec->start, NULL); 1975 ret = clear_extent_bits(failure_tree, rec->start, 1976 rec->start + rec->len - 1, 1977 EXTENT_LOCKED | EXTENT_DIRTY); 1978 if (ret) 1979 err = ret; 1980 1981 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1982 rec->start + rec->len - 1, 1983 EXTENT_DAMAGED); 1984 if (ret && !err) 1985 err = ret; 1986 1987 kfree(rec); 1988 return err; 1989 } 1990 1991 /* 1992 * this bypasses the standard btrfs submit functions deliberately, as 1993 * the standard behavior is to write all copies in a raid setup. here we only 1994 * want to write the one bad copy. so we do the mapping for ourselves and issue 1995 * submit_bio directly. 1996 * to avoid any synchronization issues, wait for the data after writing, which 1997 * actually prevents the read that triggered the error from finishing. 1998 * currently, there can be no more than two copies of every data bit. thus, 1999 * exactly one rewrite is required. 
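 * Note that raid56 is excluded below via btrfs_is_parity_mirror(),
 * since repairing a parity-protected stripe is not just a matter of
 * rewriting one copy.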
2000 */ 2001 int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, 2002 struct page *page, unsigned int pg_offset, int mirror_num) 2003 { 2004 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2005 struct bio *bio; 2006 struct btrfs_device *dev; 2007 u64 map_length = 0; 2008 u64 sector; 2009 struct btrfs_bio *bbio = NULL; 2010 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 2011 int ret; 2012 2013 ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); 2014 BUG_ON(!mirror_num); 2015 2016 /* we can't repair anything in raid56 yet */ 2017 if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) 2018 return 0; 2019 2020 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2021 if (!bio) 2022 return -EIO; 2023 bio->bi_iter.bi_size = 0; 2024 map_length = length; 2025 2026 /* 2027 * Avoid races with device replace and make sure our bbio has devices 2028 * associated to its stripes that don't go away while we are doing the 2029 * read repair operation. 2030 */ 2031 btrfs_bio_counter_inc_blocked(fs_info); 2032 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, 2033 &map_length, &bbio, mirror_num); 2034 if (ret) { 2035 btrfs_bio_counter_dec(fs_info); 2036 bio_put(bio); 2037 return -EIO; 2038 } 2039 BUG_ON(mirror_num != bbio->mirror_num); 2040 sector = bbio->stripes[mirror_num-1].physical >> 9; 2041 bio->bi_iter.bi_sector = sector; 2042 dev = bbio->stripes[mirror_num-1].dev; 2043 btrfs_put_bbio(bbio); 2044 if (!dev || !dev->bdev || !dev->writeable) { 2045 btrfs_bio_counter_dec(fs_info); 2046 bio_put(bio); 2047 return -EIO; 2048 } 2049 bio->bi_bdev = dev->bdev; 2050 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; 2051 bio_add_page(bio, page, length, pg_offset); 2052 2053 if (btrfsic_submit_bio_wait(bio)) { 2054 /* try to remap that extent elsewhere? 
*/ 2055 btrfs_bio_counter_dec(fs_info); 2056 bio_put(bio); 2057 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2058 return -EIO; 2059 } 2060 2061 btrfs_info_rl_in_rcu(fs_info, 2062 "read error corrected: ino %llu off %llu (dev %s sector %llu)", 2063 btrfs_ino(inode), start, 2064 rcu_str_deref(dev->name), sector); 2065 btrfs_bio_counter_dec(fs_info); 2066 bio_put(bio); 2067 return 0; 2068 } 2069 2070 int repair_eb_io_failure(struct btrfs_fs_info *fs_info, 2071 struct extent_buffer *eb, int mirror_num) 2072 { 2073 u64 start = eb->start; 2074 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 2075 int ret = 0; 2076 2077 if (fs_info->sb->s_flags & MS_RDONLY) 2078 return -EROFS; 2079 2080 for (i = 0; i < num_pages; i++) { 2081 struct page *p = eb->pages[i]; 2082 2083 ret = repair_io_failure(fs_info->btree_inode, start, 2084 PAGE_SIZE, start, p, 2085 start - page_offset(p), mirror_num); 2086 if (ret) 2087 break; 2088 start += PAGE_SIZE; 2089 } 2090 2091 return ret; 2092 } 2093 2094 /* 2095 * each time an IO finishes, we do a fast check in the IO failure tree 2096 * to see if we need to process or clean up an io_failure_record 2097 */ 2098 int clean_io_failure(struct inode *inode, u64 start, struct page *page, 2099 unsigned int pg_offset) 2100 { 2101 u64 private; 2102 struct io_failure_record *failrec; 2103 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2104 struct extent_state *state; 2105 int num_copies; 2106 int ret; 2107 2108 private = 0; 2109 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 2110 (u64)-1, 1, EXTENT_DIRTY, 0); 2111 if (!ret) 2112 return 0; 2113 2114 ret = get_state_failrec(&BTRFS_I(inode)->io_failure_tree, start, 2115 &failrec); 2116 if (ret) 2117 return 0; 2118 2119 BUG_ON(!failrec->this_mirror); 2120 2121 if (failrec->in_validation) { 2122 /* there was no real error, just free the record */ 2123 btrfs_debug(fs_info, 2124 "clean_io_failure: freeing dummy error at %llu", 2125 failrec->start); 2126 goto out; 2127 } 2128 if (fs_info->sb->s_flags & MS_RDONLY) 2129 goto out; 2130 2131 spin_lock(&BTRFS_I(inode)->io_tree.lock); 2132 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, 2133 failrec->start, 2134 EXTENT_LOCKED); 2135 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2136 2137 if (state && state->start <= failrec->start && 2138 state->end >= failrec->start + failrec->len - 1) { 2139 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2140 failrec->len); 2141 if (num_copies > 1) { 2142 repair_io_failure(inode, start, failrec->len, 2143 failrec->logical, page, 2144 pg_offset, failrec->failed_mirror); 2145 } 2146 } 2147 2148 out: 2149 free_io_failure(inode, failrec); 2150 2151 return 0; 2152 } 2153 2154 /* 2155 * Can be called when 2156 * - hold extent lock 2157 * - under ordered extent 2158 * - the inode is freeing 2159 */ 2160 void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end) 2161 { 2162 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2163 struct io_failure_record *failrec; 2164 struct extent_state *state, *next; 2165 2166 if (RB_EMPTY_ROOT(&failure_tree->state)) 2167 return; 2168 2169 spin_lock(&failure_tree->lock); 2170 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); 2171 while (state) { 2172 if (state->start > end) 2173 break; 2174 2175 ASSERT(state->end <= end); 2176 2177 next = next_state(state); 2178 2179 failrec = state->failrec; 2180 free_extent_state(state); 2181 kfree(failrec); 2182 2183 state = next; 2184 } 2185 
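/* every failure record overlapping [start, end] has been dropped */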
spin_unlock(&failure_tree->lock); 2186 } 2187 2188 int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, 2189 struct io_failure_record **failrec_ret) 2190 { 2191 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2192 struct io_failure_record *failrec; 2193 struct extent_map *em; 2194 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2195 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2196 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2197 int ret; 2198 u64 logical; 2199 2200 ret = get_state_failrec(failure_tree, start, &failrec); 2201 if (ret) { 2202 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2203 if (!failrec) 2204 return -ENOMEM; 2205 2206 failrec->start = start; 2207 failrec->len = end - start + 1; 2208 failrec->this_mirror = 0; 2209 failrec->bio_flags = 0; 2210 failrec->in_validation = 0; 2211 2212 read_lock(&em_tree->lock); 2213 em = lookup_extent_mapping(em_tree, start, failrec->len); 2214 if (!em) { 2215 read_unlock(&em_tree->lock); 2216 kfree(failrec); 2217 return -EIO; 2218 } 2219 2220 if (em->start > start || em->start + em->len <= start) { 2221 free_extent_map(em); 2222 em = NULL; 2223 } 2224 read_unlock(&em_tree->lock); 2225 if (!em) { 2226 kfree(failrec); 2227 return -EIO; 2228 } 2229 2230 logical = start - em->start; 2231 logical = em->block_start + logical; 2232 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2233 logical = em->block_start; 2234 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2235 extent_set_compress_type(&failrec->bio_flags, 2236 em->compress_type); 2237 } 2238 2239 btrfs_debug(fs_info, 2240 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", 2241 logical, start, failrec->len); 2242 2243 failrec->logical = logical; 2244 free_extent_map(em); 2245 2246 /* set the bits in the private failure tree */ 2247 ret = set_extent_bits(failure_tree, start, end, 2248 EXTENT_LOCKED | EXTENT_DIRTY); 2249 if (ret >= 0) 2250 ret = set_state_failrec(failure_tree, start, failrec); 2251 /* set the bits in the inode's tree */ 2252 if (ret >= 0) 2253 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); 2254 if (ret < 0) { 2255 kfree(failrec); 2256 return ret; 2257 } 2258 } else { 2259 btrfs_debug(fs_info, 2260 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", 2261 failrec->logical, failrec->start, failrec->len, 2262 failrec->in_validation); 2263 /* 2264 * when data can be on disk more than twice, add to failrec here 2265 * (e.g. with a list for failed_mirror) to make 2266 * clean_io_failure() clean all those errors at once. 2267 */ 2268 } 2269 2270 *failrec_ret = failrec; 2271 2272 return 0; 2273 } 2274 2275 int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, 2276 struct io_failure_record *failrec, int failed_mirror) 2277 { 2278 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2279 int num_copies; 2280 2281 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); 2282 if (num_copies == 1) { 2283 /* 2284 * we only have a single copy of the data, so don't bother with 2285 * all the retry and error correction code that follows. no 2286 * matter what the error is, it is very likely to persist. 
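* (num_copies == 1 is the case, for example, for data stored with the
* single or raid0 profiles)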
2287 */ 2288 btrfs_debug(fs_info, 2289 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", 2290 num_copies, failrec->this_mirror, failed_mirror); 2291 return 0; 2292 } 2293 2294 /* 2295 * there are two premises: 2296 * a) deliver good data to the caller 2297 * b) correct the bad sectors on disk 2298 */ 2299 if (failed_bio->bi_vcnt > 1) { 2300 /* 2301 * to fulfill b), we need to know the exact failing sectors, as 2302 * we don't want to rewrite any more than the failed ones. thus, 2303 * we need separate read requests for the failed bio 2304 * 2305 * if the following BUG_ON triggers, our validation request got 2306 * merged. we need separate requests for our algorithm to work. 2307 */ 2308 BUG_ON(failrec->in_validation); 2309 failrec->in_validation = 1; 2310 failrec->this_mirror = failed_mirror; 2311 } else { 2312 /* 2313 * we're ready to fulfill a) and b) alongside. get a good copy 2314 * of the failed sector and if we succeed, we have setup 2315 * everything for repair_io_failure to do the rest for us. 2316 */ 2317 if (failrec->in_validation) { 2318 BUG_ON(failrec->this_mirror != failed_mirror); 2319 failrec->in_validation = 0; 2320 failrec->this_mirror = 0; 2321 } 2322 failrec->failed_mirror = failed_mirror; 2323 failrec->this_mirror++; 2324 if (failrec->this_mirror == failed_mirror) 2325 failrec->this_mirror++; 2326 } 2327 2328 if (failrec->this_mirror > num_copies) { 2329 btrfs_debug(fs_info, 2330 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", 2331 num_copies, failrec->this_mirror, failed_mirror); 2332 return 0; 2333 } 2334 2335 return 1; 2336 } 2337 2338 2339 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, 2340 struct io_failure_record *failrec, 2341 struct page *page, int pg_offset, int icsum, 2342 bio_end_io_t *endio_func, void *data) 2343 { 2344 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2345 struct bio *bio; 2346 struct btrfs_io_bio *btrfs_failed_bio; 2347 struct btrfs_io_bio *btrfs_bio; 2348 2349 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2350 if (!bio) 2351 return NULL; 2352 2353 bio->bi_end_io = endio_func; 2354 bio->bi_iter.bi_sector = failrec->logical >> 9; 2355 bio->bi_bdev = fs_info->fs_devices->latest_bdev; 2356 bio->bi_iter.bi_size = 0; 2357 bio->bi_private = data; 2358 2359 btrfs_failed_bio = btrfs_io_bio(failed_bio); 2360 if (btrfs_failed_bio->csum) { 2361 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 2362 2363 btrfs_bio = btrfs_io_bio(bio); 2364 btrfs_bio->csum = btrfs_bio->csum_inline; 2365 icsum *= csum_size; 2366 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum, 2367 csum_size); 2368 } 2369 2370 bio_add_page(bio, page, failrec->len, pg_offset); 2371 2372 return bio; 2373 } 2374 2375 /* 2376 * this is a generic handler for readpage errors (default 2377 * readpage_io_failed_hook). if other copies exist, read those and write back 2378 * good data to the failed position. 
does not investigate in remapping the 2379 * failed extent elsewhere, hoping the device will be smart enough to do this as 2380 * needed 2381 */ 2382 2383 static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, 2384 struct page *page, u64 start, u64 end, 2385 int failed_mirror) 2386 { 2387 struct io_failure_record *failrec; 2388 struct inode *inode = page->mapping->host; 2389 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2390 struct bio *bio; 2391 int read_mode = 0; 2392 int ret; 2393 2394 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2395 2396 ret = btrfs_get_io_failure_record(inode, start, end, &failrec); 2397 if (ret) 2398 return ret; 2399 2400 ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror); 2401 if (!ret) { 2402 free_io_failure(inode, failrec); 2403 return -EIO; 2404 } 2405 2406 if (failed_bio->bi_vcnt > 1) 2407 read_mode |= REQ_FAILFAST_DEV; 2408 2409 phy_offset >>= inode->i_sb->s_blocksize_bits; 2410 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 2411 start - page_offset(page), 2412 (int)phy_offset, failed_bio->bi_end_io, 2413 NULL); 2414 if (!bio) { 2415 free_io_failure(inode, failrec); 2416 return -EIO; 2417 } 2418 bio_set_op_attrs(bio, REQ_OP_READ, read_mode); 2419 2420 btrfs_debug(btrfs_sb(inode->i_sb), 2421 "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", 2422 read_mode, failrec->this_mirror, failrec->in_validation); 2423 2424 ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, 2425 failrec->bio_flags, 0); 2426 if (ret) { 2427 free_io_failure(inode, failrec); 2428 bio_put(bio); 2429 } 2430 2431 return ret; 2432 } 2433 2434 /* lots and lots of room for performance fixes in the end_bio funcs */ 2435 2436 void end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2437 { 2438 int uptodate = (err == 0); 2439 struct extent_io_tree *tree; 2440 int ret = 0; 2441 2442 tree = &BTRFS_I(page->mapping->host)->io_tree; 2443 2444 if (tree->ops && tree->ops->writepage_end_io_hook) { 2445 ret = tree->ops->writepage_end_io_hook(page, start, 2446 end, NULL, uptodate); 2447 if (ret) 2448 uptodate = 0; 2449 } 2450 2451 if (!uptodate) { 2452 ClearPageUptodate(page); 2453 SetPageError(page); 2454 ret = ret < 0 ? ret : -EIO; 2455 mapping_set_error(page->mapping, ret); 2456 } 2457 } 2458 2459 /* 2460 * after a writepage IO is done, we need to: 2461 * clear the uptodate bits on error 2462 * clear the writeback bits in the extent tree for this IO 2463 * end_page_writeback if the page has no more pending IO 2464 * 2465 * Scheduling is not allowed, so the extent state tree is expected 2466 * to have one and only one object corresponding to this IO. 2467 */ 2468 static void end_bio_extent_writepage(struct bio *bio) 2469 { 2470 struct bio_vec *bvec; 2471 u64 start; 2472 u64 end; 2473 int i; 2474 2475 bio_for_each_segment_all(bvec, bio, i) { 2476 struct page *page = bvec->bv_page; 2477 struct inode *inode = page->mapping->host; 2478 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2479 2480 /* We always issue full-page reads, but if some block 2481 * in a page fails to read, blk_update_request() will 2482 * advance bv_offset and adjust bv_len to compensate. 2483 * Print a warning for nonzero offsets, and an error 2484 * if they don't add up to a full page. 
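*
* For example, assuming 4K pages: if only the last 1K block of a page
* did not complete, the bvec arrives here with bv_offset == 3072 and
* bv_len == 1024; the two still add up to PAGE_SIZE, so only the
* informational message below is printed.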
*/ 2485 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2486 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2487 btrfs_err(fs_info, 2488 "partial page write in btrfs with offset %u and length %u", 2489 bvec->bv_offset, bvec->bv_len); 2490 else 2491 btrfs_info(fs_info, 2492 "incomplete page write in btrfs with offset %u and length %u", 2493 bvec->bv_offset, bvec->bv_len); 2494 } 2495 2496 start = page_offset(page); 2497 end = start + bvec->bv_offset + bvec->bv_len - 1; 2498 2499 end_extent_writepage(page, bio->bi_error, start, end); 2500 end_page_writeback(page); 2501 } 2502 2503 bio_put(bio); 2504 } 2505 2506 static void 2507 endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, 2508 int uptodate) 2509 { 2510 struct extent_state *cached = NULL; 2511 u64 end = start + len - 1; 2512 2513 if (uptodate && tree->track_uptodate) 2514 set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); 2515 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2516 } 2517 2518 /* 2519 * after a readpage IO is done, we need to: 2520 * clear the uptodate bits on error 2521 * set the uptodate bits if things worked 2522 * set the page up to date if all extents in the tree are uptodate 2523 * clear the lock bit in the extent tree 2524 * unlock the page if there are no other extents locked for it 2525 * 2526 * Scheduling is not allowed, so the extent state tree is expected 2527 * to have one and only one object corresponding to this IO. 2528 */ 2529 static void end_bio_extent_readpage(struct bio *bio) 2530 { 2531 struct bio_vec *bvec; 2532 int uptodate = !bio->bi_error; 2533 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2534 struct extent_io_tree *tree; 2535 u64 offset = 0; 2536 u64 start; 2537 u64 end; 2538 u64 len; 2539 u64 extent_start = 0; 2540 u64 extent_len = 0; 2541 int mirror; 2542 int ret; 2543 int i; 2544 2545 bio_for_each_segment_all(bvec, bio, i) { 2546 struct page *page = bvec->bv_page; 2547 struct inode *inode = page->mapping->host; 2548 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2549 2550 btrfs_debug(fs_info, 2551 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", 2552 (u64)bio->bi_iter.bi_sector, bio->bi_error, 2553 io_bio->mirror_num); 2554 tree = &BTRFS_I(inode)->io_tree; 2555 2556 /* We always issue full-page reads, but if some block 2557 * in a page fails to read, blk_update_request() will 2558 * advance bv_offset and adjust bv_len to compensate. 2559 * Print a warning for nonzero offsets, and an error 2560 * if they don't add up to a full page. 
*/ 2561 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2562 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2563 btrfs_err(fs_info, 2564 "partial page read in btrfs with offset %u and length %u", 2565 bvec->bv_offset, bvec->bv_len); 2566 else 2567 btrfs_info(fs_info, 2568 "incomplete page read in btrfs with offset %u and length %u", 2569 bvec->bv_offset, bvec->bv_len); 2570 } 2571 2572 start = page_offset(page); 2573 end = start + bvec->bv_offset + bvec->bv_len - 1; 2574 len = bvec->bv_len; 2575 2576 mirror = io_bio->mirror_num; 2577 if (likely(uptodate && tree->ops && 2578 tree->ops->readpage_end_io_hook)) { 2579 ret = tree->ops->readpage_end_io_hook(io_bio, offset, 2580 page, start, end, 2581 mirror); 2582 if (ret) 2583 uptodate = 0; 2584 else 2585 clean_io_failure(inode, start, page, 0); 2586 } 2587 2588 if (likely(uptodate)) 2589 goto readpage_ok; 2590 2591 if (tree->ops && tree->ops->readpage_io_failed_hook) { 2592 ret = tree->ops->readpage_io_failed_hook(page, mirror); 2593 if (!ret && !bio->bi_error) 2594 uptodate = 1; 2595 } else { 2596 /* 2597 * The generic bio_readpage_error handles errors the 2598 * following way: If possible, new read requests are 2599 * created and submitted and will end up in 2600 * end_bio_extent_readpage as well (if we're lucky, not 2601 * in the !uptodate case). In that case it returns 0 and 2602 * we just go on with the next page in our bio. If it 2603 * can't handle the error it will return -EIO and we 2604 * remain responsible for that page. 2605 */ 2606 ret = bio_readpage_error(bio, offset, page, start, end, 2607 mirror); 2608 if (ret == 0) { 2609 uptodate = !bio->bi_error; 2610 offset += len; 2611 continue; 2612 } 2613 } 2614 readpage_ok: 2615 if (likely(uptodate)) { 2616 loff_t i_size = i_size_read(inode); 2617 pgoff_t end_index = i_size >> PAGE_SHIFT; 2618 unsigned off; 2619 2620 /* Zero out the end if this page straddles i_size */ 2621 off = i_size & (PAGE_SIZE-1); 2622 if (page->index == end_index && off) 2623 zero_user_segment(page, off, PAGE_SIZE); 2624 SetPageUptodate(page); 2625 } else { 2626 ClearPageUptodate(page); 2627 SetPageError(page); 2628 } 2629 unlock_page(page); 2630 offset += len; 2631 2632 if (unlikely(!uptodate)) { 2633 if (extent_len) { 2634 endio_readpage_release_extent(tree, 2635 extent_start, 2636 extent_len, 1); 2637 extent_start = 0; 2638 extent_len = 0; 2639 } 2640 endio_readpage_release_extent(tree, start, 2641 end - start + 1, 0); 2642 } else if (!extent_len) { 2643 extent_start = start; 2644 extent_len = end + 1 - start; 2645 } else if (extent_start + extent_len == start) { 2646 extent_len += end + 1 - start; 2647 } else { 2648 endio_readpage_release_extent(tree, extent_start, 2649 extent_len, uptodate); 2650 extent_start = start; 2651 extent_len = end + 1 - start; 2652 } 2653 } 2654 2655 if (extent_len) 2656 endio_readpage_release_extent(tree, extent_start, extent_len, 2657 uptodate); 2658 if (io_bio->end_io) 2659 io_bio->end_io(io_bio, bio->bi_error); 2660 bio_put(bio); 2661 } 2662 2663 /* 2664 * this allocates from the btrfs_bioset. 
We're returning a bio right now 2665 * but you can call btrfs_io_bio for the appropriate container_of magic 2666 */ 2667 struct bio * 2668 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 2669 gfp_t gfp_flags) 2670 { 2671 struct btrfs_io_bio *btrfs_bio; 2672 struct bio *bio; 2673 2674 bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); 2675 2676 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 2677 while (!bio && (nr_vecs /= 2)) { 2678 bio = bio_alloc_bioset(gfp_flags, 2679 nr_vecs, btrfs_bioset); 2680 } 2681 } 2682 2683 if (bio) { 2684 bio->bi_bdev = bdev; 2685 bio->bi_iter.bi_sector = first_sector; 2686 btrfs_bio = btrfs_io_bio(bio); 2687 btrfs_bio->csum = NULL; 2688 btrfs_bio->csum_allocated = NULL; 2689 btrfs_bio->end_io = NULL; 2690 } 2691 return bio; 2692 } 2693 2694 struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) 2695 { 2696 struct btrfs_io_bio *btrfs_bio; 2697 struct bio *new; 2698 2699 new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); 2700 if (new) { 2701 btrfs_bio = btrfs_io_bio(new); 2702 btrfs_bio->csum = NULL; 2703 btrfs_bio->csum_allocated = NULL; 2704 btrfs_bio->end_io = NULL; 2705 } 2706 return new; 2707 } 2708 2709 /* this also allocates from the btrfs_bioset */ 2710 struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) 2711 { 2712 struct btrfs_io_bio *btrfs_bio; 2713 struct bio *bio; 2714 2715 bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); 2716 if (bio) { 2717 btrfs_bio = btrfs_io_bio(bio); 2718 btrfs_bio->csum = NULL; 2719 btrfs_bio->csum_allocated = NULL; 2720 btrfs_bio->end_io = NULL; 2721 } 2722 return bio; 2723 } 2724 2725 2726 static int __must_check submit_one_bio(struct bio *bio, int mirror_num, 2727 unsigned long bio_flags) 2728 { 2729 int ret = 0; 2730 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2731 struct page *page = bvec->bv_page; 2732 struct extent_io_tree *tree = bio->bi_private; 2733 u64 start; 2734 2735 start = page_offset(page) + bvec->bv_offset; 2736 2737 bio->bi_private = NULL; 2738 bio_get(bio); 2739 2740 if (tree->ops && tree->ops->submit_bio_hook) 2741 ret = tree->ops->submit_bio_hook(page->mapping->host, bio, 2742 mirror_num, bio_flags, start); 2743 else 2744 btrfsic_submit_bio(bio); 2745 2746 bio_put(bio); 2747 return ret; 2748 } 2749 2750 static int merge_bio(struct extent_io_tree *tree, struct page *page, 2751 unsigned long offset, size_t size, struct bio *bio, 2752 unsigned long bio_flags) 2753 { 2754 int ret = 0; 2755 if (tree->ops && tree->ops->merge_bio_hook) 2756 ret = tree->ops->merge_bio_hook(page, offset, size, bio, 2757 bio_flags); 2758 return ret; 2759 2760 } 2761 2762 static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, 2763 struct writeback_control *wbc, 2764 struct page *page, sector_t sector, 2765 size_t size, unsigned long offset, 2766 struct block_device *bdev, 2767 struct bio **bio_ret, 2768 unsigned long max_pages, 2769 bio_end_io_t end_io_func, 2770 int mirror_num, 2771 unsigned long prev_bio_flags, 2772 unsigned long bio_flags, 2773 bool force_bio_submit) 2774 { 2775 int ret = 0; 2776 struct bio *bio; 2777 int contig = 0; 2778 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 2779 size_t page_size = min_t(size_t, size, PAGE_SIZE); 2780 2781 if (bio_ret && *bio_ret) { 2782 bio = *bio_ret; 2783 if (old_compressed) 2784 contig = bio->bi_iter.bi_sector == sector; 2785 else 2786 contig = bio_end_sector(bio) == sector; 2787 2788 if (prev_bio_flags != bio_flags || !contig || 2789 force_bio_submit || 
2790 merge_bio(tree, page, offset, page_size, bio, bio_flags) || 2791 bio_add_page(bio, page, page_size, offset) < page_size) { 2792 ret = submit_one_bio(bio, mirror_num, prev_bio_flags); 2793 if (ret < 0) { 2794 *bio_ret = NULL; 2795 return ret; 2796 } 2797 bio = NULL; 2798 } else { 2799 if (wbc) 2800 wbc_account_io(wbc, page, page_size); 2801 return 0; 2802 } 2803 } 2804 2805 bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES, 2806 GFP_NOFS | __GFP_HIGH); 2807 if (!bio) 2808 return -ENOMEM; 2809 2810 bio_add_page(bio, page, page_size, offset); 2811 bio->bi_end_io = end_io_func; 2812 bio->bi_private = tree; 2813 bio_set_op_attrs(bio, op, op_flags); 2814 if (wbc) { 2815 wbc_init_bio(wbc, bio); 2816 wbc_account_io(wbc, page, page_size); 2817 } 2818 2819 if (bio_ret) 2820 *bio_ret = bio; 2821 else 2822 ret = submit_one_bio(bio, mirror_num, bio_flags); 2823 2824 return ret; 2825 } 2826 2827 static void attach_extent_buffer_page(struct extent_buffer *eb, 2828 struct page *page) 2829 { 2830 if (!PagePrivate(page)) { 2831 SetPagePrivate(page); 2832 get_page(page); 2833 set_page_private(page, (unsigned long)eb); 2834 } else { 2835 WARN_ON(page->private != (unsigned long)eb); 2836 } 2837 } 2838 2839 void set_page_extent_mapped(struct page *page) 2840 { 2841 if (!PagePrivate(page)) { 2842 SetPagePrivate(page); 2843 get_page(page); 2844 set_page_private(page, EXTENT_PAGE_PRIVATE); 2845 } 2846 } 2847 2848 static struct extent_map * 2849 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, 2850 u64 start, u64 len, get_extent_t *get_extent, 2851 struct extent_map **em_cached) 2852 { 2853 struct extent_map *em; 2854 2855 if (em_cached && *em_cached) { 2856 em = *em_cached; 2857 if (extent_map_in_tree(em) && start >= em->start && 2858 start < extent_map_end(em)) { 2859 atomic_inc(&em->refs); 2860 return em; 2861 } 2862 2863 free_extent_map(em); 2864 *em_cached = NULL; 2865 } 2866 2867 em = get_extent(inode, page, pg_offset, start, len, 0); 2868 if (em_cached && !IS_ERR_OR_NULL(em)) { 2869 BUG_ON(*em_cached); 2870 atomic_inc(&em->refs); 2871 *em_cached = em; 2872 } 2873 return em; 2874 } 2875 /* 2876 * basic readpage implementation. 
Locked extent state structs are inserted 2877 * into the tree that are removed when the IO is done (by the end_io 2878 * handlers) 2879 * XXX JDM: This needs looking at to ensure proper page locking 2880 * return 0 on success, otherwise return error 2881 */ 2882 static int __do_readpage(struct extent_io_tree *tree, 2883 struct page *page, 2884 get_extent_t *get_extent, 2885 struct extent_map **em_cached, 2886 struct bio **bio, int mirror_num, 2887 unsigned long *bio_flags, int read_flags, 2888 u64 *prev_em_start) 2889 { 2890 struct inode *inode = page->mapping->host; 2891 u64 start = page_offset(page); 2892 u64 page_end = start + PAGE_SIZE - 1; 2893 u64 end; 2894 u64 cur = start; 2895 u64 extent_offset; 2896 u64 last_byte = i_size_read(inode); 2897 u64 block_start; 2898 u64 cur_end; 2899 sector_t sector; 2900 struct extent_map *em; 2901 struct block_device *bdev; 2902 int ret = 0; 2903 int nr = 0; 2904 size_t pg_offset = 0; 2905 size_t iosize; 2906 size_t disk_io_size; 2907 size_t blocksize = inode->i_sb->s_blocksize; 2908 unsigned long this_bio_flag = 0; 2909 2910 set_page_extent_mapped(page); 2911 2912 end = page_end; 2913 if (!PageUptodate(page)) { 2914 if (cleancache_get_page(page) == 0) { 2915 BUG_ON(blocksize != PAGE_SIZE); 2916 unlock_extent(tree, start, end); 2917 goto out; 2918 } 2919 } 2920 2921 if (page->index == last_byte >> PAGE_SHIFT) { 2922 char *userpage; 2923 size_t zero_offset = last_byte & (PAGE_SIZE - 1); 2924 2925 if (zero_offset) { 2926 iosize = PAGE_SIZE - zero_offset; 2927 userpage = kmap_atomic(page); 2928 memset(userpage + zero_offset, 0, iosize); 2929 flush_dcache_page(page); 2930 kunmap_atomic(userpage); 2931 } 2932 } 2933 while (cur <= end) { 2934 unsigned long pnr = (last_byte >> PAGE_SHIFT) + 1; 2935 bool force_bio_submit = false; 2936 2937 if (cur >= last_byte) { 2938 char *userpage; 2939 struct extent_state *cached = NULL; 2940 2941 iosize = PAGE_SIZE - pg_offset; 2942 userpage = kmap_atomic(page); 2943 memset(userpage + pg_offset, 0, iosize); 2944 flush_dcache_page(page); 2945 kunmap_atomic(userpage); 2946 set_extent_uptodate(tree, cur, cur + iosize - 1, 2947 &cached, GFP_NOFS); 2948 unlock_extent_cached(tree, cur, 2949 cur + iosize - 1, 2950 &cached, GFP_NOFS); 2951 break; 2952 } 2953 em = __get_extent_map(inode, page, pg_offset, cur, 2954 end - cur + 1, get_extent, em_cached); 2955 if (IS_ERR_OR_NULL(em)) { 2956 SetPageError(page); 2957 unlock_extent(tree, cur, end); 2958 break; 2959 } 2960 extent_offset = cur - em->start; 2961 BUG_ON(extent_map_end(em) <= cur); 2962 BUG_ON(end < cur); 2963 2964 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2965 this_bio_flag |= EXTENT_BIO_COMPRESSED; 2966 extent_set_compress_type(&this_bio_flag, 2967 em->compress_type); 2968 } 2969 2970 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2971 cur_end = min(extent_map_end(em) - 1, end); 2972 iosize = ALIGN(iosize, blocksize); 2973 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2974 disk_io_size = em->block_len; 2975 sector = em->block_start >> 9; 2976 } else { 2977 sector = (em->block_start + extent_offset) >> 9; 2978 disk_io_size = iosize; 2979 } 2980 bdev = em->bdev; 2981 block_start = em->block_start; 2982 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2983 block_start = EXTENT_MAP_HOLE; 2984 2985 /* 2986 * If we have a file range that points to a compressed extent 2987 * and it's followed by a consecutive file range that points to 2988 * to the same compressed extent (possibly with a different 2989 * offset and/or length, so it either points to the whole 
extent 2990 * or only part of it), we must make sure we do not submit a 2991 * single bio to populate the pages for the 2 ranges because 2992 * this makes the compressed extent read zero out the pages 2993 * belonging to the 2nd range. Imagine the following scenario: 2994 * 2995 * File layout 2996 * [0 - 8K] [8K - 24K] 2997 * | | 2998 * | | 2999 * points to extent X, points to extent X, 3000 * offset 4K, length of 8K offset 0, length 16K 3001 * 3002 * [extent X, compressed length = 4K uncompressed length = 16K] 3003 * 3004 * If the bio to read the compressed extent covers both ranges, 3005 * it will decompress extent X into the pages belonging to the 3006 * first range and then it will stop, zeroing out the remaining 3007 * pages that belong to the other range that points to extent X. 3008 * So here we make sure we submit 2 bios, one for the first 3009 * range and another one for the second range. Both will target 3010 * the same physical extent from disk, but we can't currently 3011 * make the compressed bio endio callback populate the pages 3012 * for both ranges because each compressed bio is tightly 3013 * coupled with a single extent map, and each range can have 3014 * an extent map with a different offset value relative to the 3015 * uncompressed data of our extent and different lengths. This 3016 * is a corner case so we prioritize correctness and accept the 3017 * non-optimal behavior (submitting 2 bios for the same extent). 3018 */ 3019 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && 3020 prev_em_start && *prev_em_start != (u64)-1 && 3021 *prev_em_start != em->orig_start) 3022 force_bio_submit = true; 3023 3024 if (prev_em_start) 3025 *prev_em_start = em->orig_start; 3026 3027 free_extent_map(em); 3028 em = NULL; 3029 3030 /* we've found a hole, just zero and go on */ 3031 if (block_start == EXTENT_MAP_HOLE) { 3032 char *userpage; 3033 struct extent_state *cached = NULL; 3034 3035 userpage = kmap_atomic(page); 3036 memset(userpage + pg_offset, 0, iosize); 3037 flush_dcache_page(page); 3038 kunmap_atomic(userpage); 3039 3040 set_extent_uptodate(tree, cur, cur + iosize - 1, 3041 &cached, GFP_NOFS); 3042 unlock_extent_cached(tree, cur, 3043 cur + iosize - 1, 3044 &cached, GFP_NOFS); 3045 cur = cur + iosize; 3046 pg_offset += iosize; 3047 continue; 3048 } 3049 /* the get_extent function already copied into the page */ 3050 if (test_range_bit(tree, cur, cur_end, 3051 EXTENT_UPTODATE, 1, NULL)) { 3052 check_page_uptodate(tree, page); 3053 unlock_extent(tree, cur, cur + iosize - 1); 3054 cur = cur + iosize; 3055 pg_offset += iosize; 3056 continue; 3057 } 3058 /* we have an inline extent but it didn't get marked up 3059 * to date.
Error out 3060 */ 3061 if (block_start == EXTENT_MAP_INLINE) { 3062 SetPageError(page); 3063 unlock_extent(tree, cur, cur + iosize - 1); 3064 cur = cur + iosize; 3065 pg_offset += iosize; 3066 continue; 3067 } 3068 3069 pnr -= page->index; 3070 ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL, 3071 page, sector, disk_io_size, pg_offset, 3072 bdev, bio, pnr, 3073 end_bio_extent_readpage, mirror_num, 3074 *bio_flags, 3075 this_bio_flag, 3076 force_bio_submit); 3077 if (!ret) { 3078 nr++; 3079 *bio_flags = this_bio_flag; 3080 } else { 3081 SetPageError(page); 3082 unlock_extent(tree, cur, cur + iosize - 1); 3083 goto out; 3084 } 3085 cur = cur + iosize; 3086 pg_offset += iosize; 3087 } 3088 out: 3089 if (!nr) { 3090 if (!PageError(page)) 3091 SetPageUptodate(page); 3092 unlock_page(page); 3093 } 3094 return ret; 3095 } 3096 3097 static inline void __do_contiguous_readpages(struct extent_io_tree *tree, 3098 struct page *pages[], int nr_pages, 3099 u64 start, u64 end, 3100 get_extent_t *get_extent, 3101 struct extent_map **em_cached, 3102 struct bio **bio, int mirror_num, 3103 unsigned long *bio_flags, 3104 u64 *prev_em_start) 3105 { 3106 struct inode *inode; 3107 struct btrfs_ordered_extent *ordered; 3108 int index; 3109 3110 inode = pages[0]->mapping->host; 3111 while (1) { 3112 lock_extent(tree, start, end); 3113 ordered = btrfs_lookup_ordered_range(inode, start, 3114 end - start + 1); 3115 if (!ordered) 3116 break; 3117 unlock_extent(tree, start, end); 3118 btrfs_start_ordered_extent(inode, ordered, 1); 3119 btrfs_put_ordered_extent(ordered); 3120 } 3121 3122 for (index = 0; index < nr_pages; index++) { 3123 __do_readpage(tree, pages[index], get_extent, em_cached, bio, 3124 mirror_num, bio_flags, 0, prev_em_start); 3125 put_page(pages[index]); 3126 } 3127 } 3128 3129 static void __extent_readpages(struct extent_io_tree *tree, 3130 struct page *pages[], 3131 int nr_pages, get_extent_t *get_extent, 3132 struct extent_map **em_cached, 3133 struct bio **bio, int mirror_num, 3134 unsigned long *bio_flags, 3135 u64 *prev_em_start) 3136 { 3137 u64 start = 0; 3138 u64 end = 0; 3139 u64 page_start; 3140 int index; 3141 int first_index = 0; 3142 3143 for (index = 0; index < nr_pages; index++) { 3144 page_start = page_offset(pages[index]); 3145 if (!end) { 3146 start = page_start; 3147 end = start + PAGE_SIZE - 1; 3148 first_index = index; 3149 } else if (end + 1 == page_start) { 3150 end += PAGE_SIZE; 3151 } else { 3152 __do_contiguous_readpages(tree, &pages[first_index], 3153 index - first_index, start, 3154 end, get_extent, em_cached, 3155 bio, mirror_num, bio_flags, 3156 prev_em_start); 3157 start = page_start; 3158 end = start + PAGE_SIZE - 1; 3159 first_index = index; 3160 } 3161 } 3162 3163 if (end) 3164 __do_contiguous_readpages(tree, &pages[first_index], 3165 index - first_index, start, 3166 end, get_extent, em_cached, bio, 3167 mirror_num, bio_flags, 3168 prev_em_start); 3169 } 3170 3171 static int __extent_read_full_page(struct extent_io_tree *tree, 3172 struct page *page, 3173 get_extent_t *get_extent, 3174 struct bio **bio, int mirror_num, 3175 unsigned long *bio_flags, int read_flags) 3176 { 3177 struct inode *inode = page->mapping->host; 3178 struct btrfs_ordered_extent *ordered; 3179 u64 start = page_offset(page); 3180 u64 end = start + PAGE_SIZE - 1; 3181 int ret; 3182 3183 while (1) { 3184 lock_extent(tree, start, end); 3185 ordered = btrfs_lookup_ordered_range(inode, start, 3186 PAGE_SIZE); 3187 if (!ordered) 3188 break; 3189 unlock_extent(tree, start, end); 3190 
btrfs_start_ordered_extent(inode, ordered, 1); 3191 btrfs_put_ordered_extent(ordered); 3192 } 3193 3194 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, 3195 bio_flags, read_flags, NULL); 3196 return ret; 3197 } 3198 3199 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 3200 get_extent_t *get_extent, int mirror_num) 3201 { 3202 struct bio *bio = NULL; 3203 unsigned long bio_flags = 0; 3204 int ret; 3205 3206 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 3207 &bio_flags, 0); 3208 if (bio) 3209 ret = submit_one_bio(bio, mirror_num, bio_flags); 3210 return ret; 3211 } 3212 3213 static void update_nr_written(struct page *page, struct writeback_control *wbc, 3214 unsigned long nr_written) 3215 { 3216 wbc->nr_to_write -= nr_written; 3217 } 3218 3219 /* 3220 * helper for __extent_writepage, doing all of the delayed allocation setup. 3221 * 3222 * This returns 1 if our fill_delalloc function did all the work required 3223 * to write the page (copy into inline extent). In this case the IO has 3224 * been started and the page is already unlocked. 3225 * 3226 * This returns 0 if all went well (page still locked) 3227 * This returns < 0 if there were errors (page still locked) 3228 */ 3229 static noinline_for_stack int writepage_delalloc(struct inode *inode, 3230 struct page *page, struct writeback_control *wbc, 3231 struct extent_page_data *epd, 3232 u64 delalloc_start, 3233 unsigned long *nr_written) 3234 { 3235 struct extent_io_tree *tree = epd->tree; 3236 u64 page_end = delalloc_start + PAGE_SIZE - 1; 3237 u64 nr_delalloc; 3238 u64 delalloc_to_write = 0; 3239 u64 delalloc_end = 0; 3240 int ret; 3241 int page_started = 0; 3242 3243 if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) 3244 return 0; 3245 3246 while (delalloc_end < page_end) { 3247 nr_delalloc = find_lock_delalloc_range(inode, tree, 3248 page, 3249 &delalloc_start, 3250 &delalloc_end, 3251 BTRFS_MAX_EXTENT_SIZE); 3252 if (nr_delalloc == 0) { 3253 delalloc_start = delalloc_end + 1; 3254 continue; 3255 } 3256 ret = tree->ops->fill_delalloc(inode, page, 3257 delalloc_start, 3258 delalloc_end, 3259 &page_started, 3260 nr_written); 3261 /* File system has been set read-only */ 3262 if (ret) { 3263 SetPageError(page); 3264 /* fill_delalloc should be return < 0 for error 3265 * but just in case, we use > 0 here meaning the 3266 * IO is started, so we don't want to return > 0 3267 * unless things are going well. 3268 */ 3269 ret = ret < 0 ? ret : -EIO; 3270 goto done; 3271 } 3272 /* 3273 * delalloc_end is already one less than the total length, so 3274 * we don't subtract one from PAGE_SIZE 3275 */ 3276 delalloc_to_write += (delalloc_end - delalloc_start + 3277 PAGE_SIZE) >> PAGE_SHIFT; 3278 delalloc_start = delalloc_end + 1; 3279 } 3280 if (wbc->nr_to_write < delalloc_to_write) { 3281 int thresh = 8192; 3282 3283 if (delalloc_to_write < thresh * 2) 3284 thresh = delalloc_to_write; 3285 wbc->nr_to_write = min_t(u64, delalloc_to_write, 3286 thresh); 3287 } 3288 3289 /* did the fill delalloc function already unlock and start 3290 * the IO? 3291 */ 3292 if (page_started) { 3293 /* 3294 * we've unlocked the page, so we can't update 3295 * the mapping's writeback index, just update 3296 * nr_to_write. 3297 */ 3298 wbc->nr_to_write -= *nr_written; 3299 return 1; 3300 } 3301 3302 ret = 0; 3303 3304 done: 3305 return ret; 3306 } 3307 3308 /* 3309 * helper for __extent_writepage. 
This calls the writepage start hooks, 3310 * and does the loop to map the page into extents and bios. 3311 * 3312 * We return 1 if the IO is started and the page is unlocked, 3313 * 0 if all went well (page still locked) 3314 * < 0 if there were errors (page still locked) 3315 */ 3316 static noinline_for_stack int __extent_writepage_io(struct inode *inode, 3317 struct page *page, 3318 struct writeback_control *wbc, 3319 struct extent_page_data *epd, 3320 loff_t i_size, 3321 unsigned long nr_written, 3322 int write_flags, int *nr_ret) 3323 { 3324 struct extent_io_tree *tree = epd->tree; 3325 u64 start = page_offset(page); 3326 u64 page_end = start + PAGE_SIZE - 1; 3327 u64 end; 3328 u64 cur = start; 3329 u64 extent_offset; 3330 u64 block_start; 3331 u64 iosize; 3332 sector_t sector; 3333 struct extent_state *cached_state = NULL; 3334 struct extent_map *em; 3335 struct block_device *bdev; 3336 size_t pg_offset = 0; 3337 size_t blocksize; 3338 int ret = 0; 3339 int nr = 0; 3340 bool compressed; 3341 3342 if (tree->ops && tree->ops->writepage_start_hook) { 3343 ret = tree->ops->writepage_start_hook(page, start, 3344 page_end); 3345 if (ret) { 3346 /* Fixup worker will requeue */ 3347 if (ret == -EBUSY) 3348 wbc->pages_skipped++; 3349 else 3350 redirty_page_for_writepage(wbc, page); 3351 3352 update_nr_written(page, wbc, nr_written); 3353 unlock_page(page); 3354 ret = 1; 3355 goto done_unlocked; 3356 } 3357 } 3358 3359 /* 3360 * we don't want to touch the inode after unlocking the page, 3361 * so we update the mapping writeback index now 3362 */ 3363 update_nr_written(page, wbc, nr_written + 1); 3364 3365 end = page_end; 3366 if (i_size <= start) { 3367 if (tree->ops && tree->ops->writepage_end_io_hook) 3368 tree->ops->writepage_end_io_hook(page, start, 3369 page_end, NULL, 1); 3370 goto done; 3371 } 3372 3373 blocksize = inode->i_sb->s_blocksize; 3374 3375 while (cur <= end) { 3376 u64 em_end; 3377 unsigned long max_nr; 3378 3379 if (cur >= i_size) { 3380 if (tree->ops && tree->ops->writepage_end_io_hook) 3381 tree->ops->writepage_end_io_hook(page, cur, 3382 page_end, NULL, 1); 3383 break; 3384 } 3385 em = epd->get_extent(inode, page, pg_offset, cur, 3386 end - cur + 1, 1); 3387 if (IS_ERR_OR_NULL(em)) { 3388 SetPageError(page); 3389 ret = PTR_ERR_OR_ZERO(em); 3390 break; 3391 } 3392 3393 extent_offset = cur - em->start; 3394 em_end = extent_map_end(em); 3395 BUG_ON(em_end <= cur); 3396 BUG_ON(end < cur); 3397 iosize = min(em_end - cur, end - cur + 1); 3398 iosize = ALIGN(iosize, blocksize); 3399 sector = (em->block_start + extent_offset) >> 9; 3400 bdev = em->bdev; 3401 block_start = em->block_start; 3402 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3403 free_extent_map(em); 3404 em = NULL; 3405 3406 /* 3407 * compressed and inline extents are written through other 3408 * paths in the FS 3409 */ 3410 if (compressed || block_start == EXTENT_MAP_HOLE || 3411 block_start == EXTENT_MAP_INLINE) { 3412 /* 3413 * end_io notification does not happen here for 3414 * compressed extents 3415 */ 3416 if (!compressed && tree->ops && 3417 tree->ops->writepage_end_io_hook) 3418 tree->ops->writepage_end_io_hook(page, cur, 3419 cur + iosize - 1, 3420 NULL, 1); 3421 else if (compressed) { 3422 /* we don't want to end_page_writeback on 3423 * a compressed extent. 
this happens 3424 * elsewhere 3425 */ 3426 nr++; 3427 } 3428 3429 cur += iosize; 3430 pg_offset += iosize; 3431 continue; 3432 } 3433 3434 max_nr = (i_size >> PAGE_SHIFT) + 1; 3435 3436 set_range_writeback(tree, cur, cur + iosize - 1); 3437 if (!PageWriteback(page)) { 3438 btrfs_err(BTRFS_I(inode)->root->fs_info, 3439 "page %lu not writeback, cur %llu end %llu", 3440 page->index, cur, end); 3441 } 3442 3443 ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, 3444 page, sector, iosize, pg_offset, 3445 bdev, &epd->bio, max_nr, 3446 end_bio_extent_writepage, 3447 0, 0, 0, false); 3448 if (ret) 3449 SetPageError(page); 3450 3451 cur = cur + iosize; 3452 pg_offset += iosize; 3453 nr++; 3454 } 3455 done: 3456 *nr_ret = nr; 3457 3458 done_unlocked: 3459 3460 /* drop our reference on any cached states */ 3461 free_extent_state(cached_state); 3462 return ret; 3463 } 3464 3465 /* 3466 * the writepage semantics are similar to regular writepage. extent 3467 * records are inserted to lock ranges in the tree, and as dirty areas 3468 * are found, they are marked writeback. Then the lock bits are removed 3469 * and the end_io handler clears the writeback ranges 3470 */ 3471 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 3472 void *data) 3473 { 3474 struct inode *inode = page->mapping->host; 3475 struct extent_page_data *epd = data; 3476 u64 start = page_offset(page); 3477 u64 page_end = start + PAGE_SIZE - 1; 3478 int ret; 3479 int nr = 0; 3480 size_t pg_offset = 0; 3481 loff_t i_size = i_size_read(inode); 3482 unsigned long end_index = i_size >> PAGE_SHIFT; 3483 int write_flags = 0; 3484 unsigned long nr_written = 0; 3485 3486 if (wbc->sync_mode == WB_SYNC_ALL) 3487 write_flags = REQ_SYNC; 3488 3489 trace___extent_writepage(page, inode, wbc); 3490 3491 WARN_ON(!PageLocked(page)); 3492 3493 ClearPageError(page); 3494 3495 pg_offset = i_size & (PAGE_SIZE - 1); 3496 if (page->index > end_index || 3497 (page->index == end_index && !pg_offset)) { 3498 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); 3499 unlock_page(page); 3500 return 0; 3501 } 3502 3503 if (page->index == end_index) { 3504 char *userpage; 3505 3506 userpage = kmap_atomic(page); 3507 memset(userpage + pg_offset, 0, 3508 PAGE_SIZE - pg_offset); 3509 kunmap_atomic(userpage); 3510 flush_dcache_page(page); 3511 } 3512 3513 pg_offset = 0; 3514 3515 set_page_extent_mapped(page); 3516 3517 ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); 3518 if (ret == 1) 3519 goto done_unlocked; 3520 if (ret) 3521 goto done; 3522 3523 ret = __extent_writepage_io(inode, page, wbc, epd, 3524 i_size, nr_written, write_flags, &nr); 3525 if (ret == 1) 3526 goto done_unlocked; 3527 3528 done: 3529 if (nr == 0) { 3530 /* make sure the mapping tag for page dirty gets cleared */ 3531 set_page_writeback(page); 3532 end_page_writeback(page); 3533 } 3534 if (PageError(page)) { 3535 ret = ret < 0 ? 
ret : -EIO; 3536 end_extent_writepage(page, ret, start, page_end); 3537 } 3538 unlock_page(page); 3539 return ret; 3540 3541 done_unlocked: 3542 return 0; 3543 } 3544 3545 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3546 { 3547 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, 3548 TASK_UNINTERRUPTIBLE); 3549 } 3550 3551 static noinline_for_stack int 3552 lock_extent_buffer_for_io(struct extent_buffer *eb, 3553 struct btrfs_fs_info *fs_info, 3554 struct extent_page_data *epd) 3555 { 3556 unsigned long i, num_pages; 3557 int flush = 0; 3558 int ret = 0; 3559 3560 if (!btrfs_try_tree_write_lock(eb)) { 3561 flush = 1; 3562 flush_write_bio(epd); 3563 btrfs_tree_lock(eb); 3564 } 3565 3566 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3567 btrfs_tree_unlock(eb); 3568 if (!epd->sync_io) 3569 return 0; 3570 if (!flush) { 3571 flush_write_bio(epd); 3572 flush = 1; 3573 } 3574 while (1) { 3575 wait_on_extent_buffer_writeback(eb); 3576 btrfs_tree_lock(eb); 3577 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3578 break; 3579 btrfs_tree_unlock(eb); 3580 } 3581 } 3582 3583 /* 3584 * We need to do this to prevent races in people who check if the eb is 3585 * under IO since we can end up having no IO bits set for a short period 3586 * of time. 3587 */ 3588 spin_lock(&eb->refs_lock); 3589 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3590 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3591 spin_unlock(&eb->refs_lock); 3592 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3593 __percpu_counter_add(&fs_info->dirty_metadata_bytes, 3594 -eb->len, 3595 fs_info->dirty_metadata_batch); 3596 ret = 1; 3597 } else { 3598 spin_unlock(&eb->refs_lock); 3599 } 3600 3601 btrfs_tree_unlock(eb); 3602 3603 if (!ret) 3604 return ret; 3605 3606 num_pages = num_extent_pages(eb->start, eb->len); 3607 for (i = 0; i < num_pages; i++) { 3608 struct page *p = eb->pages[i]; 3609 3610 if (!trylock_page(p)) { 3611 if (!flush) { 3612 flush_write_bio(epd); 3613 flush = 1; 3614 } 3615 lock_page(p); 3616 } 3617 } 3618 3619 return ret; 3620 } 3621 3622 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3623 { 3624 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3625 smp_mb__after_atomic(); 3626 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3627 } 3628 3629 static void set_btree_ioerr(struct page *page) 3630 { 3631 struct extent_buffer *eb = (struct extent_buffer *)page->private; 3632 3633 SetPageError(page); 3634 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 3635 return; 3636 3637 /* 3638 * If writeback for a btree extent that doesn't belong to a log tree 3639 * failed, increment the counter transaction->eb_write_errors. 3640 * We do this because while the transaction is running and before it's 3641 * committing (when we call filemap_fdata[write|wait]_range against 3642 * the btree inode), we might have 3643 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 3644 * returns an error or an error happens during writeback, when we're 3645 * committing the transaction we wouldn't know about it, since the pages 3646 * can be no longer dirty nor marked anymore for writeback (if a 3647 * subsequent modification to the extent buffer didn't happen before the 3648 * transaction commit), which makes filemap_fdata[write|wait]_range not 3649 * able to find the pages tagged with SetPageError at transaction 3650 * commit time. 
So if this happens we must abort the transaction, 3651 * otherwise we commit a super block with btree roots that point to 3652 * btree nodes/leafs whose content on disk is invalid - either garbage 3653 * or the content of some node/leaf from a past generation that got 3654 * cowed or deleted and is no longer valid. 3655 * 3656 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 3657 * not be enough - we need to distinguish between log tree extents vs 3658 * non-log tree extents, and the next filemap_fdatawait_range() call 3659 * will catch and clear such errors in the mapping - and that call might 3660 * be from a log sync and not from a transaction commit. Also, checking 3661 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 3662 * not done and would not be reliable - the eb might have been released 3663 * from memory and reading it back again means that flag would not be 3664 * set (since it's a runtime flag, not persisted on disk). 3665 * 3666 * Using the flags below in the btree inode also makes us achieve the 3667 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 3668 * writeback for all dirty pages and before filemap_fdatawait_range() 3669 * is called, the writeback for all dirty pages had already finished 3670 * with errors - because we were not using AS_EIO/AS_ENOSPC, 3671 * filemap_fdatawait_range() would return success, as it could not know 3672 * that writeback errors happened (the pages were no longer tagged for 3673 * writeback). 3674 */ 3675 switch (eb->log_index) { 3676 case -1: 3677 set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags); 3678 break; 3679 case 0: 3680 set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags); 3681 break; 3682 case 1: 3683 set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags); 3684 break; 3685 default: 3686 BUG(); /* unexpected, logic error */ 3687 } 3688 } 3689 3690 static void end_bio_extent_buffer_writepage(struct bio *bio) 3691 { 3692 struct bio_vec *bvec; 3693 struct extent_buffer *eb; 3694 int i, done; 3695 3696 bio_for_each_segment_all(bvec, bio, i) { 3697 struct page *page = bvec->bv_page; 3698 3699 eb = (struct extent_buffer *)page->private; 3700 BUG_ON(!eb); 3701 done = atomic_dec_and_test(&eb->io_pages); 3702 3703 if (bio->bi_error || 3704 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { 3705 ClearPageUptodate(page); 3706 set_btree_ioerr(page); 3707 } 3708 3709 end_page_writeback(page); 3710 3711 if (!done) 3712 continue; 3713 3714 end_extent_buffer_writeback(eb); 3715 } 3716 3717 bio_put(bio); 3718 } 3719 3720 static noinline_for_stack int write_one_eb(struct extent_buffer *eb, 3721 struct btrfs_fs_info *fs_info, 3722 struct writeback_control *wbc, 3723 struct extent_page_data *epd) 3724 { 3725 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3726 struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; 3727 u64 offset = eb->start; 3728 u32 nritems; 3729 unsigned long i, num_pages; 3730 unsigned long bio_flags = 0; 3731 unsigned long start, end; 3732 int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; 3733 int ret = 0; 3734 3735 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 3736 num_pages = num_extent_pages(eb->start, eb->len); 3737 atomic_set(&eb->io_pages, num_pages); 3738 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3739 bio_flags = EXTENT_BIO_TREE_LOG; 3740 3741 /* set btree blocks beyond nritems with 0 to avoid stale content. 
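* For a node this is the tail after the last key pointer, i.e. from
* btrfs_node_key_ptr_offset(nritems) to the end of the buffer; for a leaf
* it is the unused gap between the last item header at
* btrfs_item_nr_offset(nritems) and the start of the item data (which
* grows down from the end of the block), as the code below does.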
*/ 3742 nritems = btrfs_header_nritems(eb); 3743 if (btrfs_header_level(eb) > 0) { 3744 end = btrfs_node_key_ptr_offset(nritems); 3745 3746 memzero_extent_buffer(eb, end, eb->len - end); 3747 } else { 3748 /* 3749 * leaf: 3750 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 3751 */ 3752 start = btrfs_item_nr_offset(nritems); 3753 end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb); 3754 memzero_extent_buffer(eb, start, end - start); 3755 } 3756 3757 for (i = 0; i < num_pages; i++) { 3758 struct page *p = eb->pages[i]; 3759 3760 clear_page_dirty_for_io(p); 3761 set_page_writeback(p); 3762 ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, 3763 p, offset >> 9, PAGE_SIZE, 0, bdev, 3764 &epd->bio, -1, 3765 end_bio_extent_buffer_writepage, 3766 0, epd->bio_flags, bio_flags, false); 3767 epd->bio_flags = bio_flags; 3768 if (ret) { 3769 set_btree_ioerr(p); 3770 end_page_writeback(p); 3771 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3772 end_extent_buffer_writeback(eb); 3773 ret = -EIO; 3774 break; 3775 } 3776 offset += PAGE_SIZE; 3777 update_nr_written(p, wbc, 1); 3778 unlock_page(p); 3779 } 3780 3781 if (unlikely(ret)) { 3782 for (; i < num_pages; i++) { 3783 struct page *p = eb->pages[i]; 3784 clear_page_dirty_for_io(p); 3785 unlock_page(p); 3786 } 3787 } 3788 3789 return ret; 3790 } 3791 3792 int btree_write_cache_pages(struct address_space *mapping, 3793 struct writeback_control *wbc) 3794 { 3795 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3796 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; 3797 struct extent_buffer *eb, *prev_eb = NULL; 3798 struct extent_page_data epd = { 3799 .bio = NULL, 3800 .tree = tree, 3801 .extent_locked = 0, 3802 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3803 .bio_flags = 0, 3804 }; 3805 int ret = 0; 3806 int done = 0; 3807 int nr_to_write_done = 0; 3808 struct pagevec pvec; 3809 int nr_pages; 3810 pgoff_t index; 3811 pgoff_t end; /* Inclusive */ 3812 int scanned = 0; 3813 int tag; 3814 3815 pagevec_init(&pvec, 0); 3816 if (wbc->range_cyclic) { 3817 index = mapping->writeback_index; /* Start from prev offset */ 3818 end = -1; 3819 } else { 3820 index = wbc->range_start >> PAGE_SHIFT; 3821 end = wbc->range_end >> PAGE_SHIFT; 3822 scanned = 1; 3823 } 3824 if (wbc->sync_mode == WB_SYNC_ALL) 3825 tag = PAGECACHE_TAG_TOWRITE; 3826 else 3827 tag = PAGECACHE_TAG_DIRTY; 3828 retry: 3829 if (wbc->sync_mode == WB_SYNC_ALL) 3830 tag_pages_for_writeback(mapping, index, end); 3831 while (!done && !nr_to_write_done && (index <= end) && 3832 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3833 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3834 unsigned i; 3835 3836 scanned = 1; 3837 for (i = 0; i < nr_pages; i++) { 3838 struct page *page = pvec.pages[i]; 3839 3840 if (!PagePrivate(page)) 3841 continue; 3842 3843 if (!wbc->range_cyclic && page->index > end) { 3844 done = 1; 3845 break; 3846 } 3847 3848 spin_lock(&mapping->private_lock); 3849 if (!PagePrivate(page)) { 3850 spin_unlock(&mapping->private_lock); 3851 continue; 3852 } 3853 3854 eb = (struct extent_buffer *)page->private; 3855 3856 /* 3857 * Shouldn't happen and normally this would be a BUG_ON 3858 * but no sense in crashing the users box for something 3859 * we can survive anyway. 
3860 */ 3861 if (WARN_ON(!eb)) { 3862 spin_unlock(&mapping->private_lock); 3863 continue; 3864 } 3865 3866 if (eb == prev_eb) { 3867 spin_unlock(&mapping->private_lock); 3868 continue; 3869 } 3870 3871 ret = atomic_inc_not_zero(&eb->refs); 3872 spin_unlock(&mapping->private_lock); 3873 if (!ret) 3874 continue; 3875 3876 prev_eb = eb; 3877 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3878 if (!ret) { 3879 free_extent_buffer(eb); 3880 continue; 3881 } 3882 3883 ret = write_one_eb(eb, fs_info, wbc, &epd); 3884 if (ret) { 3885 done = 1; 3886 free_extent_buffer(eb); 3887 break; 3888 } 3889 free_extent_buffer(eb); 3890 3891 /* 3892 * the filesystem may choose to bump up nr_to_write. 3893 * We have to make sure to honor the new nr_to_write 3894 * at any time 3895 */ 3896 nr_to_write_done = wbc->nr_to_write <= 0; 3897 } 3898 pagevec_release(&pvec); 3899 cond_resched(); 3900 } 3901 if (!scanned && !done) { 3902 /* 3903 * We hit the last page and there is more work to be done: wrap 3904 * back to the start of the file 3905 */ 3906 scanned = 1; 3907 index = 0; 3908 goto retry; 3909 } 3910 flush_write_bio(&epd); 3911 return ret; 3912 } 3913 3914 /** 3915 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 3916 * @mapping: address space structure to write 3917 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 3918 * @writepage: function called for each page 3919 * @data: data passed to writepage function 3920 * 3921 * If a page is already under I/O, write_cache_pages() skips it, even 3922 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 3923 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 3924 * and msync() need to guarantee that all the data which was dirty at the time 3925 * the call was made get new I/O started against them. If wbc->sync_mode is 3926 * WB_SYNC_ALL then we were called for data integrity and we must wait for 3927 * existing IO to complete. 3928 */ 3929 static int extent_write_cache_pages(struct extent_io_tree *tree, 3930 struct address_space *mapping, 3931 struct writeback_control *wbc, 3932 writepage_t writepage, void *data, 3933 void (*flush_fn)(void *)) 3934 { 3935 struct inode *inode = mapping->host; 3936 int ret = 0; 3937 int done = 0; 3938 int nr_to_write_done = 0; 3939 struct pagevec pvec; 3940 int nr_pages; 3941 pgoff_t index; 3942 pgoff_t end; /* Inclusive */ 3943 pgoff_t done_index; 3944 int range_whole = 0; 3945 int scanned = 0; 3946 int tag; 3947 3948 /* 3949 * We have to hold onto the inode so that ordered extents can do their 3950 * work when the IO finishes. The alternative to this is failing to add 3951 * an ordered extent if the igrab() fails there and that is a huge pain 3952 * to deal with, so instead just hold onto the inode throughout the 3953 * writepages operation. If it fails here we are freeing up the inode 3954 * anyway and we'd rather not waste our time writing out stuff that is 3955 * going to be truncated anyway. 
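* The reference taken here is dropped at the end of this function via
* btrfs_add_delayed_iput(), so a possible final iput() is not done
* directly in the writeback path.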
3956 */ 3957 if (!igrab(inode)) 3958 return 0; 3959 3960 pagevec_init(&pvec, 0); 3961 if (wbc->range_cyclic) { 3962 index = mapping->writeback_index; /* Start from prev offset */ 3963 end = -1; 3964 } else { 3965 index = wbc->range_start >> PAGE_SHIFT; 3966 end = wbc->range_end >> PAGE_SHIFT; 3967 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 3968 range_whole = 1; 3969 scanned = 1; 3970 } 3971 if (wbc->sync_mode == WB_SYNC_ALL) 3972 tag = PAGECACHE_TAG_TOWRITE; 3973 else 3974 tag = PAGECACHE_TAG_DIRTY; 3975 retry: 3976 if (wbc->sync_mode == WB_SYNC_ALL) 3977 tag_pages_for_writeback(mapping, index, end); 3978 done_index = index; 3979 while (!done && !nr_to_write_done && (index <= end) && 3980 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3981 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3982 unsigned i; 3983 3984 scanned = 1; 3985 for (i = 0; i < nr_pages; i++) { 3986 struct page *page = pvec.pages[i]; 3987 3988 done_index = page->index; 3989 /* 3990 * At this point we hold neither mapping->tree_lock nor 3991 * lock on the page itself: the page may be truncated or 3992 * invalidated (changing page->mapping to NULL), or even 3993 * swizzled back from swapper_space to tmpfs file 3994 * mapping 3995 */ 3996 if (!trylock_page(page)) { 3997 flush_fn(data); 3998 lock_page(page); 3999 } 4000 4001 if (unlikely(page->mapping != mapping)) { 4002 unlock_page(page); 4003 continue; 4004 } 4005 4006 if (!wbc->range_cyclic && page->index > end) { 4007 done = 1; 4008 unlock_page(page); 4009 continue; 4010 } 4011 4012 if (wbc->sync_mode != WB_SYNC_NONE) { 4013 if (PageWriteback(page)) 4014 flush_fn(data); 4015 wait_on_page_writeback(page); 4016 } 4017 4018 if (PageWriteback(page) || 4019 !clear_page_dirty_for_io(page)) { 4020 unlock_page(page); 4021 continue; 4022 } 4023 4024 ret = (*writepage)(page, wbc, data); 4025 4026 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 4027 unlock_page(page); 4028 ret = 0; 4029 } 4030 if (ret < 0) { 4031 /* 4032 * done_index is set past this page, 4033 * so media errors will not choke 4034 * background writeout for the entire 4035 * file. This has consequences for 4036 * range_cyclic semantics (ie. it may 4037 * not be suitable for data integrity 4038 * writeout). 4039 */ 4040 done_index = page->index + 1; 4041 done = 1; 4042 break; 4043 } 4044 4045 /* 4046 * the filesystem may choose to bump up nr_to_write. 4047 * We have to make sure to honor the new nr_to_write 4048 * at any time 4049 */ 4050 nr_to_write_done = wbc->nr_to_write <= 0; 4051 } 4052 pagevec_release(&pvec); 4053 cond_resched(); 4054 } 4055 if (!scanned && !done) { 4056 /* 4057 * We hit the last page and there is more work to be done: wrap 4058 * back to the start of the file 4059 */ 4060 scanned = 1; 4061 index = 0; 4062 goto retry; 4063 } 4064 4065 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 4066 mapping->writeback_index = done_index; 4067 4068 btrfs_add_delayed_iput(inode); 4069 return ret; 4070 } 4071 4072 static void flush_epd_write_bio(struct extent_page_data *epd) 4073 { 4074 if (epd->bio) { 4075 int ret; 4076 4077 bio_set_op_attrs(epd->bio, REQ_OP_WRITE, 4078 epd->sync_io ? 
REQ_SYNC : 0); 4079 4080 ret = submit_one_bio(epd->bio, 0, epd->bio_flags); 4081 BUG_ON(ret < 0); /* -ENOMEM */ 4082 epd->bio = NULL; 4083 } 4084 } 4085 4086 static noinline void flush_write_bio(void *data) 4087 { 4088 struct extent_page_data *epd = data; 4089 flush_epd_write_bio(epd); 4090 } 4091 4092 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 4093 get_extent_t *get_extent, 4094 struct writeback_control *wbc) 4095 { 4096 int ret; 4097 struct extent_page_data epd = { 4098 .bio = NULL, 4099 .tree = tree, 4100 .get_extent = get_extent, 4101 .extent_locked = 0, 4102 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4103 .bio_flags = 0, 4104 }; 4105 4106 ret = __extent_writepage(page, wbc, &epd); 4107 4108 flush_epd_write_bio(&epd); 4109 return ret; 4110 } 4111 4112 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 4113 u64 start, u64 end, get_extent_t *get_extent, 4114 int mode) 4115 { 4116 int ret = 0; 4117 struct address_space *mapping = inode->i_mapping; 4118 struct page *page; 4119 unsigned long nr_pages = (end - start + PAGE_SIZE) >> 4120 PAGE_SHIFT; 4121 4122 struct extent_page_data epd = { 4123 .bio = NULL, 4124 .tree = tree, 4125 .get_extent = get_extent, 4126 .extent_locked = 1, 4127 .sync_io = mode == WB_SYNC_ALL, 4128 .bio_flags = 0, 4129 }; 4130 struct writeback_control wbc_writepages = { 4131 .sync_mode = mode, 4132 .nr_to_write = nr_pages * 2, 4133 .range_start = start, 4134 .range_end = end + 1, 4135 }; 4136 4137 while (start <= end) { 4138 page = find_get_page(mapping, start >> PAGE_SHIFT); 4139 if (clear_page_dirty_for_io(page)) 4140 ret = __extent_writepage(page, &wbc_writepages, &epd); 4141 else { 4142 if (tree->ops && tree->ops->writepage_end_io_hook) 4143 tree->ops->writepage_end_io_hook(page, start, 4144 start + PAGE_SIZE - 1, 4145 NULL, 1); 4146 unlock_page(page); 4147 } 4148 put_page(page); 4149 start += PAGE_SIZE; 4150 } 4151 4152 flush_epd_write_bio(&epd); 4153 return ret; 4154 } 4155 4156 int extent_writepages(struct extent_io_tree *tree, 4157 struct address_space *mapping, 4158 get_extent_t *get_extent, 4159 struct writeback_control *wbc) 4160 { 4161 int ret = 0; 4162 struct extent_page_data epd = { 4163 .bio = NULL, 4164 .tree = tree, 4165 .get_extent = get_extent, 4166 .extent_locked = 0, 4167 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4168 .bio_flags = 0, 4169 }; 4170 4171 ret = extent_write_cache_pages(tree, mapping, wbc, 4172 __extent_writepage, &epd, 4173 flush_write_bio); 4174 flush_epd_write_bio(&epd); 4175 return ret; 4176 } 4177 4178 int extent_readpages(struct extent_io_tree *tree, 4179 struct address_space *mapping, 4180 struct list_head *pages, unsigned nr_pages, 4181 get_extent_t get_extent) 4182 { 4183 struct bio *bio = NULL; 4184 unsigned page_idx; 4185 unsigned long bio_flags = 0; 4186 struct page *pagepool[16]; 4187 struct page *page; 4188 struct extent_map *em_cached = NULL; 4189 int nr = 0; 4190 u64 prev_em_start = (u64)-1; 4191 4192 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 4193 page = list_entry(pages->prev, struct page, lru); 4194 4195 prefetchw(&page->flags); 4196 list_del(&page->lru); 4197 if (add_to_page_cache_lru(page, mapping, 4198 page->index, 4199 readahead_gfp_mask(mapping))) { 4200 put_page(page); 4201 continue; 4202 } 4203 4204 pagepool[nr++] = page; 4205 if (nr < ARRAY_SIZE(pagepool)) 4206 continue; 4207 __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, 4208 &bio, 0, &bio_flags, &prev_em_start); 4209 nr = 0; 4210 } 4211 if (nr) 4212 
__extent_readpages(tree, pagepool, nr, get_extent, &em_cached, 4213 &bio, 0, &bio_flags, &prev_em_start); 4214 4215 if (em_cached) 4216 free_extent_map(em_cached); 4217 4218 BUG_ON(!list_empty(pages)); 4219 if (bio) 4220 return submit_one_bio(bio, 0, bio_flags); 4221 return 0; 4222 } 4223 4224 /* 4225 * basic invalidatepage code, this waits on any locked or writeback 4226 * ranges corresponding to the page, and then deletes any extent state 4227 * records from the tree 4228 */ 4229 int extent_invalidatepage(struct extent_io_tree *tree, 4230 struct page *page, unsigned long offset) 4231 { 4232 struct extent_state *cached_state = NULL; 4233 u64 start = page_offset(page); 4234 u64 end = start + PAGE_SIZE - 1; 4235 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 4236 4237 start += ALIGN(offset, blocksize); 4238 if (start > end) 4239 return 0; 4240 4241 lock_extent_bits(tree, start, end, &cached_state); 4242 wait_on_page_writeback(page); 4243 clear_extent_bit(tree, start, end, 4244 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 4245 EXTENT_DO_ACCOUNTING, 4246 1, 1, &cached_state, GFP_NOFS); 4247 return 0; 4248 } 4249 4250 /* 4251 * a helper for releasepage, this tests for areas of the page that 4252 * are locked or under IO and drops the related state bits if it is safe 4253 * to drop the page. 4254 */ 4255 static int try_release_extent_state(struct extent_map_tree *map, 4256 struct extent_io_tree *tree, 4257 struct page *page, gfp_t mask) 4258 { 4259 u64 start = page_offset(page); 4260 u64 end = start + PAGE_SIZE - 1; 4261 int ret = 1; 4262 4263 if (test_range_bit(tree, start, end, 4264 EXTENT_IOBITS, 0, NULL)) 4265 ret = 0; 4266 else { 4267 if ((mask & GFP_NOFS) == GFP_NOFS) 4268 mask = GFP_NOFS; 4269 /* 4270 * at this point we can safely clear everything except the 4271 * locked bit and the nodatasum bit 4272 */ 4273 ret = clear_extent_bit(tree, start, end, 4274 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 4275 0, 0, NULL, mask); 4276 4277 /* if clear_extent_bit failed for enomem reasons, 4278 * we can't allow the release to continue. 4279 */ 4280 if (ret < 0) 4281 ret = 0; 4282 else 4283 ret = 1; 4284 } 4285 return ret; 4286 } 4287 4288 /* 4289 * a helper for releasepage. 
As long as there are no locked extents 4290 * in the range corresponding to the page, both state records and extent 4291 * map records are removed 4292 */ 4293 int try_release_extent_mapping(struct extent_map_tree *map, 4294 struct extent_io_tree *tree, struct page *page, 4295 gfp_t mask) 4296 { 4297 struct extent_map *em; 4298 u64 start = page_offset(page); 4299 u64 end = start + PAGE_SIZE - 1; 4300 4301 if (gfpflags_allow_blocking(mask) && 4302 page->mapping->host->i_size > SZ_16M) { 4303 u64 len; 4304 while (start <= end) { 4305 len = end - start + 1; 4306 write_lock(&map->lock); 4307 em = lookup_extent_mapping(map, start, len); 4308 if (!em) { 4309 write_unlock(&map->lock); 4310 break; 4311 } 4312 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 4313 em->start != start) { 4314 write_unlock(&map->lock); 4315 free_extent_map(em); 4316 break; 4317 } 4318 if (!test_range_bit(tree, em->start, 4319 extent_map_end(em) - 1, 4320 EXTENT_LOCKED | EXTENT_WRITEBACK, 4321 0, NULL)) { 4322 remove_extent_mapping(map, em); 4323 /* once for the rb tree */ 4324 free_extent_map(em); 4325 } 4326 start = extent_map_end(em); 4327 write_unlock(&map->lock); 4328 4329 /* once for us */ 4330 free_extent_map(em); 4331 } 4332 } 4333 return try_release_extent_state(map, tree, page, mask); 4334 } 4335 4336 /* 4337 * helper function for fiemap, which doesn't want to see any holes. 4338 * This maps until we find something past 'last' 4339 */ 4340 static struct extent_map *get_extent_skip_holes(struct inode *inode, 4341 u64 offset, 4342 u64 last, 4343 get_extent_t *get_extent) 4344 { 4345 u64 sectorsize = btrfs_inode_sectorsize(inode); 4346 struct extent_map *em; 4347 u64 len; 4348 4349 if (offset >= last) 4350 return NULL; 4351 4352 while (1) { 4353 len = last - offset; 4354 if (len == 0) 4355 break; 4356 len = ALIGN(len, sectorsize); 4357 em = get_extent(inode, NULL, 0, offset, len, 0); 4358 if (IS_ERR_OR_NULL(em)) 4359 return em; 4360 4361 /* if this isn't a hole return it */ 4362 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 4363 em->block_start != EXTENT_MAP_HOLE) { 4364 return em; 4365 } 4366 4367 /* this is a hole, advance to the next extent */ 4368 offset = extent_map_end(em); 4369 free_extent_map(em); 4370 if (offset >= last) 4371 break; 4372 } 4373 return NULL; 4374 } 4375 4376 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4377 __u64 start, __u64 len, get_extent_t *get_extent) 4378 { 4379 int ret = 0; 4380 u64 off = start; 4381 u64 max = start + len; 4382 u32 flags = 0; 4383 u32 found_type; 4384 u64 last; 4385 u64 last_for_get_extent = 0; 4386 u64 disko = 0; 4387 u64 isize = i_size_read(inode); 4388 struct btrfs_key found_key; 4389 struct extent_map *em = NULL; 4390 struct extent_state *cached_state = NULL; 4391 struct btrfs_path *path; 4392 struct btrfs_root *root = BTRFS_I(inode)->root; 4393 int end = 0; 4394 u64 em_start = 0; 4395 u64 em_len = 0; 4396 u64 em_end = 0; 4397 4398 if (len == 0) 4399 return -EINVAL; 4400 4401 path = btrfs_alloc_path(); 4402 if (!path) 4403 return -ENOMEM; 4404 path->leave_spinning = 1; 4405 4406 start = round_down(start, btrfs_inode_sectorsize(inode)); 4407 len = round_up(max, btrfs_inode_sectorsize(inode)) - start; 4408 4409 /* 4410 * lookup the last file extent. 
We're not using i_size here
	 * because there might be preallocation past i_size
	 */
	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
				       0);
	if (ret < 0) {
		btrfs_free_path(path);
		return ret;
	} else {
		WARN_ON(!ret);
		if (ret == 1)
			ret = 0;
	}

	path->slots[0]--;
	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
	found_type = found_key.type;

	/* No extents, but there might be delalloc bits */
	if (found_key.objectid != btrfs_ino(inode) ||
	    found_type != BTRFS_EXTENT_DATA_KEY) {
		/* have to trust i_size as the end */
		last = (u64)-1;
		last_for_get_extent = isize;
	} else {
		/*
		 * remember the start of the last extent.  There are a
		 * bunch of different factors that go into the length of the
		 * extent, so it's much less complex to remember where it
		 * started
		 */
		last = found_key.offset;
		last_for_get_extent = last + 1;
	}
	btrfs_release_path(path);

	/*
	 * we might have some extents allocated but more delalloc past those
	 * extents.  so, we trust isize unless the start of the last extent is
	 * beyond isize
	 */
	if (last < isize) {
		last = (u64)-1;
		last_for_get_extent = isize;
	}

	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
			 &cached_state);

	em = get_extent_skip_holes(inode, start, last_for_get_extent,
				   get_extent);
	if (!em)
		goto out;
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out;
	}

	while (!end) {
		u64 offset_in_extent = 0;

		/* break if the extent we found is outside the range */
		if (em->start >= max || extent_map_end(em) < off)
			break;

		/*
		 * get_extent may return an extent that starts before our
		 * requested range.  We have to make sure the ranges
		 * we return to fiemap always move forward and don't
		 * overlap, so adjust the offsets here
		 */
		em_start = max(em->start, off);

		/*
		 * record the offset from the start of the extent
		 * for adjusting the disk offset below.  Only do this if the
		 * extent isn't compressed since our in ram offset may be past
		 * what we have actually allocated on disk.
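		 *
		 * For example (illustrative numbers only): if em->start is
		 * 8192 but the requested range starts at 12288, em_start
		 * becomes 12288 and offset_in_extent is 4096, which is added
		 * to em->block_start below when computing the disk byte
		 * reported to fiemap.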
4487 */ 4488 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4489 offset_in_extent = em_start - em->start; 4490 em_end = extent_map_end(em); 4491 em_len = em_end - em_start; 4492 disko = 0; 4493 flags = 0; 4494 4495 /* 4496 * bump off for our next call to get_extent 4497 */ 4498 off = extent_map_end(em); 4499 if (off >= max) 4500 end = 1; 4501 4502 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 4503 end = 1; 4504 flags |= FIEMAP_EXTENT_LAST; 4505 } else if (em->block_start == EXTENT_MAP_INLINE) { 4506 flags |= (FIEMAP_EXTENT_DATA_INLINE | 4507 FIEMAP_EXTENT_NOT_ALIGNED); 4508 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 4509 flags |= (FIEMAP_EXTENT_DELALLOC | 4510 FIEMAP_EXTENT_UNKNOWN); 4511 } else if (fieinfo->fi_extents_max) { 4512 struct btrfs_trans_handle *trans; 4513 4514 u64 bytenr = em->block_start - 4515 (em->start - em->orig_start); 4516 4517 disko = em->block_start + offset_in_extent; 4518 4519 /* 4520 * We need a trans handle to get delayed refs 4521 */ 4522 trans = btrfs_join_transaction(root); 4523 /* 4524 * It's OK if we can't start a trans we can still check 4525 * from commit_root 4526 */ 4527 if (IS_ERR(trans)) 4528 trans = NULL; 4529 4530 /* 4531 * As btrfs supports shared space, this information 4532 * can be exported to userspace tools via 4533 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 4534 * then we're just getting a count and we can skip the 4535 * lookup stuff. 4536 */ 4537 ret = btrfs_check_shared(trans, root->fs_info, 4538 root->objectid, 4539 btrfs_ino(inode), bytenr); 4540 if (trans) 4541 btrfs_end_transaction(trans); 4542 if (ret < 0) 4543 goto out_free; 4544 if (ret) 4545 flags |= FIEMAP_EXTENT_SHARED; 4546 ret = 0; 4547 } 4548 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4549 flags |= FIEMAP_EXTENT_ENCODED; 4550 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4551 flags |= FIEMAP_EXTENT_UNWRITTEN; 4552 4553 free_extent_map(em); 4554 em = NULL; 4555 if ((em_start >= last) || em_len == (u64)-1 || 4556 (last == (u64)-1 && isize <= em_end)) { 4557 flags |= FIEMAP_EXTENT_LAST; 4558 end = 1; 4559 } 4560 4561 /* now scan forward to see if this is really the last extent. */ 4562 em = get_extent_skip_holes(inode, off, last_for_get_extent, 4563 get_extent); 4564 if (IS_ERR(em)) { 4565 ret = PTR_ERR(em); 4566 goto out; 4567 } 4568 if (!em) { 4569 flags |= FIEMAP_EXTENT_LAST; 4570 end = 1; 4571 } 4572 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 4573 em_len, flags); 4574 if (ret) { 4575 if (ret == 1) 4576 ret = 0; 4577 goto out_free; 4578 } 4579 } 4580 out_free: 4581 free_extent_map(em); 4582 out: 4583 btrfs_free_path(path); 4584 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4585 &cached_state, GFP_NOFS); 4586 return ret; 4587 } 4588 4589 static void __free_extent_buffer(struct extent_buffer *eb) 4590 { 4591 btrfs_leak_debug_del(&eb->leak_list); 4592 kmem_cache_free(extent_buffer_cache, eb); 4593 } 4594 4595 int extent_buffer_under_io(struct extent_buffer *eb) 4596 { 4597 return (atomic_read(&eb->io_pages) || 4598 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 4599 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4600 } 4601 4602 /* 4603 * Helper for releasing extent buffer page. 
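 * Detach each page from the buffer: drop the page-private reference if the
 * page still points at this eb, then drop the reference taken when the page
 * was allocated or looked up.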
 */
static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
{
	unsigned long index;
	struct page *page;
	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);

	BUG_ON(extent_buffer_under_io(eb));

	index = num_extent_pages(eb->start, eb->len);
	if (index == 0)
		return;

	do {
		index--;
		page = eb->pages[index];
		if (!page)
			continue;
		if (mapped)
			spin_lock(&page->mapping->private_lock);
		/*
		 * We do this since we'll remove the pages after we've
		 * removed the eb from the radix tree, so we could race
		 * and have this page now attached to the new eb.  So
		 * only clear page_private if it's still connected to
		 * this eb.
		 */
		if (PagePrivate(page) &&
		    page->private == (unsigned long)eb) {
			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
			BUG_ON(PageDirty(page));
			BUG_ON(PageWriteback(page));
			/*
			 * We need to make sure we haven't been attached
			 * to a new eb.
			 */
			ClearPagePrivate(page);
			set_page_private(page, 0);
			/* One for the page private */
			put_page(page);
		}

		if (mapped)
			spin_unlock(&page->mapping->private_lock);

		/* One for when we allocated the page */
		put_page(page);
	} while (index != 0);
}

/*
 * Helper for releasing the extent buffer.
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
	btrfs_release_extent_buffer_page(eb);
	__free_extent_buffer(eb);
}

static struct extent_buffer *
__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
		      unsigned long len)
{
	struct extent_buffer *eb = NULL;

	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
	eb->start = start;
	eb->len = len;
	eb->fs_info = fs_info;
	eb->bflags = 0;
	rwlock_init(&eb->lock);
	atomic_set(&eb->write_locks, 0);
	atomic_set(&eb->read_locks, 0);
	atomic_set(&eb->blocking_readers, 0);
	atomic_set(&eb->blocking_writers, 0);
	atomic_set(&eb->spinning_readers, 0);
	atomic_set(&eb->spinning_writers, 0);
	eb->lock_nested = 0;
	init_waitqueue_head(&eb->write_lock_wq);
	init_waitqueue_head(&eb->read_lock_wq);

	btrfs_leak_debug_add(&eb->leak_list, &buffers);

	spin_lock_init(&eb->refs_lock);
	atomic_set(&eb->refs, 1);
	atomic_set(&eb->io_pages, 0);

	/*
	 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
	 */
	BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
		> MAX_INLINE_EXTENT_BUFFER_SIZE);
	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);

	return eb;
}

struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
{
	unsigned long i;
	struct page *p;
	struct extent_buffer *new;
	unsigned long num_pages = num_extent_pages(src->start, src->len);

	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
	if (new == NULL)
		return NULL;

	for (i = 0; i < num_pages; i++) {
		p = alloc_page(GFP_NOFS);
		if (!p) {
			btrfs_release_extent_buffer(new);
			return NULL;
		}
		attach_extent_buffer_page(new, p);
		WARN_ON(PageDirty(p));
		SetPageUptodate(p);
		new->pages[i] = p;
		copy_page(page_address(p), page_address(src->pages[i]));
	}

	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
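
	/*
	 * The clone is never inserted into the buffer radix tree and its
	 * pages are not attached to the btree inode's mapping; the DUMMY bit
	 * set above is what tells the release path to skip the
	 * mapping->private_lock handling for these pages.
	 */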
return new; 4729 } 4730 4731 struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4732 u64 start, unsigned long len) 4733 { 4734 struct extent_buffer *eb; 4735 unsigned long num_pages; 4736 unsigned long i; 4737 4738 num_pages = num_extent_pages(start, len); 4739 4740 eb = __alloc_extent_buffer(fs_info, start, len); 4741 if (!eb) 4742 return NULL; 4743 4744 for (i = 0; i < num_pages; i++) { 4745 eb->pages[i] = alloc_page(GFP_NOFS); 4746 if (!eb->pages[i]) 4747 goto err; 4748 } 4749 set_extent_buffer_uptodate(eb); 4750 btrfs_set_header_nritems(eb, 0); 4751 set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4752 4753 return eb; 4754 err: 4755 for (; i > 0; i--) 4756 __free_page(eb->pages[i - 1]); 4757 __free_extent_buffer(eb); 4758 return NULL; 4759 } 4760 4761 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4762 u64 start) 4763 { 4764 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); 4765 } 4766 4767 static void check_buffer_tree_ref(struct extent_buffer *eb) 4768 { 4769 int refs; 4770 /* the ref bit is tricky. We have to make sure it is set 4771 * if we have the buffer dirty. Otherwise the 4772 * code to free a buffer can end up dropping a dirty 4773 * page 4774 * 4775 * Once the ref bit is set, it won't go away while the 4776 * buffer is dirty or in writeback, and it also won't 4777 * go away while we have the reference count on the 4778 * eb bumped. 4779 * 4780 * We can't just set the ref bit without bumping the 4781 * ref on the eb because free_extent_buffer might 4782 * see the ref bit and try to clear it. If this happens 4783 * free_extent_buffer might end up dropping our original 4784 * ref by mistake and freeing the page before we are able 4785 * to add one more ref. 4786 * 4787 * So bump the ref count first, then set the bit. If someone 4788 * beat us to it, drop the ref we added. 4789 */ 4790 refs = atomic_read(&eb->refs); 4791 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4792 return; 4793 4794 spin_lock(&eb->refs_lock); 4795 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4796 atomic_inc(&eb->refs); 4797 spin_unlock(&eb->refs_lock); 4798 } 4799 4800 static void mark_extent_buffer_accessed(struct extent_buffer *eb, 4801 struct page *accessed) 4802 { 4803 unsigned long num_pages, i; 4804 4805 check_buffer_tree_ref(eb); 4806 4807 num_pages = num_extent_pages(eb->start, eb->len); 4808 for (i = 0; i < num_pages; i++) { 4809 struct page *p = eb->pages[i]; 4810 4811 if (p != accessed) 4812 mark_page_accessed(p); 4813 } 4814 } 4815 4816 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 4817 u64 start) 4818 { 4819 struct extent_buffer *eb; 4820 4821 rcu_read_lock(); 4822 eb = radix_tree_lookup(&fs_info->buffer_radix, 4823 start >> PAGE_SHIFT); 4824 if (eb && atomic_inc_not_zero(&eb->refs)) { 4825 rcu_read_unlock(); 4826 /* 4827 * Lock our eb's refs_lock to avoid races with 4828 * free_extent_buffer. When we get our eb it might be flagged 4829 * with EXTENT_BUFFER_STALE and another task running 4830 * free_extent_buffer might have seen that flag set, 4831 * eb->refs == 2, that the buffer isn't under IO (dirty and 4832 * writeback flags not set) and it's still in the tree (flag 4833 * EXTENT_BUFFER_TREE_REF set), therefore being in the process 4834 * of decrementing the extent buffer's reference count twice. 
		 * So here we could race and increment the eb's reference
		 * count, clear its stale flag, mark it as dirty and drop our
		 * reference before the other task finishes executing
		 * free_extent_buffer, which would later result in an attempt
		 * to free an extent buffer that is dirty.
		 */
		if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
			spin_lock(&eb->refs_lock);
			spin_unlock(&eb->refs_lock);
		}
		mark_extent_buffer_accessed(eb, NULL);
		return eb;
	}
	rcu_read_unlock();

	return NULL;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
					u64 start)
{
	struct extent_buffer *eb, *exists = NULL;
	int ret;

	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;
	eb = alloc_dummy_extent_buffer(fs_info, start);
	if (!eb)
		return NULL;
	eb->fs_info = fs_info;
again:
	ret = radix_tree_preload(GFP_NOFS);
	if (ret)
		goto free_eb;
	spin_lock(&fs_info->buffer_lock);
	ret = radix_tree_insert(&fs_info->buffer_radix,
				start >> PAGE_SHIFT, eb);
	spin_unlock(&fs_info->buffer_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		exists = find_extent_buffer(fs_info, start);
		if (exists)
			goto free_eb;
		else
			goto again;
	}
	check_buffer_tree_ref(eb);
	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

	/*
	 * We will free dummy extent buffers if they come into
	 * free_extent_buffer with a ref count of 2, but if we are using this
	 * we want the buffers to stay in memory until we're done with them,
	 * so bump the ref count again.
	 */
	atomic_inc(&eb->refs);
	return eb;
free_eb:
	btrfs_release_extent_buffer(eb);
	return exists;
}
#endif

struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
					  u64 start)
{
	unsigned long len = fs_info->nodesize;
	unsigned long num_pages = num_extent_pages(start, len);
	unsigned long i;
	unsigned long index = start >> PAGE_SHIFT;
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	struct page *p;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	int uptodate = 1;
	int ret;

	if (!IS_ALIGNED(start, fs_info->sectorsize)) {
		btrfs_err(fs_info, "bad tree block start %llu", start);
		return ERR_PTR(-EINVAL);
	}

	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;

	eb = __alloc_extent_buffer(fs_info, start, len);
	if (!eb)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < num_pages; i++, index++) {
		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
		if (!p) {
			exists = ERR_PTR(-ENOMEM);
			goto free_eb;
		}

		spin_lock(&mapping->private_lock);
		if (PagePrivate(p)) {
			/*
			 * We could have already allocated an eb for this page
			 * and attached one, so let's see if we can get a ref
			 * on the existing eb, and if we can we know it's good
			 * and we can just return that one, else we know we
			 * can just overwrite page->private.
			 */
			exists = (struct extent_buffer *)p->private;
			if (atomic_inc_not_zero(&exists->refs)) {
				spin_unlock(&mapping->private_lock);
				unlock_page(p);
				put_page(p);
				mark_extent_buffer_accessed(exists, p);
				goto free_eb;
			}
			exists = NULL;

			/*
			 * Do this so attach doesn't complain and we need to
			 * drop the ref the old guy had.
			 */
			ClearPagePrivate(p);
			WARN_ON(PageDirty(p));
			put_page(p);
		}
		attach_extent_buffer_page(eb, p);
		spin_unlock(&mapping->private_lock);
		WARN_ON(PageDirty(p));
		eb->pages[i] = p;
		if (!PageUptodate(p))
			uptodate = 0;

		/*
		 * see below about how we avoid a nasty race with release page
		 * and why we unlock later
		 */
	}
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
	ret = radix_tree_preload(GFP_NOFS);
	if (ret) {
		exists = ERR_PTR(ret);
		goto free_eb;
	}

	spin_lock(&fs_info->buffer_lock);
	ret = radix_tree_insert(&fs_info->buffer_radix,
				start >> PAGE_SHIFT, eb);
	spin_unlock(&fs_info->buffer_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		exists = find_extent_buffer(fs_info, start);
		if (exists)
			goto free_eb;
		else
			goto again;
	}
	/* add one reference for the tree */
	check_buffer_tree_ref(eb);
	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

	/*
	 * there is a race where release page may have
	 * tried to find this extent buffer in the radix tree
	 * but failed.  It will tell the VM it is safe to
	 * reclaim the page, and it will clear the page private bit.
	 * We must make sure to set the page private bit properly
	 * after the extent buffer is in the radix tree so
	 * it doesn't get lost
	 */
	SetPageChecked(eb->pages[0]);
	for (i = 1; i < num_pages; i++) {
		p = eb->pages[i];
		ClearPageChecked(p);
		unlock_page(p);
	}
	unlock_page(eb->pages[0]);
	return eb;

free_eb:
	WARN_ON(!atomic_dec_and_test(&eb->refs));
	for (i = 0; i < num_pages; i++) {
		if (eb->pages[i])
			unlock_page(eb->pages[i]);
	}

	btrfs_release_extent_buffer(eb);
	return exists;
}

static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
	struct extent_buffer *eb =
			container_of(head, struct extent_buffer, rcu_head);

	__free_extent_buffer(eb);
}

/* Expects to have eb->refs_lock already held */
static int release_extent_buffer(struct extent_buffer *eb)
{
	WARN_ON(atomic_read(&eb->refs) == 0);
	if (atomic_dec_and_test(&eb->refs)) {
		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
			struct btrfs_fs_info *fs_info = eb->fs_info;

			spin_unlock(&eb->refs_lock);

			spin_lock(&fs_info->buffer_lock);
			radix_tree_delete(&fs_info->buffer_radix,
					  eb->start >> PAGE_SHIFT);
			spin_unlock(&fs_info->buffer_lock);
		} else {
			spin_unlock(&eb->refs_lock);
		}

		/* Should be safe to release our pages at this point */
		btrfs_release_extent_buffer_page(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
		if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) {
			__free_extent_buffer(eb);
			return 1;
		}
#endif
		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
		return 1;
	}
	spin_unlock(&eb->refs_lock);

	return 0;
}

void free_extent_buffer(struct extent_buffer
*eb) 5070 { 5071 int refs; 5072 int old; 5073 if (!eb) 5074 return; 5075 5076 while (1) { 5077 refs = atomic_read(&eb->refs); 5078 if (refs <= 3) 5079 break; 5080 old = atomic_cmpxchg(&eb->refs, refs, refs - 1); 5081 if (old == refs) 5082 return; 5083 } 5084 5085 spin_lock(&eb->refs_lock); 5086 if (atomic_read(&eb->refs) == 2 && 5087 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) 5088 atomic_dec(&eb->refs); 5089 5090 if (atomic_read(&eb->refs) == 2 && 5091 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 5092 !extent_buffer_under_io(eb) && 5093 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5094 atomic_dec(&eb->refs); 5095 5096 /* 5097 * I know this is terrible, but it's temporary until we stop tracking 5098 * the uptodate bits and such for the extent buffers. 5099 */ 5100 release_extent_buffer(eb); 5101 } 5102 5103 void free_extent_buffer_stale(struct extent_buffer *eb) 5104 { 5105 if (!eb) 5106 return; 5107 5108 spin_lock(&eb->refs_lock); 5109 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 5110 5111 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 5112 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5113 atomic_dec(&eb->refs); 5114 release_extent_buffer(eb); 5115 } 5116 5117 void clear_extent_buffer_dirty(struct extent_buffer *eb) 5118 { 5119 unsigned long i; 5120 unsigned long num_pages; 5121 struct page *page; 5122 5123 num_pages = num_extent_pages(eb->start, eb->len); 5124 5125 for (i = 0; i < num_pages; i++) { 5126 page = eb->pages[i]; 5127 if (!PageDirty(page)) 5128 continue; 5129 5130 lock_page(page); 5131 WARN_ON(!PagePrivate(page)); 5132 5133 clear_page_dirty_for_io(page); 5134 spin_lock_irq(&page->mapping->tree_lock); 5135 if (!PageDirty(page)) { 5136 radix_tree_tag_clear(&page->mapping->page_tree, 5137 page_index(page), 5138 PAGECACHE_TAG_DIRTY); 5139 } 5140 spin_unlock_irq(&page->mapping->tree_lock); 5141 ClearPageError(page); 5142 unlock_page(page); 5143 } 5144 WARN_ON(atomic_read(&eb->refs) == 0); 5145 } 5146 5147 int set_extent_buffer_dirty(struct extent_buffer *eb) 5148 { 5149 unsigned long i; 5150 unsigned long num_pages; 5151 int was_dirty = 0; 5152 5153 check_buffer_tree_ref(eb); 5154 5155 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 5156 5157 num_pages = num_extent_pages(eb->start, eb->len); 5158 WARN_ON(atomic_read(&eb->refs) == 0); 5159 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 5160 5161 for (i = 0; i < num_pages; i++) 5162 set_page_dirty(eb->pages[i]); 5163 return was_dirty; 5164 } 5165 5166 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 5167 { 5168 unsigned long i; 5169 struct page *page; 5170 unsigned long num_pages; 5171 5172 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5173 num_pages = num_extent_pages(eb->start, eb->len); 5174 for (i = 0; i < num_pages; i++) { 5175 page = eb->pages[i]; 5176 if (page) 5177 ClearPageUptodate(page); 5178 } 5179 } 5180 5181 void set_extent_buffer_uptodate(struct extent_buffer *eb) 5182 { 5183 unsigned long i; 5184 struct page *page; 5185 unsigned long num_pages; 5186 5187 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5188 num_pages = num_extent_pages(eb->start, eb->len); 5189 for (i = 0; i < num_pages; i++) { 5190 page = eb->pages[i]; 5191 SetPageUptodate(page); 5192 } 5193 } 5194 5195 int extent_buffer_uptodate(struct extent_buffer *eb) 5196 { 5197 return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5198 } 5199 5200 int read_extent_buffer_pages(struct extent_io_tree *tree, 5201 struct extent_buffer *eb, int wait, 5202 get_extent_t *get_extent, int 
mirror_num)
{
	unsigned long i;
	struct page *page;
	int err;
	int ret = 0;
	int locked_pages = 0;
	int all_uptodate = 1;
	unsigned long num_pages;
	unsigned long num_reads = 0;
	struct bio *bio = NULL;
	unsigned long bio_flags = 0;

	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		return 0;

	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (wait == WAIT_NONE) {
			if (!trylock_page(page))
				goto unlock_exit;
		} else {
			lock_page(page);
		}
		locked_pages++;
	}
	/*
	 * We need to lock all the pages first to make sure that
	 * the uptodate bit of our pages won't be affected by
	 * clear_extent_buffer_uptodate().
	 */
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (!PageUptodate(page)) {
			num_reads++;
			all_uptodate = 0;
		}
	}

	if (all_uptodate) {
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		goto unlock_exit;
	}

	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
	eb->read_mirror = 0;
	atomic_set(&eb->io_pages, num_reads);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];

		if (!PageUptodate(page)) {
			if (ret) {
				atomic_dec(&eb->io_pages);
				unlock_page(page);
				continue;
			}

			ClearPageError(page);
			err = __extent_read_full_page(tree, page,
						      get_extent, &bio,
						      mirror_num, &bio_flags,
						      REQ_META);
			if (err) {
				ret = err;
				/*
				 * We used &bio in __extent_read_full_page
				 * above, so if it returns an error the
				 * current page failed to add itself to the
				 * bio and has already been unlocked.
				 *
				 * We must decrement io_pages ourselves.
5274 */ 5275 atomic_dec(&eb->io_pages); 5276 } 5277 } else { 5278 unlock_page(page); 5279 } 5280 } 5281 5282 if (bio) { 5283 err = submit_one_bio(bio, mirror_num, bio_flags); 5284 if (err) 5285 return err; 5286 } 5287 5288 if (ret || wait != WAIT_COMPLETE) 5289 return ret; 5290 5291 for (i = 0; i < num_pages; i++) { 5292 page = eb->pages[i]; 5293 wait_on_page_locked(page); 5294 if (!PageUptodate(page)) 5295 ret = -EIO; 5296 } 5297 5298 return ret; 5299 5300 unlock_exit: 5301 while (locked_pages > 0) { 5302 locked_pages--; 5303 page = eb->pages[locked_pages]; 5304 unlock_page(page); 5305 } 5306 return ret; 5307 } 5308 5309 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 5310 unsigned long start, 5311 unsigned long len) 5312 { 5313 size_t cur; 5314 size_t offset; 5315 struct page *page; 5316 char *kaddr; 5317 char *dst = (char *)dstv; 5318 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5319 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5320 5321 WARN_ON(start > eb->len); 5322 WARN_ON(start + len > eb->start + eb->len); 5323 5324 offset = (start_offset + start) & (PAGE_SIZE - 1); 5325 5326 while (len > 0) { 5327 page = eb->pages[i]; 5328 5329 cur = min(len, (PAGE_SIZE - offset)); 5330 kaddr = page_address(page); 5331 memcpy(dst, kaddr + offset, cur); 5332 5333 dst += cur; 5334 len -= cur; 5335 offset = 0; 5336 i++; 5337 } 5338 } 5339 5340 int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, 5341 unsigned long start, 5342 unsigned long len) 5343 { 5344 size_t cur; 5345 size_t offset; 5346 struct page *page; 5347 char *kaddr; 5348 char __user *dst = (char __user *)dstv; 5349 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5350 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5351 int ret = 0; 5352 5353 WARN_ON(start > eb->len); 5354 WARN_ON(start + len > eb->start + eb->len); 5355 5356 offset = (start_offset + start) & (PAGE_SIZE - 1); 5357 5358 while (len > 0) { 5359 page = eb->pages[i]; 5360 5361 cur = min(len, (PAGE_SIZE - offset)); 5362 kaddr = page_address(page); 5363 if (copy_to_user(dst, kaddr + offset, cur)) { 5364 ret = -EFAULT; 5365 break; 5366 } 5367 5368 dst += cur; 5369 len -= cur; 5370 offset = 0; 5371 i++; 5372 } 5373 5374 return ret; 5375 } 5376 5377 /* 5378 * return 0 if the item is found within a page. 5379 * return 1 if the item spans two pages. 5380 * return -EINVAL otherwise. 
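 *
 * Callers (for example the generated setget accessors) typically try this
 * fast path first and fall back to read_extent_buffer()/write_extent_buffer()
 * when 1 is returned because the value straddles a page boundary.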
5381 */ 5382 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 5383 unsigned long min_len, char **map, 5384 unsigned long *map_start, 5385 unsigned long *map_len) 5386 { 5387 size_t offset = start & (PAGE_SIZE - 1); 5388 char *kaddr; 5389 struct page *p; 5390 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5391 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5392 unsigned long end_i = (start_offset + start + min_len - 1) >> 5393 PAGE_SHIFT; 5394 5395 if (i != end_i) 5396 return 1; 5397 5398 if (i == 0) { 5399 offset = start_offset; 5400 *map_start = 0; 5401 } else { 5402 offset = 0; 5403 *map_start = ((u64)i << PAGE_SHIFT) - start_offset; 5404 } 5405 5406 if (start + min_len > eb->len) { 5407 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", 5408 eb->start, eb->len, start, min_len); 5409 return -EINVAL; 5410 } 5411 5412 p = eb->pages[i]; 5413 kaddr = page_address(p); 5414 *map = kaddr + offset; 5415 *map_len = PAGE_SIZE - offset; 5416 return 0; 5417 } 5418 5419 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 5420 unsigned long start, 5421 unsigned long len) 5422 { 5423 size_t cur; 5424 size_t offset; 5425 struct page *page; 5426 char *kaddr; 5427 char *ptr = (char *)ptrv; 5428 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5429 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5430 int ret = 0; 5431 5432 WARN_ON(start > eb->len); 5433 WARN_ON(start + len > eb->start + eb->len); 5434 5435 offset = (start_offset + start) & (PAGE_SIZE - 1); 5436 5437 while (len > 0) { 5438 page = eb->pages[i]; 5439 5440 cur = min(len, (PAGE_SIZE - offset)); 5441 5442 kaddr = page_address(page); 5443 ret = memcmp(ptr, kaddr + offset, cur); 5444 if (ret) 5445 break; 5446 5447 ptr += cur; 5448 len -= cur; 5449 offset = 0; 5450 i++; 5451 } 5452 return ret; 5453 } 5454 5455 void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, 5456 const void *srcv) 5457 { 5458 char *kaddr; 5459 5460 WARN_ON(!PageUptodate(eb->pages[0])); 5461 kaddr = page_address(eb->pages[0]); 5462 memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, 5463 BTRFS_FSID_SIZE); 5464 } 5465 5466 void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv) 5467 { 5468 char *kaddr; 5469 5470 WARN_ON(!PageUptodate(eb->pages[0])); 5471 kaddr = page_address(eb->pages[0]); 5472 memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, 5473 BTRFS_FSID_SIZE); 5474 } 5475 5476 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 5477 unsigned long start, unsigned long len) 5478 { 5479 size_t cur; 5480 size_t offset; 5481 struct page *page; 5482 char *kaddr; 5483 char *src = (char *)srcv; 5484 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5485 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5486 5487 WARN_ON(start > eb->len); 5488 WARN_ON(start + len > eb->start + eb->len); 5489 5490 offset = (start_offset + start) & (PAGE_SIZE - 1); 5491 5492 while (len > 0) { 5493 page = eb->pages[i]; 5494 WARN_ON(!PageUptodate(page)); 5495 5496 cur = min(len, PAGE_SIZE - offset); 5497 kaddr = page_address(page); 5498 memcpy(kaddr + offset, src, cur); 5499 5500 src += cur; 5501 len -= cur; 5502 offset = 0; 5503 i++; 5504 } 5505 } 5506 5507 void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, 5508 unsigned long len) 5509 { 5510 size_t cur; 5511 size_t offset; 5512 struct page *page; 5513 char *kaddr; 5514 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5515 unsigned 
long i = (start_offset + start) >> PAGE_SHIFT; 5516 5517 WARN_ON(start > eb->len); 5518 WARN_ON(start + len > eb->start + eb->len); 5519 5520 offset = (start_offset + start) & (PAGE_SIZE - 1); 5521 5522 while (len > 0) { 5523 page = eb->pages[i]; 5524 WARN_ON(!PageUptodate(page)); 5525 5526 cur = min(len, PAGE_SIZE - offset); 5527 kaddr = page_address(page); 5528 memset(kaddr + offset, 0, cur); 5529 5530 len -= cur; 5531 offset = 0; 5532 i++; 5533 } 5534 } 5535 5536 void copy_extent_buffer_full(struct extent_buffer *dst, 5537 struct extent_buffer *src) 5538 { 5539 int i; 5540 unsigned num_pages; 5541 5542 ASSERT(dst->len == src->len); 5543 5544 num_pages = num_extent_pages(dst->start, dst->len); 5545 for (i = 0; i < num_pages; i++) 5546 copy_page(page_address(dst->pages[i]), 5547 page_address(src->pages[i])); 5548 } 5549 5550 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 5551 unsigned long dst_offset, unsigned long src_offset, 5552 unsigned long len) 5553 { 5554 u64 dst_len = dst->len; 5555 size_t cur; 5556 size_t offset; 5557 struct page *page; 5558 char *kaddr; 5559 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5560 unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; 5561 5562 WARN_ON(src->len != dst_len); 5563 5564 offset = (start_offset + dst_offset) & 5565 (PAGE_SIZE - 1); 5566 5567 while (len > 0) { 5568 page = dst->pages[i]; 5569 WARN_ON(!PageUptodate(page)); 5570 5571 cur = min(len, (unsigned long)(PAGE_SIZE - offset)); 5572 5573 kaddr = page_address(page); 5574 read_extent_buffer(src, kaddr + offset, src_offset, cur); 5575 5576 src_offset += cur; 5577 len -= cur; 5578 offset = 0; 5579 i++; 5580 } 5581 } 5582 5583 void le_bitmap_set(u8 *map, unsigned int start, int len) 5584 { 5585 u8 *p = map + BIT_BYTE(start); 5586 const unsigned int size = start + len; 5587 int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); 5588 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); 5589 5590 while (len - bits_to_set >= 0) { 5591 *p |= mask_to_set; 5592 len -= bits_to_set; 5593 bits_to_set = BITS_PER_BYTE; 5594 mask_to_set = ~0; 5595 p++; 5596 } 5597 if (len) { 5598 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 5599 *p |= mask_to_set; 5600 } 5601 } 5602 5603 void le_bitmap_clear(u8 *map, unsigned int start, int len) 5604 { 5605 u8 *p = map + BIT_BYTE(start); 5606 const unsigned int size = start + len; 5607 int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE); 5608 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start); 5609 5610 while (len - bits_to_clear >= 0) { 5611 *p &= ~mask_to_clear; 5612 len -= bits_to_clear; 5613 bits_to_clear = BITS_PER_BYTE; 5614 mask_to_clear = ~0; 5615 p++; 5616 } 5617 if (len) { 5618 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 5619 *p &= ~mask_to_clear; 5620 } 5621 } 5622 5623 /* 5624 * eb_bitmap_offset() - calculate the page and offset of the byte containing the 5625 * given bit number 5626 * @eb: the extent buffer 5627 * @start: offset of the bitmap item in the extent buffer 5628 * @nr: bit number 5629 * @page_index: return index of the page in the extent buffer that contains the 5630 * given bit number 5631 * @page_offset: return offset into the page given by page_index 5632 * 5633 * This helper hides the ugliness of finding the byte in an extent buffer which 5634 * contains a given bit. 
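 *
 * For example (illustrative numbers, 4K pages, eb aligned to a page
 * boundary): for a bitmap item 100 bytes into the eb and nr == 40000, the
 * byte holding the bit is 100 + 40000 / 8 = 5100 bytes into the eb, so
 * *page_index is 1 and *page_offset is 1004.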
5635 */ 5636 static inline void eb_bitmap_offset(struct extent_buffer *eb, 5637 unsigned long start, unsigned long nr, 5638 unsigned long *page_index, 5639 size_t *page_offset) 5640 { 5641 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5642 size_t byte_offset = BIT_BYTE(nr); 5643 size_t offset; 5644 5645 /* 5646 * The byte we want is the offset of the extent buffer + the offset of 5647 * the bitmap item in the extent buffer + the offset of the byte in the 5648 * bitmap item. 5649 */ 5650 offset = start_offset + start + byte_offset; 5651 5652 *page_index = offset >> PAGE_SHIFT; 5653 *page_offset = offset & (PAGE_SIZE - 1); 5654 } 5655 5656 /** 5657 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set 5658 * @eb: the extent buffer 5659 * @start: offset of the bitmap item in the extent buffer 5660 * @nr: bit number to test 5661 */ 5662 int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, 5663 unsigned long nr) 5664 { 5665 u8 *kaddr; 5666 struct page *page; 5667 unsigned long i; 5668 size_t offset; 5669 5670 eb_bitmap_offset(eb, start, nr, &i, &offset); 5671 page = eb->pages[i]; 5672 WARN_ON(!PageUptodate(page)); 5673 kaddr = page_address(page); 5674 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 5675 } 5676 5677 /** 5678 * extent_buffer_bitmap_set - set an area of a bitmap 5679 * @eb: the extent buffer 5680 * @start: offset of the bitmap item in the extent buffer 5681 * @pos: bit number of the first bit 5682 * @len: number of bits to set 5683 */ 5684 void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, 5685 unsigned long pos, unsigned long len) 5686 { 5687 u8 *kaddr; 5688 struct page *page; 5689 unsigned long i; 5690 size_t offset; 5691 const unsigned int size = pos + len; 5692 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5693 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); 5694 5695 eb_bitmap_offset(eb, start, pos, &i, &offset); 5696 page = eb->pages[i]; 5697 WARN_ON(!PageUptodate(page)); 5698 kaddr = page_address(page); 5699 5700 while (len >= bits_to_set) { 5701 kaddr[offset] |= mask_to_set; 5702 len -= bits_to_set; 5703 bits_to_set = BITS_PER_BYTE; 5704 mask_to_set = ~0; 5705 if (++offset >= PAGE_SIZE && len > 0) { 5706 offset = 0; 5707 page = eb->pages[++i]; 5708 WARN_ON(!PageUptodate(page)); 5709 kaddr = page_address(page); 5710 } 5711 } 5712 if (len) { 5713 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 5714 kaddr[offset] |= mask_to_set; 5715 } 5716 } 5717 5718 5719 /** 5720 * extent_buffer_bitmap_clear - clear an area of a bitmap 5721 * @eb: the extent buffer 5722 * @start: offset of the bitmap item in the extent buffer 5723 * @pos: bit number of the first bit 5724 * @len: number of bits to clear 5725 */ 5726 void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, 5727 unsigned long pos, unsigned long len) 5728 { 5729 u8 *kaddr; 5730 struct page *page; 5731 unsigned long i; 5732 size_t offset; 5733 const unsigned int size = pos + len; 5734 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5735 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); 5736 5737 eb_bitmap_offset(eb, start, pos, &i, &offset); 5738 page = eb->pages[i]; 5739 WARN_ON(!PageUptodate(page)); 5740 kaddr = page_address(page); 5741 5742 while (len >= bits_to_clear) { 5743 kaddr[offset] &= ~mask_to_clear; 5744 len -= bits_to_clear; 5745 bits_to_clear = BITS_PER_BYTE; 5746 mask_to_clear = ~0; 5747 if (++offset >= PAGE_SIZE && len > 0) { 5748 offset = 0; 5749 page = eb->pages[++i]; 5750 
WARN_ON(!PageUptodate(page)); 5751 kaddr = page_address(page); 5752 } 5753 } 5754 if (len) { 5755 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 5756 kaddr[offset] &= ~mask_to_clear; 5757 } 5758 } 5759 5760 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 5761 { 5762 unsigned long distance = (src > dst) ? src - dst : dst - src; 5763 return distance < len; 5764 } 5765 5766 static void copy_pages(struct page *dst_page, struct page *src_page, 5767 unsigned long dst_off, unsigned long src_off, 5768 unsigned long len) 5769 { 5770 char *dst_kaddr = page_address(dst_page); 5771 char *src_kaddr; 5772 int must_memmove = 0; 5773 5774 if (dst_page != src_page) { 5775 src_kaddr = page_address(src_page); 5776 } else { 5777 src_kaddr = dst_kaddr; 5778 if (areas_overlap(src_off, dst_off, len)) 5779 must_memmove = 1; 5780 } 5781 5782 if (must_memmove) 5783 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); 5784 else 5785 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 5786 } 5787 5788 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5789 unsigned long src_offset, unsigned long len) 5790 { 5791 struct btrfs_fs_info *fs_info = dst->fs_info; 5792 size_t cur; 5793 size_t dst_off_in_page; 5794 size_t src_off_in_page; 5795 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5796 unsigned long dst_i; 5797 unsigned long src_i; 5798 5799 if (src_offset + len > dst->len) { 5800 btrfs_err(fs_info, 5801 "memmove bogus src_offset %lu move len %lu dst len %lu", 5802 src_offset, len, dst->len); 5803 BUG_ON(1); 5804 } 5805 if (dst_offset + len > dst->len) { 5806 btrfs_err(fs_info, 5807 "memmove bogus dst_offset %lu move len %lu dst len %lu", 5808 dst_offset, len, dst->len); 5809 BUG_ON(1); 5810 } 5811 5812 while (len > 0) { 5813 dst_off_in_page = (start_offset + dst_offset) & 5814 (PAGE_SIZE - 1); 5815 src_off_in_page = (start_offset + src_offset) & 5816 (PAGE_SIZE - 1); 5817 5818 dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; 5819 src_i = (start_offset + src_offset) >> PAGE_SHIFT; 5820 5821 cur = min(len, (unsigned long)(PAGE_SIZE - 5822 src_off_in_page)); 5823 cur = min_t(unsigned long, cur, 5824 (unsigned long)(PAGE_SIZE - dst_off_in_page)); 5825 5826 copy_pages(dst->pages[dst_i], dst->pages[src_i], 5827 dst_off_in_page, src_off_in_page, cur); 5828 5829 src_offset += cur; 5830 dst_offset += cur; 5831 len -= cur; 5832 } 5833 } 5834 5835 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5836 unsigned long src_offset, unsigned long len) 5837 { 5838 struct btrfs_fs_info *fs_info = dst->fs_info; 5839 size_t cur; 5840 size_t dst_off_in_page; 5841 size_t src_off_in_page; 5842 unsigned long dst_end = dst_offset + len - 1; 5843 unsigned long src_end = src_offset + len - 1; 5844 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5845 unsigned long dst_i; 5846 unsigned long src_i; 5847 5848 if (src_offset + len > dst->len) { 5849 btrfs_err(fs_info, 5850 "memmove bogus src_offset %lu move len %lu len %lu", 5851 src_offset, len, dst->len); 5852 BUG_ON(1); 5853 } 5854 if (dst_offset + len > dst->len) { 5855 btrfs_err(fs_info, 5856 "memmove bogus dst_offset %lu move len %lu len %lu", 5857 dst_offset, len, dst->len); 5858 BUG_ON(1); 5859 } 5860 if (dst_offset < src_offset) { 5861 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 5862 return; 5863 } 5864 while (len > 0) { 5865 dst_i = (start_offset + dst_end) >> PAGE_SHIFT; 5866 src_i = (start_offset + src_end) >> PAGE_SHIFT; 5867 5868 dst_off_in_page = 
(start_offset + dst_end) & 5869 (PAGE_SIZE - 1); 5870 src_off_in_page = (start_offset + src_end) & 5871 (PAGE_SIZE - 1); 5872 5873 cur = min_t(unsigned long, len, src_off_in_page + 1); 5874 cur = min(cur, dst_off_in_page + 1); 5875 copy_pages(dst->pages[dst_i], dst->pages[src_i], 5876 dst_off_in_page - cur + 1, 5877 src_off_in_page - cur + 1, cur); 5878 5879 dst_end -= cur; 5880 src_end -= cur; 5881 len -= cur; 5882 } 5883 } 5884 5885 int try_release_extent_buffer(struct page *page) 5886 { 5887 struct extent_buffer *eb; 5888 5889 /* 5890 * We need to make sure nobody is attaching this page to an eb right 5891 * now. 5892 */ 5893 spin_lock(&page->mapping->private_lock); 5894 if (!PagePrivate(page)) { 5895 spin_unlock(&page->mapping->private_lock); 5896 return 1; 5897 } 5898 5899 eb = (struct extent_buffer *)page->private; 5900 BUG_ON(!eb); 5901 5902 /* 5903 * This is a little awful but should be ok, we need to make sure that 5904 * the eb doesn't disappear out from under us while we're looking at 5905 * this page. 5906 */ 5907 spin_lock(&eb->refs_lock); 5908 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 5909 spin_unlock(&eb->refs_lock); 5910 spin_unlock(&page->mapping->private_lock); 5911 return 0; 5912 } 5913 spin_unlock(&page->mapping->private_lock); 5914 5915 /* 5916 * If tree ref isn't set then we know the ref on this eb is a real ref, 5917 * so just return, this page will likely be freed soon anyway. 5918 */ 5919 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 5920 spin_unlock(&eb->refs_lock); 5921 return 0; 5922 } 5923 5924 return release_extent_buffer(eb); 5925 } 5926