1 #include <linux/bitops.h> 2 #include <linux/slab.h> 3 #include <linux/bio.h> 4 #include <linux/mm.h> 5 #include <linux/pagemap.h> 6 #include <linux/page-flags.h> 7 #include <linux/spinlock.h> 8 #include <linux/blkdev.h> 9 #include <linux/swap.h> 10 #include <linux/writeback.h> 11 #include <linux/pagevec.h> 12 #include <linux/prefetch.h> 13 #include <linux/cleancache.h> 14 #include "extent_io.h" 15 #include "extent_map.h" 16 #include "ctree.h" 17 #include "btrfs_inode.h" 18 #include "volumes.h" 19 #include "check-integrity.h" 20 #include "locking.h" 21 #include "rcu-string.h" 22 #include "backref.h" 23 #include "transaction.h" 24 25 static struct kmem_cache *extent_state_cache; 26 static struct kmem_cache *extent_buffer_cache; 27 static struct bio_set *btrfs_bioset; 28 29 static inline bool extent_state_in_tree(const struct extent_state *state) 30 { 31 return !RB_EMPTY_NODE(&state->rb_node); 32 } 33 34 #ifdef CONFIG_BTRFS_DEBUG 35 static LIST_HEAD(buffers); 36 static LIST_HEAD(states); 37 38 static DEFINE_SPINLOCK(leak_lock); 39 40 static inline 41 void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) 42 { 43 unsigned long flags; 44 45 spin_lock_irqsave(&leak_lock, flags); 46 list_add(new, head); 47 spin_unlock_irqrestore(&leak_lock, flags); 48 } 49 50 static inline 51 void btrfs_leak_debug_del(struct list_head *entry) 52 { 53 unsigned long flags; 54 55 spin_lock_irqsave(&leak_lock, flags); 56 list_del(entry); 57 spin_unlock_irqrestore(&leak_lock, flags); 58 } 59 60 static inline 61 void btrfs_leak_debug_check(void) 62 { 63 struct extent_state *state; 64 struct extent_buffer *eb; 65 66 while (!list_empty(&states)) { 67 state = list_entry(states.next, struct extent_state, leak_list); 68 pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", 69 state->start, state->end, state->state, 70 extent_state_in_tree(state), 71 atomic_read(&state->refs)); 72 list_del(&state->leak_list); 73 kmem_cache_free(extent_state_cache, state); 74 } 75 76 while (!list_empty(&buffers)) { 77 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 78 pr_err("BTRFS: buffer leak start %llu len %lu refs %d\n", 79 eb->start, eb->len, atomic_read(&eb->refs)); 80 list_del(&eb->leak_list); 81 kmem_cache_free(extent_buffer_cache, eb); 82 } 83 } 84 85 #define btrfs_debug_check_extent_io_range(tree, start, end) \ 86 __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) 87 static inline void __btrfs_debug_check_extent_io_range(const char *caller, 88 struct extent_io_tree *tree, u64 start, u64 end) 89 { 90 struct inode *inode; 91 u64 isize; 92 93 if (!tree->mapping) 94 return; 95 96 inode = tree->mapping->host; 97 isize = i_size_read(inode); 98 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { 99 btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, 100 "%s: ino %llu isize %llu odd range [%llu,%llu]", 101 caller, btrfs_ino(inode), isize, start, end); 102 } 103 } 104 #else 105 #define btrfs_leak_debug_add(new, head) do {} while (0) 106 #define btrfs_leak_debug_del(entry) do {} while (0) 107 #define btrfs_leak_debug_check() do {} while (0) 108 #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) 109 #endif 110 111 #define BUFFER_LRU_MAX 64 112 113 struct tree_entry { 114 u64 start; 115 u64 end; 116 struct rb_node rb_node; 117 }; 118 119 struct extent_page_data { 120 struct bio *bio; 121 struct extent_io_tree *tree; 122 get_extent_t *get_extent; 123 unsigned long bio_flags; 124 125 /* tells writepage not to lock the state bits for this range 126 
* it still does the unlocking 127 */ 128 unsigned int extent_locked:1; 129 130 /* tells the submit_bio code to use REQ_SYNC */ 131 unsigned int sync_io:1; 132 }; 133 134 static void add_extent_changeset(struct extent_state *state, unsigned bits, 135 struct extent_changeset *changeset, 136 int set) 137 { 138 int ret; 139 140 if (!changeset) 141 return; 142 if (set && (state->state & bits) == bits) 143 return; 144 if (!set && (state->state & bits) == 0) 145 return; 146 changeset->bytes_changed += state->end - state->start + 1; 147 ret = ulist_add(changeset->range_changed, state->start, state->end, 148 GFP_ATOMIC); 149 /* ENOMEM */ 150 BUG_ON(ret < 0); 151 } 152 153 static noinline void flush_write_bio(void *data); 154 static inline struct btrfs_fs_info * 155 tree_fs_info(struct extent_io_tree *tree) 156 { 157 if (!tree->mapping) 158 return NULL; 159 return btrfs_sb(tree->mapping->host->i_sb); 160 } 161 162 int __init extent_io_init(void) 163 { 164 extent_state_cache = kmem_cache_create("btrfs_extent_state", 165 sizeof(struct extent_state), 0, 166 SLAB_MEM_SPREAD, NULL); 167 if (!extent_state_cache) 168 return -ENOMEM; 169 170 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 171 sizeof(struct extent_buffer), 0, 172 SLAB_MEM_SPREAD, NULL); 173 if (!extent_buffer_cache) 174 goto free_state_cache; 175 176 btrfs_bioset = bioset_create(BIO_POOL_SIZE, 177 offsetof(struct btrfs_io_bio, bio)); 178 if (!btrfs_bioset) 179 goto free_buffer_cache; 180 181 if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE)) 182 goto free_bioset; 183 184 return 0; 185 186 free_bioset: 187 bioset_free(btrfs_bioset); 188 btrfs_bioset = NULL; 189 190 free_buffer_cache: 191 kmem_cache_destroy(extent_buffer_cache); 192 extent_buffer_cache = NULL; 193 194 free_state_cache: 195 kmem_cache_destroy(extent_state_cache); 196 extent_state_cache = NULL; 197 return -ENOMEM; 198 } 199 200 void extent_io_exit(void) 201 { 202 btrfs_leak_debug_check(); 203 204 /* 205 * Make sure all delayed rcu free are flushed before we 206 * destroy caches. 
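 * (extent_buffer structs are freed through call_rcu(), so destroying the
 * caches without the barrier below could race with callbacks that are
 * still pending.)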
207 */ 208 rcu_barrier(); 209 kmem_cache_destroy(extent_state_cache); 210 kmem_cache_destroy(extent_buffer_cache); 211 if (btrfs_bioset) 212 bioset_free(btrfs_bioset); 213 } 214 215 void extent_io_tree_init(struct extent_io_tree *tree, 216 struct address_space *mapping) 217 { 218 tree->state = RB_ROOT; 219 tree->ops = NULL; 220 tree->dirty_bytes = 0; 221 spin_lock_init(&tree->lock); 222 tree->mapping = mapping; 223 } 224 225 static struct extent_state *alloc_extent_state(gfp_t mask) 226 { 227 struct extent_state *state; 228 229 state = kmem_cache_alloc(extent_state_cache, mask); 230 if (!state) 231 return state; 232 state->state = 0; 233 state->failrec = NULL; 234 RB_CLEAR_NODE(&state->rb_node); 235 btrfs_leak_debug_add(&state->leak_list, &states); 236 atomic_set(&state->refs, 1); 237 init_waitqueue_head(&state->wq); 238 trace_alloc_extent_state(state, mask, _RET_IP_); 239 return state; 240 } 241 242 void free_extent_state(struct extent_state *state) 243 { 244 if (!state) 245 return; 246 if (atomic_dec_and_test(&state->refs)) { 247 WARN_ON(extent_state_in_tree(state)); 248 btrfs_leak_debug_del(&state->leak_list); 249 trace_free_extent_state(state, _RET_IP_); 250 kmem_cache_free(extent_state_cache, state); 251 } 252 } 253 254 static struct rb_node *tree_insert(struct rb_root *root, 255 struct rb_node *search_start, 256 u64 offset, 257 struct rb_node *node, 258 struct rb_node ***p_in, 259 struct rb_node **parent_in) 260 { 261 struct rb_node **p; 262 struct rb_node *parent = NULL; 263 struct tree_entry *entry; 264 265 if (p_in && parent_in) { 266 p = *p_in; 267 parent = *parent_in; 268 goto do_insert; 269 } 270 271 p = search_start ? &search_start : &root->rb_node; 272 while (*p) { 273 parent = *p; 274 entry = rb_entry(parent, struct tree_entry, rb_node); 275 276 if (offset < entry->start) 277 p = &(*p)->rb_left; 278 else if (offset > entry->end) 279 p = &(*p)->rb_right; 280 else 281 return parent; 282 } 283 284 do_insert: 285 rb_link_node(node, parent, p); 286 rb_insert_color(node, root); 287 return NULL; 288 } 289 290 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 291 struct rb_node **prev_ret, 292 struct rb_node **next_ret, 293 struct rb_node ***p_ret, 294 struct rb_node **parent_ret) 295 { 296 struct rb_root *root = &tree->state; 297 struct rb_node **n = &root->rb_node; 298 struct rb_node *prev = NULL; 299 struct rb_node *orig_prev = NULL; 300 struct tree_entry *entry; 301 struct tree_entry *prev_entry = NULL; 302 303 while (*n) { 304 prev = *n; 305 entry = rb_entry(prev, struct tree_entry, rb_node); 306 prev_entry = entry; 307 308 if (offset < entry->start) 309 n = &(*n)->rb_left; 310 else if (offset > entry->end) 311 n = &(*n)->rb_right; 312 else 313 return *n; 314 } 315 316 if (p_ret) 317 *p_ret = n; 318 if (parent_ret) 319 *parent_ret = prev; 320 321 if (prev_ret) { 322 orig_prev = prev; 323 while (prev && offset > prev_entry->end) { 324 prev = rb_next(prev); 325 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 326 } 327 *prev_ret = prev; 328 prev = orig_prev; 329 } 330 331 if (next_ret) { 332 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 333 while (prev && offset < prev_entry->start) { 334 prev = rb_prev(prev); 335 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 336 } 337 *next_ret = prev; 338 } 339 return NULL; 340 } 341 342 static inline struct rb_node * 343 tree_search_for_insert(struct extent_io_tree *tree, 344 u64 offset, 345 struct rb_node ***p_ret, 346 struct rb_node **parent_ret) 347 { 348 struct rb_node *prev = 
NULL; 349 struct rb_node *ret; 350 351 ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); 352 if (!ret) 353 return prev; 354 return ret; 355 } 356 357 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 358 u64 offset) 359 { 360 return tree_search_for_insert(tree, offset, NULL, NULL); 361 } 362 363 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 364 struct extent_state *other) 365 { 366 if (tree->ops && tree->ops->merge_extent_hook) 367 tree->ops->merge_extent_hook(tree->mapping->host, new, 368 other); 369 } 370 371 /* 372 * utility function to look for merge candidates inside a given range. 373 * Any extents with matching state are merged together into a single 374 * extent in the tree. Extents with EXTENT_IO in their state field 375 * are not merged because the end_io handlers need to be able to do 376 * operations on them without sleeping (or doing allocations/splits). 377 * 378 * This should be called with the tree lock held. 379 */ 380 static void merge_state(struct extent_io_tree *tree, 381 struct extent_state *state) 382 { 383 struct extent_state *other; 384 struct rb_node *other_node; 385 386 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 387 return; 388 389 other_node = rb_prev(&state->rb_node); 390 if (other_node) { 391 other = rb_entry(other_node, struct extent_state, rb_node); 392 if (other->end == state->start - 1 && 393 other->state == state->state) { 394 merge_cb(tree, state, other); 395 state->start = other->start; 396 rb_erase(&other->rb_node, &tree->state); 397 RB_CLEAR_NODE(&other->rb_node); 398 free_extent_state(other); 399 } 400 } 401 other_node = rb_next(&state->rb_node); 402 if (other_node) { 403 other = rb_entry(other_node, struct extent_state, rb_node); 404 if (other->start == state->end + 1 && 405 other->state == state->state) { 406 merge_cb(tree, state, other); 407 state->end = other->end; 408 rb_erase(&other->rb_node, &tree->state); 409 RB_CLEAR_NODE(&other->rb_node); 410 free_extent_state(other); 411 } 412 } 413 } 414 415 static void set_state_cb(struct extent_io_tree *tree, 416 struct extent_state *state, unsigned *bits) 417 { 418 if (tree->ops && tree->ops->set_bit_hook) 419 tree->ops->set_bit_hook(tree->mapping->host, state, bits); 420 } 421 422 static void clear_state_cb(struct extent_io_tree *tree, 423 struct extent_state *state, unsigned *bits) 424 { 425 if (tree->ops && tree->ops->clear_bit_hook) 426 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 427 } 428 429 static void set_state_bits(struct extent_io_tree *tree, 430 struct extent_state *state, unsigned *bits, 431 struct extent_changeset *changeset); 432 433 /* 434 * insert an extent_state struct into the tree. 'bits' are set on the 435 * struct before it is inserted. 436 * 437 * This may return -EEXIST if the extent is already there, in which case the 438 * state struct is freed. 439 * 440 * The tree lock is not taken internally. This is a utility function and 441 * probably isn't what you want to call (see set/clear_extent_bit). 
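 * On success the requested bits have already been applied through
 * set_state_bits() and the freshly inserted state may be merged with
 * adjacent states carrying identical bits (see merge_state()).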
442 */ 443 static int insert_state(struct extent_io_tree *tree, 444 struct extent_state *state, u64 start, u64 end, 445 struct rb_node ***p, 446 struct rb_node **parent, 447 unsigned *bits, struct extent_changeset *changeset) 448 { 449 struct rb_node *node; 450 451 if (end < start) 452 WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", 453 end, start); 454 state->start = start; 455 state->end = end; 456 457 set_state_bits(tree, state, bits, changeset); 458 459 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); 460 if (node) { 461 struct extent_state *found; 462 found = rb_entry(node, struct extent_state, rb_node); 463 pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n", 464 found->start, found->end, start, end); 465 return -EEXIST; 466 } 467 merge_state(tree, state); 468 return 0; 469 } 470 471 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, 472 u64 split) 473 { 474 if (tree->ops && tree->ops->split_extent_hook) 475 tree->ops->split_extent_hook(tree->mapping->host, orig, split); 476 } 477 478 /* 479 * split a given extent state struct in two, inserting the preallocated 480 * struct 'prealloc' as the newly created second half. 'split' indicates an 481 * offset inside 'orig' where it should be split. 482 * 483 * Before calling, 484 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 485 * are two extent state structs in the tree: 486 * prealloc: [orig->start, split - 1] 487 * orig: [ split, orig->end ] 488 * 489 * The tree locks are not taken by this function. They need to be held 490 * by the caller. 491 */ 492 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 493 struct extent_state *prealloc, u64 split) 494 { 495 struct rb_node *node; 496 497 split_cb(tree, orig, split); 498 499 prealloc->start = orig->start; 500 prealloc->end = split - 1; 501 prealloc->state = orig->state; 502 orig->start = split; 503 504 node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, 505 &prealloc->rb_node, NULL, NULL); 506 if (node) { 507 free_extent_state(prealloc); 508 return -EEXIST; 509 } 510 return 0; 511 } 512 513 static struct extent_state *next_state(struct extent_state *state) 514 { 515 struct rb_node *next = rb_next(&state->rb_node); 516 if (next) 517 return rb_entry(next, struct extent_state, rb_node); 518 else 519 return NULL; 520 } 521 522 /* 523 * utility function to clear some bits in an extent state struct. 524 * it will optionally wake up any one waiting on this state (wake == 1). 
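 * The next extent_state in the tree (or NULL) is returned so that callers
 * can keep walking without doing another tree search.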
525 * 526 * If no bits are set on the state struct after clearing things, the 527 * struct is freed and removed from the tree 528 */ 529 static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 530 struct extent_state *state, 531 unsigned *bits, int wake, 532 struct extent_changeset *changeset) 533 { 534 struct extent_state *next; 535 unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; 536 537 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 538 u64 range = state->end - state->start + 1; 539 WARN_ON(range > tree->dirty_bytes); 540 tree->dirty_bytes -= range; 541 } 542 clear_state_cb(tree, state, bits); 543 add_extent_changeset(state, bits_to_clear, changeset, 0); 544 state->state &= ~bits_to_clear; 545 if (wake) 546 wake_up(&state->wq); 547 if (state->state == 0) { 548 next = next_state(state); 549 if (extent_state_in_tree(state)) { 550 rb_erase(&state->rb_node, &tree->state); 551 RB_CLEAR_NODE(&state->rb_node); 552 free_extent_state(state); 553 } else { 554 WARN_ON(1); 555 } 556 } else { 557 merge_state(tree, state); 558 next = next_state(state); 559 } 560 return next; 561 } 562 563 static struct extent_state * 564 alloc_extent_state_atomic(struct extent_state *prealloc) 565 { 566 if (!prealloc) 567 prealloc = alloc_extent_state(GFP_ATOMIC); 568 569 return prealloc; 570 } 571 572 static void extent_io_tree_panic(struct extent_io_tree *tree, int err) 573 { 574 btrfs_panic(tree_fs_info(tree), err, 575 "Locking error: Extent tree was modified by another thread while locked."); 576 } 577 578 /* 579 * clear some bits on a range in the tree. This may require splitting 580 * or inserting elements in the tree, so the gfp mask is used to 581 * indicate which allocations or sleeping are allowed. 582 * 583 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 584 * the given range from the tree regardless of state (ie for truncate). 585 * 586 * the range [start, end] is inclusive. 587 * 588 * This takes the tree lock, and returns 0 on success and < 0 on error. 589 */ 590 static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 591 unsigned bits, int wake, int delete, 592 struct extent_state **cached_state, 593 gfp_t mask, struct extent_changeset *changeset) 594 { 595 struct extent_state *state; 596 struct extent_state *cached; 597 struct extent_state *prealloc = NULL; 598 struct rb_node *node; 599 u64 last_end; 600 int err; 601 int clear = 0; 602 603 btrfs_debug_check_extent_io_range(tree, start, end); 604 605 if (bits & EXTENT_DELALLOC) 606 bits |= EXTENT_NORESERVE; 607 608 if (delete) 609 bits |= ~EXTENT_CTLBITS; 610 bits |= EXTENT_FIRST_DELALLOC; 611 612 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 613 clear = 1; 614 again: 615 if (!prealloc && gfpflags_allow_blocking(mask)) { 616 /* 617 * Don't care for allocation failure here because we might end 618 * up not needing the pre-allocated extent state at all, which 619 * is the case if we only have in the tree extent states that 620 * cover our input range and don't cover too any other range. 621 * If we end up needing a new extent state we allocate it later. 
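 * If a split turns out to be required after all, alloc_extent_state_atomic()
 * retries the allocation with GFP_ATOMIC while the tree lock is held.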
622 */ 623 prealloc = alloc_extent_state(mask); 624 } 625 626 spin_lock(&tree->lock); 627 if (cached_state) { 628 cached = *cached_state; 629 630 if (clear) { 631 *cached_state = NULL; 632 cached_state = NULL; 633 } 634 635 if (cached && extent_state_in_tree(cached) && 636 cached->start <= start && cached->end > start) { 637 if (clear) 638 atomic_dec(&cached->refs); 639 state = cached; 640 goto hit_next; 641 } 642 if (clear) 643 free_extent_state(cached); 644 } 645 /* 646 * this search will find the extents that end after 647 * our range starts 648 */ 649 node = tree_search(tree, start); 650 if (!node) 651 goto out; 652 state = rb_entry(node, struct extent_state, rb_node); 653 hit_next: 654 if (state->start > end) 655 goto out; 656 WARN_ON(state->end < start); 657 last_end = state->end; 658 659 /* the state doesn't have the wanted bits, go ahead */ 660 if (!(state->state & bits)) { 661 state = next_state(state); 662 goto next; 663 } 664 665 /* 666 * | ---- desired range ---- | 667 * | state | or 668 * | ------------- state -------------- | 669 * 670 * We need to split the extent we found, and may flip 671 * bits on second half. 672 * 673 * If the extent we found extends past our range, we 674 * just split and search again. It'll get split again 675 * the next time though. 676 * 677 * If the extent we found is inside our range, we clear 678 * the desired bit on it. 679 */ 680 681 if (state->start < start) { 682 prealloc = alloc_extent_state_atomic(prealloc); 683 BUG_ON(!prealloc); 684 err = split_state(tree, state, prealloc, start); 685 if (err) 686 extent_io_tree_panic(tree, err); 687 688 prealloc = NULL; 689 if (err) 690 goto out; 691 if (state->end <= end) { 692 state = clear_state_bit(tree, state, &bits, wake, 693 changeset); 694 goto next; 695 } 696 goto search_again; 697 } 698 /* 699 * | ---- desired range ---- | 700 * | state | 701 * We need to split the extent, and clear the bit 702 * on the first half 703 */ 704 if (state->start <= end && state->end > end) { 705 prealloc = alloc_extent_state_atomic(prealloc); 706 BUG_ON(!prealloc); 707 err = split_state(tree, state, prealloc, end + 1); 708 if (err) 709 extent_io_tree_panic(tree, err); 710 711 if (wake) 712 wake_up(&state->wq); 713 714 clear_state_bit(tree, prealloc, &bits, wake, changeset); 715 716 prealloc = NULL; 717 goto out; 718 } 719 720 state = clear_state_bit(tree, state, &bits, wake, changeset); 721 next: 722 if (last_end == (u64)-1) 723 goto out; 724 start = last_end + 1; 725 if (start <= end && state && !need_resched()) 726 goto hit_next; 727 728 search_again: 729 if (start > end) 730 goto out; 731 spin_unlock(&tree->lock); 732 if (gfpflags_allow_blocking(mask)) 733 cond_resched(); 734 goto again; 735 736 out: 737 spin_unlock(&tree->lock); 738 if (prealloc) 739 free_extent_state(prealloc); 740 741 return 0; 742 743 } 744 745 static void wait_on_state(struct extent_io_tree *tree, 746 struct extent_state *state) 747 __releases(tree->lock) 748 __acquires(tree->lock) 749 { 750 DEFINE_WAIT(wait); 751 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 752 spin_unlock(&tree->lock); 753 schedule(); 754 spin_lock(&tree->lock); 755 finish_wait(&state->wq, &wait); 756 } 757 758 /* 759 * waits for one or more bits to clear on a range in the state tree. 760 * The range [start, end] is inclusive. 
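 * Any state we sleep on is pinned with an extra reference so it cannot be
 * freed while we wait.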
761 * The tree lock is taken by this function 762 */ 763 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 764 unsigned long bits) 765 { 766 struct extent_state *state; 767 struct rb_node *node; 768 769 btrfs_debug_check_extent_io_range(tree, start, end); 770 771 spin_lock(&tree->lock); 772 again: 773 while (1) { 774 /* 775 * this search will find all the extents that end after 776 * our range starts 777 */ 778 node = tree_search(tree, start); 779 process_node: 780 if (!node) 781 break; 782 783 state = rb_entry(node, struct extent_state, rb_node); 784 785 if (state->start > end) 786 goto out; 787 788 if (state->state & bits) { 789 start = state->start; 790 atomic_inc(&state->refs); 791 wait_on_state(tree, state); 792 free_extent_state(state); 793 goto again; 794 } 795 start = state->end + 1; 796 797 if (start > end) 798 break; 799 800 if (!cond_resched_lock(&tree->lock)) { 801 node = rb_next(node); 802 goto process_node; 803 } 804 } 805 out: 806 spin_unlock(&tree->lock); 807 } 808 809 static void set_state_bits(struct extent_io_tree *tree, 810 struct extent_state *state, 811 unsigned *bits, struct extent_changeset *changeset) 812 { 813 unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; 814 815 set_state_cb(tree, state, bits); 816 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 817 u64 range = state->end - state->start + 1; 818 tree->dirty_bytes += range; 819 } 820 add_extent_changeset(state, bits_to_set, changeset, 1); 821 state->state |= bits_to_set; 822 } 823 824 static void cache_state_if_flags(struct extent_state *state, 825 struct extent_state **cached_ptr, 826 unsigned flags) 827 { 828 if (cached_ptr && !(*cached_ptr)) { 829 if (!flags || (state->state & flags)) { 830 *cached_ptr = state; 831 atomic_inc(&state->refs); 832 } 833 } 834 } 835 836 static void cache_state(struct extent_state *state, 837 struct extent_state **cached_ptr) 838 { 839 return cache_state_if_flags(state, cached_ptr, 840 EXTENT_IOBITS | EXTENT_BOUNDARY); 841 } 842 843 /* 844 * set some bits on a range in the tree. This may require allocations or 845 * sleeping, so the gfp mask is used to indicate what is allowed. 846 * 847 * If any of the exclusive bits are set, this will fail with -EEXIST if some 848 * part of the range already has the desired bits set. The start of the 849 * existing range is returned in failed_start in this case. 850 * 851 * [start, end] is inclusive This takes the tree lock. 852 */ 853 854 static int __must_check 855 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 856 unsigned bits, unsigned exclusive_bits, 857 u64 *failed_start, struct extent_state **cached_state, 858 gfp_t mask, struct extent_changeset *changeset) 859 { 860 struct extent_state *state; 861 struct extent_state *prealloc = NULL; 862 struct rb_node *node; 863 struct rb_node **p; 864 struct rb_node *parent; 865 int err = 0; 866 u64 last_start; 867 u64 last_end; 868 869 btrfs_debug_check_extent_io_range(tree, start, end); 870 871 bits |= EXTENT_FIRST_DELALLOC; 872 again: 873 if (!prealloc && gfpflags_allow_blocking(mask)) { 874 /* 875 * Don't care for allocation failure here because we might end 876 * up not needing the pre-allocated extent state at all, which 877 * is the case if we only have in the tree extent states that 878 * cover our input range and don't cover too any other range. 879 * If we end up needing a new extent state we allocate it later. 
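 * If one is needed and even the GFP_ATOMIC fallback fails, the BUG_ON()
 * below fires; a preallocation that ends up unused is freed at 'out'.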
880 */ 881 prealloc = alloc_extent_state(mask); 882 } 883 884 spin_lock(&tree->lock); 885 if (cached_state && *cached_state) { 886 state = *cached_state; 887 if (state->start <= start && state->end > start && 888 extent_state_in_tree(state)) { 889 node = &state->rb_node; 890 goto hit_next; 891 } 892 } 893 /* 894 * this search will find all the extents that end after 895 * our range starts. 896 */ 897 node = tree_search_for_insert(tree, start, &p, &parent); 898 if (!node) { 899 prealloc = alloc_extent_state_atomic(prealloc); 900 BUG_ON(!prealloc); 901 err = insert_state(tree, prealloc, start, end, 902 &p, &parent, &bits, changeset); 903 if (err) 904 extent_io_tree_panic(tree, err); 905 906 cache_state(prealloc, cached_state); 907 prealloc = NULL; 908 goto out; 909 } 910 state = rb_entry(node, struct extent_state, rb_node); 911 hit_next: 912 last_start = state->start; 913 last_end = state->end; 914 915 /* 916 * | ---- desired range ---- | 917 * | state | 918 * 919 * Just lock what we found and keep going 920 */ 921 if (state->start == start && state->end <= end) { 922 if (state->state & exclusive_bits) { 923 *failed_start = state->start; 924 err = -EEXIST; 925 goto out; 926 } 927 928 set_state_bits(tree, state, &bits, changeset); 929 cache_state(state, cached_state); 930 merge_state(tree, state); 931 if (last_end == (u64)-1) 932 goto out; 933 start = last_end + 1; 934 state = next_state(state); 935 if (start < end && state && state->start == start && 936 !need_resched()) 937 goto hit_next; 938 goto search_again; 939 } 940 941 /* 942 * | ---- desired range ---- | 943 * | state | 944 * or 945 * | ------------- state -------------- | 946 * 947 * We need to split the extent we found, and may flip bits on 948 * second half. 949 * 950 * If the extent we found extends past our 951 * range, we just split and search again. It'll get split 952 * again the next time though. 953 * 954 * If the extent we found is inside our range, we set the 955 * desired bit on it. 956 */ 957 if (state->start < start) { 958 if (state->state & exclusive_bits) { 959 *failed_start = start; 960 err = -EEXIST; 961 goto out; 962 } 963 964 prealloc = alloc_extent_state_atomic(prealloc); 965 BUG_ON(!prealloc); 966 err = split_state(tree, state, prealloc, start); 967 if (err) 968 extent_io_tree_panic(tree, err); 969 970 prealloc = NULL; 971 if (err) 972 goto out; 973 if (state->end <= end) { 974 set_state_bits(tree, state, &bits, changeset); 975 cache_state(state, cached_state); 976 merge_state(tree, state); 977 if (last_end == (u64)-1) 978 goto out; 979 start = last_end + 1; 980 state = next_state(state); 981 if (start < end && state && state->start == start && 982 !need_resched()) 983 goto hit_next; 984 } 985 goto search_again; 986 } 987 /* 988 * | ---- desired range ---- | 989 * | state | or | state | 990 * 991 * There's a hole, we need to insert something in it and 992 * ignore the extent we found. 993 */ 994 if (state->start > start) { 995 u64 this_end; 996 if (end < last_start) 997 this_end = end; 998 else 999 this_end = last_start - 1; 1000 1001 prealloc = alloc_extent_state_atomic(prealloc); 1002 BUG_ON(!prealloc); 1003 1004 /* 1005 * Avoid to free 'prealloc' if it can be merged with 1006 * the later extent. 
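 * insert_state() may merge the new state with the one that follows it;
 * either way the walk resumes at this_end + 1.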
1007 */ 1008 err = insert_state(tree, prealloc, start, this_end, 1009 NULL, NULL, &bits, changeset); 1010 if (err) 1011 extent_io_tree_panic(tree, err); 1012 1013 cache_state(prealloc, cached_state); 1014 prealloc = NULL; 1015 start = this_end + 1; 1016 goto search_again; 1017 } 1018 /* 1019 * | ---- desired range ---- | 1020 * | state | 1021 * We need to split the extent, and set the bit 1022 * on the first half 1023 */ 1024 if (state->start <= end && state->end > end) { 1025 if (state->state & exclusive_bits) { 1026 *failed_start = start; 1027 err = -EEXIST; 1028 goto out; 1029 } 1030 1031 prealloc = alloc_extent_state_atomic(prealloc); 1032 BUG_ON(!prealloc); 1033 err = split_state(tree, state, prealloc, end + 1); 1034 if (err) 1035 extent_io_tree_panic(tree, err); 1036 1037 set_state_bits(tree, prealloc, &bits, changeset); 1038 cache_state(prealloc, cached_state); 1039 merge_state(tree, prealloc); 1040 prealloc = NULL; 1041 goto out; 1042 } 1043 1044 search_again: 1045 if (start > end) 1046 goto out; 1047 spin_unlock(&tree->lock); 1048 if (gfpflags_allow_blocking(mask)) 1049 cond_resched(); 1050 goto again; 1051 1052 out: 1053 spin_unlock(&tree->lock); 1054 if (prealloc) 1055 free_extent_state(prealloc); 1056 1057 return err; 1058 1059 } 1060 1061 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1062 unsigned bits, u64 * failed_start, 1063 struct extent_state **cached_state, gfp_t mask) 1064 { 1065 return __set_extent_bit(tree, start, end, bits, 0, failed_start, 1066 cached_state, mask, NULL); 1067 } 1068 1069 1070 /** 1071 * convert_extent_bit - convert all bits in a given range from one bit to 1072 * another 1073 * @tree: the io tree to search 1074 * @start: the start offset in bytes 1075 * @end: the end offset in bytes (inclusive) 1076 * @bits: the bits to set in this range 1077 * @clear_bits: the bits to clear in this range 1078 * @cached_state: state that we're going to cache 1079 * 1080 * This will go through and set bits for the given range. If any states exist 1081 * already in this range they are set with the given bit and cleared of the 1082 * clear_bits. This is only meant to be used by things that are mergeable, ie 1083 * converting from say DELALLOC to DIRTY. This is not meant to be used with 1084 * boundary bits like LOCK. 1085 * 1086 * All allocations are done with GFP_NOFS. 1087 */ 1088 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1089 unsigned bits, unsigned clear_bits, 1090 struct extent_state **cached_state) 1091 { 1092 struct extent_state *state; 1093 struct extent_state *prealloc = NULL; 1094 struct rb_node *node; 1095 struct rb_node **p; 1096 struct rb_node *parent; 1097 int err = 0; 1098 u64 last_start; 1099 u64 last_end; 1100 bool first_iteration = true; 1101 1102 btrfs_debug_check_extent_io_range(tree, start, end); 1103 1104 again: 1105 if (!prealloc) { 1106 /* 1107 * Best effort, don't worry if extent state allocation fails 1108 * here for the first iteration. We might have a cached state 1109 * that matches exactly the target range, in which case no 1110 * extent state allocations are needed. We'll only know this 1111 * after locking the tree. 
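 * On every iteration after the first, a failed allocation here is
 * reported back to the caller as -ENOMEM.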
1112 */ 1113 prealloc = alloc_extent_state(GFP_NOFS); 1114 if (!prealloc && !first_iteration) 1115 return -ENOMEM; 1116 } 1117 1118 spin_lock(&tree->lock); 1119 if (cached_state && *cached_state) { 1120 state = *cached_state; 1121 if (state->start <= start && state->end > start && 1122 extent_state_in_tree(state)) { 1123 node = &state->rb_node; 1124 goto hit_next; 1125 } 1126 } 1127 1128 /* 1129 * this search will find all the extents that end after 1130 * our range starts. 1131 */ 1132 node = tree_search_for_insert(tree, start, &p, &parent); 1133 if (!node) { 1134 prealloc = alloc_extent_state_atomic(prealloc); 1135 if (!prealloc) { 1136 err = -ENOMEM; 1137 goto out; 1138 } 1139 err = insert_state(tree, prealloc, start, end, 1140 &p, &parent, &bits, NULL); 1141 if (err) 1142 extent_io_tree_panic(tree, err); 1143 cache_state(prealloc, cached_state); 1144 prealloc = NULL; 1145 goto out; 1146 } 1147 state = rb_entry(node, struct extent_state, rb_node); 1148 hit_next: 1149 last_start = state->start; 1150 last_end = state->end; 1151 1152 /* 1153 * | ---- desired range ---- | 1154 * | state | 1155 * 1156 * Just lock what we found and keep going 1157 */ 1158 if (state->start == start && state->end <= end) { 1159 set_state_bits(tree, state, &bits, NULL); 1160 cache_state(state, cached_state); 1161 state = clear_state_bit(tree, state, &clear_bits, 0, NULL); 1162 if (last_end == (u64)-1) 1163 goto out; 1164 start = last_end + 1; 1165 if (start < end && state && state->start == start && 1166 !need_resched()) 1167 goto hit_next; 1168 goto search_again; 1169 } 1170 1171 /* 1172 * | ---- desired range ---- | 1173 * | state | 1174 * or 1175 * | ------------- state -------------- | 1176 * 1177 * We need to split the extent we found, and may flip bits on 1178 * second half. 1179 * 1180 * If the extent we found extends past our 1181 * range, we just split and search again. It'll get split 1182 * again the next time though. 1183 * 1184 * If the extent we found is inside our range, we set the 1185 * desired bit on it. 1186 */ 1187 if (state->start < start) { 1188 prealloc = alloc_extent_state_atomic(prealloc); 1189 if (!prealloc) { 1190 err = -ENOMEM; 1191 goto out; 1192 } 1193 err = split_state(tree, state, prealloc, start); 1194 if (err) 1195 extent_io_tree_panic(tree, err); 1196 prealloc = NULL; 1197 if (err) 1198 goto out; 1199 if (state->end <= end) { 1200 set_state_bits(tree, state, &bits, NULL); 1201 cache_state(state, cached_state); 1202 state = clear_state_bit(tree, state, &clear_bits, 0, 1203 NULL); 1204 if (last_end == (u64)-1) 1205 goto out; 1206 start = last_end + 1; 1207 if (start < end && state && state->start == start && 1208 !need_resched()) 1209 goto hit_next; 1210 } 1211 goto search_again; 1212 } 1213 /* 1214 * | ---- desired range ---- | 1215 * | state | or | state | 1216 * 1217 * There's a hole, we need to insert something in it and 1218 * ignore the extent we found. 1219 */ 1220 if (state->start > start) { 1221 u64 this_end; 1222 if (end < last_start) 1223 this_end = end; 1224 else 1225 this_end = last_start - 1; 1226 1227 prealloc = alloc_extent_state_atomic(prealloc); 1228 if (!prealloc) { 1229 err = -ENOMEM; 1230 goto out; 1231 } 1232 1233 /* 1234 * Avoid to free 'prealloc' if it can be merged with 1235 * the later extent. 
1236 */ 1237 err = insert_state(tree, prealloc, start, this_end, 1238 NULL, NULL, &bits, NULL); 1239 if (err) 1240 extent_io_tree_panic(tree, err); 1241 cache_state(prealloc, cached_state); 1242 prealloc = NULL; 1243 start = this_end + 1; 1244 goto search_again; 1245 } 1246 /* 1247 * | ---- desired range ---- | 1248 * | state | 1249 * We need to split the extent, and set the bit 1250 * on the first half 1251 */ 1252 if (state->start <= end && state->end > end) { 1253 prealloc = alloc_extent_state_atomic(prealloc); 1254 if (!prealloc) { 1255 err = -ENOMEM; 1256 goto out; 1257 } 1258 1259 err = split_state(tree, state, prealloc, end + 1); 1260 if (err) 1261 extent_io_tree_panic(tree, err); 1262 1263 set_state_bits(tree, prealloc, &bits, NULL); 1264 cache_state(prealloc, cached_state); 1265 clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); 1266 prealloc = NULL; 1267 goto out; 1268 } 1269 1270 search_again: 1271 if (start > end) 1272 goto out; 1273 spin_unlock(&tree->lock); 1274 cond_resched(); 1275 first_iteration = false; 1276 goto again; 1277 1278 out: 1279 spin_unlock(&tree->lock); 1280 if (prealloc) 1281 free_extent_state(prealloc); 1282 1283 return err; 1284 } 1285 1286 /* wrappers around set/clear extent bit */ 1287 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1288 unsigned bits, struct extent_changeset *changeset) 1289 { 1290 /* 1291 * We don't support EXTENT_LOCKED yet, as current changeset will 1292 * record any bits changed, so for EXTENT_LOCKED case, it will 1293 * either fail with -EEXIST or changeset will record the whole 1294 * range. 1295 */ 1296 BUG_ON(bits & EXTENT_LOCKED); 1297 1298 return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, 1299 changeset); 1300 } 1301 1302 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1303 unsigned bits, int wake, int delete, 1304 struct extent_state **cached, gfp_t mask) 1305 { 1306 return __clear_extent_bit(tree, start, end, bits, wake, delete, 1307 cached, mask, NULL); 1308 } 1309 1310 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1311 unsigned bits, struct extent_changeset *changeset) 1312 { 1313 /* 1314 * Don't support EXTENT_LOCKED case, same reason as 1315 * set_record_extent_bits(). 1316 */ 1317 BUG_ON(bits & EXTENT_LOCKED); 1318 1319 return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, 1320 changeset); 1321 } 1322 1323 /* 1324 * either insert or lock the state struct between start and end. If part of 1325 * the range is already locked, wait for the conflicting lock to be released and retry; all allocations here use GFP_NOFS.
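 *
 * Illustrative usage (a sketch only; 'inode', 'start' and 'end' are assumed
 * to be supplied by the caller, they are not defined here):
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, end, &cached);
 *	... operate on the locked byte range [start, end] ...
 *	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, end,
 *			     &cached, GFP_NOFS);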
1326 */ 1327 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1328 struct extent_state **cached_state) 1329 { 1330 int err; 1331 u64 failed_start; 1332 1333 while (1) { 1334 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, 1335 EXTENT_LOCKED, &failed_start, 1336 cached_state, GFP_NOFS, NULL); 1337 if (err == -EEXIST) { 1338 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1339 start = failed_start; 1340 } else 1341 break; 1342 WARN_ON(start > end); 1343 } 1344 return err; 1345 } 1346 1347 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1348 { 1349 int err; 1350 u64 failed_start; 1351 1352 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1353 &failed_start, NULL, GFP_NOFS, NULL); 1354 if (err == -EEXIST) { 1355 if (failed_start > start) 1356 clear_extent_bit(tree, start, failed_start - 1, 1357 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 1358 return 0; 1359 } 1360 return 1; 1361 } 1362 1363 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) 1364 { 1365 unsigned long index = start >> PAGE_SHIFT; 1366 unsigned long end_index = end >> PAGE_SHIFT; 1367 struct page *page; 1368 1369 while (index <= end_index) { 1370 page = find_get_page(inode->i_mapping, index); 1371 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1372 clear_page_dirty_for_io(page); 1373 put_page(page); 1374 index++; 1375 } 1376 } 1377 1378 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) 1379 { 1380 unsigned long index = start >> PAGE_SHIFT; 1381 unsigned long end_index = end >> PAGE_SHIFT; 1382 struct page *page; 1383 1384 while (index <= end_index) { 1385 page = find_get_page(inode->i_mapping, index); 1386 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1387 __set_page_dirty_nobuffers(page); 1388 account_page_redirty(page); 1389 put_page(page); 1390 index++; 1391 } 1392 } 1393 1394 /* 1395 * helper function to set both pages and extents in the tree writeback 1396 */ 1397 static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1398 { 1399 unsigned long index = start >> PAGE_SHIFT; 1400 unsigned long end_index = end >> PAGE_SHIFT; 1401 struct page *page; 1402 1403 while (index <= end_index) { 1404 page = find_get_page(tree->mapping, index); 1405 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1406 set_page_writeback(page); 1407 put_page(page); 1408 index++; 1409 } 1410 } 1411 1412 /* find the first state struct with 'bits' set after 'start', and 1413 * return it. tree->lock must be held. NULL will returned if 1414 * nothing was found after 'start' 1415 */ 1416 static struct extent_state * 1417 find_first_extent_bit_state(struct extent_io_tree *tree, 1418 u64 start, unsigned bits) 1419 { 1420 struct rb_node *node; 1421 struct extent_state *state; 1422 1423 /* 1424 * this search will find all the extents that end after 1425 * our range starts. 1426 */ 1427 node = tree_search(tree, start); 1428 if (!node) 1429 goto out; 1430 1431 while (1) { 1432 state = rb_entry(node, struct extent_state, rb_node); 1433 if (state->end >= start && (state->state & bits)) 1434 return state; 1435 1436 node = rb_next(node); 1437 if (!node) 1438 break; 1439 } 1440 out: 1441 return NULL; 1442 } 1443 1444 /* 1445 * find the first offset in the io tree with 'bits' set. zero is 1446 * returned if we find something, and *start_ret and *end_ret are 1447 * set to reflect the state struct that was found. 1448 * 1449 * If nothing was found, 1 is returned. 
If found something, return 0. 1450 */ 1451 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1452 u64 *start_ret, u64 *end_ret, unsigned bits, 1453 struct extent_state **cached_state) 1454 { 1455 struct extent_state *state; 1456 struct rb_node *n; 1457 int ret = 1; 1458 1459 spin_lock(&tree->lock); 1460 if (cached_state && *cached_state) { 1461 state = *cached_state; 1462 if (state->end == start - 1 && extent_state_in_tree(state)) { 1463 n = rb_next(&state->rb_node); 1464 while (n) { 1465 state = rb_entry(n, struct extent_state, 1466 rb_node); 1467 if (state->state & bits) 1468 goto got_it; 1469 n = rb_next(n); 1470 } 1471 free_extent_state(*cached_state); 1472 *cached_state = NULL; 1473 goto out; 1474 } 1475 free_extent_state(*cached_state); 1476 *cached_state = NULL; 1477 } 1478 1479 state = find_first_extent_bit_state(tree, start, bits); 1480 got_it: 1481 if (state) { 1482 cache_state_if_flags(state, cached_state, 0); 1483 *start_ret = state->start; 1484 *end_ret = state->end; 1485 ret = 0; 1486 } 1487 out: 1488 spin_unlock(&tree->lock); 1489 return ret; 1490 } 1491 1492 /* 1493 * find a contiguous range of bytes in the file marked as delalloc, not 1494 * more than 'max_bytes'. start and end are used to return the range, 1495 * 1496 * 1 is returned if we find something, 0 if nothing was in the tree 1497 */ 1498 static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1499 u64 *start, u64 *end, u64 max_bytes, 1500 struct extent_state **cached_state) 1501 { 1502 struct rb_node *node; 1503 struct extent_state *state; 1504 u64 cur_start = *start; 1505 u64 found = 0; 1506 u64 total_bytes = 0; 1507 1508 spin_lock(&tree->lock); 1509 1510 /* 1511 * this search will find all the extents that end after 1512 * our range starts. 1513 */ 1514 node = tree_search(tree, cur_start); 1515 if (!node) { 1516 if (!found) 1517 *end = (u64)-1; 1518 goto out; 1519 } 1520 1521 while (1) { 1522 state = rb_entry(node, struct extent_state, rb_node); 1523 if (found && (state->start != cur_start || 1524 (state->state & EXTENT_BOUNDARY))) { 1525 goto out; 1526 } 1527 if (!(state->state & EXTENT_DELALLOC)) { 1528 if (!found) 1529 *end = state->end; 1530 goto out; 1531 } 1532 if (!found) { 1533 *start = state->start; 1534 *cached_state = state; 1535 atomic_inc(&state->refs); 1536 } 1537 found++; 1538 *end = state->end; 1539 cur_start = state->end + 1; 1540 node = rb_next(node); 1541 total_bytes += state->end - state->start + 1; 1542 if (total_bytes >= max_bytes) 1543 break; 1544 if (!node) 1545 break; 1546 } 1547 out: 1548 spin_unlock(&tree->lock); 1549 return found; 1550 } 1551 1552 static noinline void __unlock_for_delalloc(struct inode *inode, 1553 struct page *locked_page, 1554 u64 start, u64 end) 1555 { 1556 int ret; 1557 struct page *pages[16]; 1558 unsigned long index = start >> PAGE_SHIFT; 1559 unsigned long end_index = end >> PAGE_SHIFT; 1560 unsigned long nr_pages = end_index - index + 1; 1561 int i; 1562 1563 if (index == locked_page->index && end_index == index) 1564 return; 1565 1566 while (nr_pages > 0) { 1567 ret = find_get_pages_contig(inode->i_mapping, index, 1568 min_t(unsigned long, nr_pages, 1569 ARRAY_SIZE(pages)), pages); 1570 for (i = 0; i < ret; i++) { 1571 if (pages[i] != locked_page) 1572 unlock_page(pages[i]); 1573 put_page(pages[i]); 1574 } 1575 nr_pages -= ret; 1576 index += ret; 1577 cond_resched(); 1578 } 1579 } 1580 1581 static noinline int lock_delalloc_pages(struct inode *inode, 1582 struct page *locked_page, 1583 u64 delalloc_start, 1584 u64 delalloc_end) 
1585 { 1586 unsigned long index = delalloc_start >> PAGE_SHIFT; 1587 unsigned long start_index = index; 1588 unsigned long end_index = delalloc_end >> PAGE_SHIFT; 1589 unsigned long pages_locked = 0; 1590 struct page *pages[16]; 1591 unsigned long nrpages; 1592 int ret; 1593 int i; 1594 1595 /* the caller is responsible for locking the start index */ 1596 if (index == locked_page->index && index == end_index) 1597 return 0; 1598 1599 /* skip the page at the start index */ 1600 nrpages = end_index - index + 1; 1601 while (nrpages > 0) { 1602 ret = find_get_pages_contig(inode->i_mapping, index, 1603 min_t(unsigned long, 1604 nrpages, ARRAY_SIZE(pages)), pages); 1605 if (ret == 0) { 1606 ret = -EAGAIN; 1607 goto done; 1608 } 1609 /* now we have an array of pages, lock them all */ 1610 for (i = 0; i < ret; i++) { 1611 /* 1612 * the caller is taking responsibility for 1613 * locked_page 1614 */ 1615 if (pages[i] != locked_page) { 1616 lock_page(pages[i]); 1617 if (!PageDirty(pages[i]) || 1618 pages[i]->mapping != inode->i_mapping) { 1619 ret = -EAGAIN; 1620 unlock_page(pages[i]); 1621 put_page(pages[i]); 1622 goto done; 1623 } 1624 } 1625 put_page(pages[i]); 1626 pages_locked++; 1627 } 1628 nrpages -= ret; 1629 index += ret; 1630 cond_resched(); 1631 } 1632 ret = 0; 1633 done: 1634 if (ret && pages_locked) { 1635 __unlock_for_delalloc(inode, locked_page, 1636 delalloc_start, 1637 ((u64)(start_index + pages_locked - 1)) << 1638 PAGE_SHIFT); 1639 } 1640 return ret; 1641 } 1642 1643 /* 1644 * find a contiguous range of bytes in the file marked as delalloc, not 1645 * more than 'max_bytes'. start and end are used to return the range, 1646 * 1647 * 1 is returned if we find something, 0 if nothing was in the tree 1648 */ 1649 STATIC u64 find_lock_delalloc_range(struct inode *inode, 1650 struct extent_io_tree *tree, 1651 struct page *locked_page, u64 *start, 1652 u64 *end, u64 max_bytes) 1653 { 1654 u64 delalloc_start; 1655 u64 delalloc_end; 1656 u64 found; 1657 struct extent_state *cached_state = NULL; 1658 int ret; 1659 int loops = 0; 1660 1661 again: 1662 /* step one, find a bunch of delalloc bytes starting at start */ 1663 delalloc_start = *start; 1664 delalloc_end = 0; 1665 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1666 max_bytes, &cached_state); 1667 if (!found || delalloc_end <= *start) { 1668 *start = delalloc_start; 1669 *end = delalloc_end; 1670 free_extent_state(cached_state); 1671 return 0; 1672 } 1673 1674 /* 1675 * start comes from the offset of locked_page. 
We have to lock 1676 * pages in order, so we can't process delalloc bytes before 1677 * locked_page 1678 */ 1679 if (delalloc_start < *start) 1680 delalloc_start = *start; 1681 1682 /* 1683 * make sure to limit the number of pages we try to lock down 1684 */ 1685 if (delalloc_end + 1 - delalloc_start > max_bytes) 1686 delalloc_end = delalloc_start + max_bytes - 1; 1687 1688 /* step two, lock all the pages after the page that has start */ 1689 ret = lock_delalloc_pages(inode, locked_page, 1690 delalloc_start, delalloc_end); 1691 if (ret == -EAGAIN) { 1692 /* some of the pages are gone, lets avoid looping by 1693 * shortening the size of the delalloc range we're searching 1694 */ 1695 free_extent_state(cached_state); 1696 cached_state = NULL; 1697 if (!loops) { 1698 max_bytes = PAGE_SIZE; 1699 loops = 1; 1700 goto again; 1701 } else { 1702 found = 0; 1703 goto out_failed; 1704 } 1705 } 1706 BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ 1707 1708 /* step three, lock the state bits for the whole range */ 1709 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); 1710 1711 /* then test to make sure it is all still delalloc */ 1712 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1713 EXTENT_DELALLOC, 1, cached_state); 1714 if (!ret) { 1715 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1716 &cached_state, GFP_NOFS); 1717 __unlock_for_delalloc(inode, locked_page, 1718 delalloc_start, delalloc_end); 1719 cond_resched(); 1720 goto again; 1721 } 1722 free_extent_state(cached_state); 1723 *start = delalloc_start; 1724 *end = delalloc_end; 1725 out_failed: 1726 return found; 1727 } 1728 1729 void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, 1730 u64 delalloc_end, struct page *locked_page, 1731 unsigned clear_bits, 1732 unsigned long page_ops) 1733 { 1734 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 1735 int ret; 1736 struct page *pages[16]; 1737 unsigned long index = start >> PAGE_SHIFT; 1738 unsigned long end_index = end >> PAGE_SHIFT; 1739 unsigned long nr_pages = end_index - index + 1; 1740 int i; 1741 1742 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1743 if (page_ops == 0) 1744 return; 1745 1746 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) 1747 mapping_set_error(inode->i_mapping, -EIO); 1748 1749 while (nr_pages > 0) { 1750 ret = find_get_pages_contig(inode->i_mapping, index, 1751 min_t(unsigned long, 1752 nr_pages, ARRAY_SIZE(pages)), pages); 1753 for (i = 0; i < ret; i++) { 1754 1755 if (page_ops & PAGE_SET_PRIVATE2) 1756 SetPagePrivate2(pages[i]); 1757 1758 if (pages[i] == locked_page) { 1759 put_page(pages[i]); 1760 continue; 1761 } 1762 if (page_ops & PAGE_CLEAR_DIRTY) 1763 clear_page_dirty_for_io(pages[i]); 1764 if (page_ops & PAGE_SET_WRITEBACK) 1765 set_page_writeback(pages[i]); 1766 if (page_ops & PAGE_SET_ERROR) 1767 SetPageError(pages[i]); 1768 if (page_ops & PAGE_END_WRITEBACK) 1769 end_page_writeback(pages[i]); 1770 if (page_ops & PAGE_UNLOCK) 1771 unlock_page(pages[i]); 1772 put_page(pages[i]); 1773 } 1774 nr_pages -= ret; 1775 index += ret; 1776 cond_resched(); 1777 } 1778 } 1779 1780 /* 1781 * count the number of bytes in the tree that have a given bit(s) 1782 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1783 * cached. The total number found is returned. 
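 * With 'contig' set, counting stops at the first gap (or first non-matching
 * state) after a match, so the total covers a single contiguous run.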
1784 */ 1785 u64 count_range_bits(struct extent_io_tree *tree, 1786 u64 *start, u64 search_end, u64 max_bytes, 1787 unsigned bits, int contig) 1788 { 1789 struct rb_node *node; 1790 struct extent_state *state; 1791 u64 cur_start = *start; 1792 u64 total_bytes = 0; 1793 u64 last = 0; 1794 int found = 0; 1795 1796 if (WARN_ON(search_end <= cur_start)) 1797 return 0; 1798 1799 spin_lock(&tree->lock); 1800 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1801 total_bytes = tree->dirty_bytes; 1802 goto out; 1803 } 1804 /* 1805 * this search will find all the extents that end after 1806 * our range starts. 1807 */ 1808 node = tree_search(tree, cur_start); 1809 if (!node) 1810 goto out; 1811 1812 while (1) { 1813 state = rb_entry(node, struct extent_state, rb_node); 1814 if (state->start > search_end) 1815 break; 1816 if (contig && found && state->start > last + 1) 1817 break; 1818 if (state->end >= cur_start && (state->state & bits) == bits) { 1819 total_bytes += min(search_end, state->end) + 1 - 1820 max(cur_start, state->start); 1821 if (total_bytes >= max_bytes) 1822 break; 1823 if (!found) { 1824 *start = max(cur_start, state->start); 1825 found = 1; 1826 } 1827 last = state->end; 1828 } else if (contig && found) { 1829 break; 1830 } 1831 node = rb_next(node); 1832 if (!node) 1833 break; 1834 } 1835 out: 1836 spin_unlock(&tree->lock); 1837 return total_bytes; 1838 } 1839 1840 /* 1841 * set the private field for a given byte offset in the tree. If there isn't 1842 * an extent_state there already, this does nothing. 1843 */ 1844 static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start, 1845 struct io_failure_record *failrec) 1846 { 1847 struct rb_node *node; 1848 struct extent_state *state; 1849 int ret = 0; 1850 1851 spin_lock(&tree->lock); 1852 /* 1853 * this search will find all the extents that end after 1854 * our range starts. 1855 */ 1856 node = tree_search(tree, start); 1857 if (!node) { 1858 ret = -ENOENT; 1859 goto out; 1860 } 1861 state = rb_entry(node, struct extent_state, rb_node); 1862 if (state->start != start) { 1863 ret = -ENOENT; 1864 goto out; 1865 } 1866 state->failrec = failrec; 1867 out: 1868 spin_unlock(&tree->lock); 1869 return ret; 1870 } 1871 1872 static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start, 1873 struct io_failure_record **failrec) 1874 { 1875 struct rb_node *node; 1876 struct extent_state *state; 1877 int ret = 0; 1878 1879 spin_lock(&tree->lock); 1880 /* 1881 * this search will find all the extents that end after 1882 * our range starts. 1883 */ 1884 node = tree_search(tree, start); 1885 if (!node) { 1886 ret = -ENOENT; 1887 goto out; 1888 } 1889 state = rb_entry(node, struct extent_state, rb_node); 1890 if (state->start != start) { 1891 ret = -ENOENT; 1892 goto out; 1893 } 1894 *failrec = state->failrec; 1895 out: 1896 spin_unlock(&tree->lock); 1897 return ret; 1898 } 1899 1900 /* 1901 * searches a range in the state tree for a given mask. 1902 * If 'filled' == 1, this returns 1 only if every extent in the tree 1903 * has the bits set. Otherwise, 1 is returned if any bit in the 1904 * range is found set. 
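 * A cached state that covers 'start' is used to skip the initial rb-tree
 * search.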
1905 */ 1906 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1907 unsigned bits, int filled, struct extent_state *cached) 1908 { 1909 struct extent_state *state = NULL; 1910 struct rb_node *node; 1911 int bitset = 0; 1912 1913 spin_lock(&tree->lock); 1914 if (cached && extent_state_in_tree(cached) && cached->start <= start && 1915 cached->end > start) 1916 node = &cached->rb_node; 1917 else 1918 node = tree_search(tree, start); 1919 while (node && start <= end) { 1920 state = rb_entry(node, struct extent_state, rb_node); 1921 1922 if (filled && state->start > start) { 1923 bitset = 0; 1924 break; 1925 } 1926 1927 if (state->start > end) 1928 break; 1929 1930 if (state->state & bits) { 1931 bitset = 1; 1932 if (!filled) 1933 break; 1934 } else if (filled) { 1935 bitset = 0; 1936 break; 1937 } 1938 1939 if (state->end == (u64)-1) 1940 break; 1941 1942 start = state->end + 1; 1943 if (start > end) 1944 break; 1945 node = rb_next(node); 1946 if (!node) { 1947 if (filled) 1948 bitset = 0; 1949 break; 1950 } 1951 } 1952 spin_unlock(&tree->lock); 1953 return bitset; 1954 } 1955 1956 /* 1957 * helper function to set a given page up to date if all the 1958 * extents in the tree for that page are up to date 1959 */ 1960 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 1961 { 1962 u64 start = page_offset(page); 1963 u64 end = start + PAGE_SIZE - 1; 1964 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1965 SetPageUptodate(page); 1966 } 1967 1968 int free_io_failure(struct inode *inode, struct io_failure_record *rec) 1969 { 1970 int ret; 1971 int err = 0; 1972 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 1973 1974 set_state_failrec(failure_tree, rec->start, NULL); 1975 ret = clear_extent_bits(failure_tree, rec->start, 1976 rec->start + rec->len - 1, 1977 EXTENT_LOCKED | EXTENT_DIRTY); 1978 if (ret) 1979 err = ret; 1980 1981 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1982 rec->start + rec->len - 1, 1983 EXTENT_DAMAGED); 1984 if (ret && !err) 1985 err = ret; 1986 1987 kfree(rec); 1988 return err; 1989 } 1990 1991 /* 1992 * this bypasses the standard btrfs submit functions deliberately, as 1993 * the standard behavior is to write all copies in a raid setup. here we only 1994 * want to write the one bad copy. so we do the mapping for ourselves and issue 1995 * submit_bio directly. 1996 * to avoid any synchronization issues, wait for the data after writing, which 1997 * actually prevents the read that triggered the error from finishing. 1998 * currently, there can be no more than two copies of every data bit. thus, 1999 * exactly one rewrite is required. 
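 * parity-based profiles are not handled here: btrfs_is_parity_mirror()
 * turns this into a no-op (returning 0) for raid56.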
2000 */ 2001 int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, 2002 struct page *page, unsigned int pg_offset, int mirror_num) 2003 { 2004 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2005 struct bio *bio; 2006 struct btrfs_device *dev; 2007 u64 map_length = 0; 2008 u64 sector; 2009 struct btrfs_bio *bbio = NULL; 2010 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 2011 int ret; 2012 2013 ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); 2014 BUG_ON(!mirror_num); 2015 2016 /* we can't repair anything in raid56 yet */ 2017 if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) 2018 return 0; 2019 2020 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2021 if (!bio) 2022 return -EIO; 2023 bio->bi_iter.bi_size = 0; 2024 map_length = length; 2025 2026 /* 2027 * Avoid races with device replace and make sure our bbio has devices 2028 * associated to its stripes that don't go away while we are doing the 2029 * read repair operation. 2030 */ 2031 btrfs_bio_counter_inc_blocked(fs_info); 2032 ret = btrfs_map_block(fs_info, WRITE, logical, 2033 &map_length, &bbio, mirror_num); 2034 if (ret) { 2035 btrfs_bio_counter_dec(fs_info); 2036 bio_put(bio); 2037 return -EIO; 2038 } 2039 BUG_ON(mirror_num != bbio->mirror_num); 2040 sector = bbio->stripes[mirror_num-1].physical >> 9; 2041 bio->bi_iter.bi_sector = sector; 2042 dev = bbio->stripes[mirror_num-1].dev; 2043 btrfs_put_bbio(bbio); 2044 if (!dev || !dev->bdev || !dev->writeable) { 2045 btrfs_bio_counter_dec(fs_info); 2046 bio_put(bio); 2047 return -EIO; 2048 } 2049 bio->bi_bdev = dev->bdev; 2050 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; 2051 bio_add_page(bio, page, length, pg_offset); 2052 2053 if (btrfsic_submit_bio_wait(bio)) { 2054 /* try to remap that extent elsewhere? 
*/ 2055 btrfs_bio_counter_dec(fs_info); 2056 bio_put(bio); 2057 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2058 return -EIO; 2059 } 2060 2061 btrfs_info_rl_in_rcu(fs_info, 2062 "read error corrected: ino %llu off %llu (dev %s sector %llu)", 2063 btrfs_ino(inode), start, 2064 rcu_str_deref(dev->name), sector); 2065 btrfs_bio_counter_dec(fs_info); 2066 bio_put(bio); 2067 return 0; 2068 } 2069 2070 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 2071 int mirror_num) 2072 { 2073 u64 start = eb->start; 2074 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 2075 int ret = 0; 2076 2077 if (root->fs_info->sb->s_flags & MS_RDONLY) 2078 return -EROFS; 2079 2080 for (i = 0; i < num_pages; i++) { 2081 struct page *p = eb->pages[i]; 2082 2083 ret = repair_io_failure(root->fs_info->btree_inode, start, 2084 PAGE_SIZE, start, p, 2085 start - page_offset(p), mirror_num); 2086 if (ret) 2087 break; 2088 start += PAGE_SIZE; 2089 } 2090 2091 return ret; 2092 } 2093 2094 /* 2095 * each time an IO finishes, we do a fast check in the IO failure tree 2096 * to see if we need to process or clean up an io_failure_record 2097 */ 2098 int clean_io_failure(struct inode *inode, u64 start, struct page *page, 2099 unsigned int pg_offset) 2100 { 2101 u64 private; 2102 struct io_failure_record *failrec; 2103 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2104 struct extent_state *state; 2105 int num_copies; 2106 int ret; 2107 2108 private = 0; 2109 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 2110 (u64)-1, 1, EXTENT_DIRTY, 0); 2111 if (!ret) 2112 return 0; 2113 2114 ret = get_state_failrec(&BTRFS_I(inode)->io_failure_tree, start, 2115 &failrec); 2116 if (ret) 2117 return 0; 2118 2119 BUG_ON(!failrec->this_mirror); 2120 2121 if (failrec->in_validation) { 2122 /* there was no real error, just free the record */ 2123 btrfs_debug(fs_info, 2124 "clean_io_failure: freeing dummy error at %llu", 2125 failrec->start); 2126 goto out; 2127 } 2128 if (fs_info->sb->s_flags & MS_RDONLY) 2129 goto out; 2130 2131 spin_lock(&BTRFS_I(inode)->io_tree.lock); 2132 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, 2133 failrec->start, 2134 EXTENT_LOCKED); 2135 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2136 2137 if (state && state->start <= failrec->start && 2138 state->end >= failrec->start + failrec->len - 1) { 2139 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2140 failrec->len); 2141 if (num_copies > 1) { 2142 repair_io_failure(inode, start, failrec->len, 2143 failrec->logical, page, 2144 pg_offset, failrec->failed_mirror); 2145 } 2146 } 2147 2148 out: 2149 free_io_failure(inode, failrec); 2150 2151 return 0; 2152 } 2153 2154 /* 2155 * Can be called when 2156 * - hold extent lock 2157 * - under ordered extent 2158 * - the inode is freeing 2159 */ 2160 void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end) 2161 { 2162 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2163 struct io_failure_record *failrec; 2164 struct extent_state *state, *next; 2165 2166 if (RB_EMPTY_ROOT(&failure_tree->state)) 2167 return; 2168 2169 spin_lock(&failure_tree->lock); 2170 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); 2171 while (state) { 2172 if (state->start > end) 2173 break; 2174 2175 ASSERT(state->end <= end); 2176 2177 next = next_state(state); 2178 2179 failrec = state->failrec; 2180 free_extent_state(state); 2181 kfree(failrec); 2182 2183 state = next; 2184 } 
2185 spin_unlock(&failure_tree->lock); 2186 } 2187 2188 int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, 2189 struct io_failure_record **failrec_ret) 2190 { 2191 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2192 struct io_failure_record *failrec; 2193 struct extent_map *em; 2194 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2195 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2196 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2197 int ret; 2198 u64 logical; 2199 2200 ret = get_state_failrec(failure_tree, start, &failrec); 2201 if (ret) { 2202 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2203 if (!failrec) 2204 return -ENOMEM; 2205 2206 failrec->start = start; 2207 failrec->len = end - start + 1; 2208 failrec->this_mirror = 0; 2209 failrec->bio_flags = 0; 2210 failrec->in_validation = 0; 2211 2212 read_lock(&em_tree->lock); 2213 em = lookup_extent_mapping(em_tree, start, failrec->len); 2214 if (!em) { 2215 read_unlock(&em_tree->lock); 2216 kfree(failrec); 2217 return -EIO; 2218 } 2219 2220 if (em->start > start || em->start + em->len <= start) { 2221 free_extent_map(em); 2222 em = NULL; 2223 } 2224 read_unlock(&em_tree->lock); 2225 if (!em) { 2226 kfree(failrec); 2227 return -EIO; 2228 } 2229 2230 logical = start - em->start; 2231 logical = em->block_start + logical; 2232 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2233 logical = em->block_start; 2234 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2235 extent_set_compress_type(&failrec->bio_flags, 2236 em->compress_type); 2237 } 2238 2239 btrfs_debug(fs_info, 2240 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", 2241 logical, start, failrec->len); 2242 2243 failrec->logical = logical; 2244 free_extent_map(em); 2245 2246 /* set the bits in the private failure tree */ 2247 ret = set_extent_bits(failure_tree, start, end, 2248 EXTENT_LOCKED | EXTENT_DIRTY); 2249 if (ret >= 0) 2250 ret = set_state_failrec(failure_tree, start, failrec); 2251 /* set the bits in the inode's tree */ 2252 if (ret >= 0) 2253 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); 2254 if (ret < 0) { 2255 kfree(failrec); 2256 return ret; 2257 } 2258 } else { 2259 btrfs_debug(fs_info, 2260 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", 2261 failrec->logical, failrec->start, failrec->len, 2262 failrec->in_validation); 2263 /* 2264 * when data can be on disk more than twice, add to failrec here 2265 * (e.g. with a list for failed_mirror) to make 2266 * clean_io_failure() clean all those errors at once. 2267 */ 2268 } 2269 2270 *failrec_ret = failrec; 2271 2272 return 0; 2273 } 2274 2275 int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, 2276 struct io_failure_record *failrec, int failed_mirror) 2277 { 2278 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2279 int num_copies; 2280 2281 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); 2282 if (num_copies == 1) { 2283 /* 2284 * we only have a single copy of the data, so don't bother with 2285 * all the retry and error correction code that follows. no 2286 * matter what the error is, it is very likely to persist. 
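 * (btrfs_num_copies() says this range has only one mirror, so there is
 *  nothing else to read from; returning 0 lets the caller give up and
 *  surface -EIO)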
2287 */ 2288 btrfs_debug(fs_info, 2289 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", 2290 num_copies, failrec->this_mirror, failed_mirror); 2291 return 0; 2292 } 2293 2294 /* 2295 * there are two premises: 2296 * a) deliver good data to the caller 2297 * b) correct the bad sectors on disk 2298 */ 2299 if (failed_bio->bi_vcnt > 1) { 2300 /* 2301 * to fulfill b), we need to know the exact failing sectors, as 2302 * we don't want to rewrite any more than the failed ones. thus, 2303 * we need separate read requests for the failed bio 2304 * 2305 * if the following BUG_ON triggers, our validation request got 2306 * merged. we need separate requests for our algorithm to work. 2307 */ 2308 BUG_ON(failrec->in_validation); 2309 failrec->in_validation = 1; 2310 failrec->this_mirror = failed_mirror; 2311 } else { 2312 /* 2313 * we're ready to fulfill a) and b) alongside. get a good copy 2314 * of the failed sector and if we succeed, we have setup 2315 * everything for repair_io_failure to do the rest for us. 2316 */ 2317 if (failrec->in_validation) { 2318 BUG_ON(failrec->this_mirror != failed_mirror); 2319 failrec->in_validation = 0; 2320 failrec->this_mirror = 0; 2321 } 2322 failrec->failed_mirror = failed_mirror; 2323 failrec->this_mirror++; 2324 if (failrec->this_mirror == failed_mirror) 2325 failrec->this_mirror++; 2326 } 2327 2328 if (failrec->this_mirror > num_copies) { 2329 btrfs_debug(fs_info, 2330 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", 2331 num_copies, failrec->this_mirror, failed_mirror); 2332 return 0; 2333 } 2334 2335 return 1; 2336 } 2337 2338 2339 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, 2340 struct io_failure_record *failrec, 2341 struct page *page, int pg_offset, int icsum, 2342 bio_end_io_t *endio_func, void *data) 2343 { 2344 struct bio *bio; 2345 struct btrfs_io_bio *btrfs_failed_bio; 2346 struct btrfs_io_bio *btrfs_bio; 2347 2348 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2349 if (!bio) 2350 return NULL; 2351 2352 bio->bi_end_io = endio_func; 2353 bio->bi_iter.bi_sector = failrec->logical >> 9; 2354 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 2355 bio->bi_iter.bi_size = 0; 2356 bio->bi_private = data; 2357 2358 btrfs_failed_bio = btrfs_io_bio(failed_bio); 2359 if (btrfs_failed_bio->csum) { 2360 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2361 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 2362 2363 btrfs_bio = btrfs_io_bio(bio); 2364 btrfs_bio->csum = btrfs_bio->csum_inline; 2365 icsum *= csum_size; 2366 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum, 2367 csum_size); 2368 } 2369 2370 bio_add_page(bio, page, failrec->len, pg_offset); 2371 2372 return bio; 2373 } 2374 2375 /* 2376 * this is a generic handler for readpage errors (default 2377 * readpage_io_failed_hook). if other copies exist, read those and write back 2378 * good data to the failed position. 
does not investigate in remapping the 2379 * failed extent elsewhere, hoping the device will be smart enough to do this as 2380 * needed 2381 */ 2382 2383 static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, 2384 struct page *page, u64 start, u64 end, 2385 int failed_mirror) 2386 { 2387 struct io_failure_record *failrec; 2388 struct inode *inode = page->mapping->host; 2389 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2390 struct bio *bio; 2391 int read_mode = 0; 2392 int ret; 2393 2394 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2395 2396 ret = btrfs_get_io_failure_record(inode, start, end, &failrec); 2397 if (ret) 2398 return ret; 2399 2400 ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror); 2401 if (!ret) { 2402 free_io_failure(inode, failrec); 2403 return -EIO; 2404 } 2405 2406 if (failed_bio->bi_vcnt > 1) 2407 read_mode |= REQ_FAILFAST_DEV; 2408 2409 phy_offset >>= inode->i_sb->s_blocksize_bits; 2410 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 2411 start - page_offset(page), 2412 (int)phy_offset, failed_bio->bi_end_io, 2413 NULL); 2414 if (!bio) { 2415 free_io_failure(inode, failrec); 2416 return -EIO; 2417 } 2418 bio_set_op_attrs(bio, REQ_OP_READ, read_mode); 2419 2420 btrfs_debug(btrfs_sb(inode->i_sb), 2421 "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", 2422 read_mode, failrec->this_mirror, failrec->in_validation); 2423 2424 ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, 2425 failrec->bio_flags, 0); 2426 if (ret) { 2427 free_io_failure(inode, failrec); 2428 bio_put(bio); 2429 } 2430 2431 return ret; 2432 } 2433 2434 /* lots and lots of room for performance fixes in the end_bio funcs */ 2435 2436 void end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2437 { 2438 int uptodate = (err == 0); 2439 struct extent_io_tree *tree; 2440 int ret = 0; 2441 2442 tree = &BTRFS_I(page->mapping->host)->io_tree; 2443 2444 if (tree->ops && tree->ops->writepage_end_io_hook) { 2445 ret = tree->ops->writepage_end_io_hook(page, start, 2446 end, NULL, uptodate); 2447 if (ret) 2448 uptodate = 0; 2449 } 2450 2451 if (!uptodate) { 2452 ClearPageUptodate(page); 2453 SetPageError(page); 2454 ret = ret < 0 ? ret : -EIO; 2455 mapping_set_error(page->mapping, ret); 2456 } 2457 } 2458 2459 /* 2460 * after a writepage IO is done, we need to: 2461 * clear the uptodate bits on error 2462 * clear the writeback bits in the extent tree for this IO 2463 * end_page_writeback if the page has no more pending IO 2464 * 2465 * Scheduling is not allowed, so the extent state tree is expected 2466 * to have one and only one object corresponding to this IO. 2467 */ 2468 static void end_bio_extent_writepage(struct bio *bio) 2469 { 2470 struct bio_vec *bvec; 2471 u64 start; 2472 u64 end; 2473 int i; 2474 2475 bio_for_each_segment_all(bvec, bio, i) { 2476 struct page *page = bvec->bv_page; 2477 2478 /* We always issue full-page reads, but if some block 2479 * in a page fails to read, blk_update_request() will 2480 * advance bv_offset and adjust bv_len to compensate. 2481 * Print a warning for nonzero offsets, and an error 2482 * if they don't add up to a full page. 
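 * ("partial page write" below means the vec stops short of the end of the
 *  page and is reported with btrfs_err; "incomplete page write" means it
 *  reaches the end of the page but starts past offset 0 and is only
 *  reported with btrfs_info)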
*/ 2483 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2484 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2485 btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, 2486 "partial page write in btrfs with offset %u and length %u", 2487 bvec->bv_offset, bvec->bv_len); 2488 else 2489 btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, 2490 "incomplete page write in btrfs with offset %u and length %u", 2491 bvec->bv_offset, bvec->bv_len); 2492 } 2493 2494 start = page_offset(page); 2495 end = start + bvec->bv_offset + bvec->bv_len - 1; 2496 2497 end_extent_writepage(page, bio->bi_error, start, end); 2498 end_page_writeback(page); 2499 } 2500 2501 bio_put(bio); 2502 } 2503 2504 static void 2505 endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, 2506 int uptodate) 2507 { 2508 struct extent_state *cached = NULL; 2509 u64 end = start + len - 1; 2510 2511 if (uptodate && tree->track_uptodate) 2512 set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); 2513 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2514 } 2515 2516 /* 2517 * after a readpage IO is done, we need to: 2518 * clear the uptodate bits on error 2519 * set the uptodate bits if things worked 2520 * set the page up to date if all extents in the tree are uptodate 2521 * clear the lock bit in the extent tree 2522 * unlock the page if there are no other extents locked for it 2523 * 2524 * Scheduling is not allowed, so the extent state tree is expected 2525 * to have one and only one object corresponding to this IO. 2526 */ 2527 static void end_bio_extent_readpage(struct bio *bio) 2528 { 2529 struct bio_vec *bvec; 2530 int uptodate = !bio->bi_error; 2531 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2532 struct extent_io_tree *tree; 2533 u64 offset = 0; 2534 u64 start; 2535 u64 end; 2536 u64 len; 2537 u64 extent_start = 0; 2538 u64 extent_len = 0; 2539 int mirror; 2540 int ret; 2541 int i; 2542 2543 bio_for_each_segment_all(bvec, bio, i) { 2544 struct page *page = bvec->bv_page; 2545 struct inode *inode = page->mapping->host; 2546 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2547 2548 btrfs_debug(fs_info, 2549 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", 2550 (u64)bio->bi_iter.bi_sector, bio->bi_error, 2551 io_bio->mirror_num); 2552 tree = &BTRFS_I(inode)->io_tree; 2553 2554 /* We always issue full-page reads, but if some block 2555 * in a page fails to read, blk_update_request() will 2556 * advance bv_offset and adjust bv_len to compensate. 2557 * Print a warning for nonzero offsets, and an error 2558 * if they don't add up to a full page. 
*/ 2559 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2560 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2561 btrfs_err(fs_info, 2562 "partial page read in btrfs with offset %u and length %u", 2563 bvec->bv_offset, bvec->bv_len); 2564 else 2565 btrfs_info(fs_info, 2566 "incomplete page read in btrfs with offset %u and length %u", 2567 bvec->bv_offset, bvec->bv_len); 2568 } 2569 2570 start = page_offset(page); 2571 end = start + bvec->bv_offset + bvec->bv_len - 1; 2572 len = bvec->bv_len; 2573 2574 mirror = io_bio->mirror_num; 2575 if (likely(uptodate && tree->ops && 2576 tree->ops->readpage_end_io_hook)) { 2577 ret = tree->ops->readpage_end_io_hook(io_bio, offset, 2578 page, start, end, 2579 mirror); 2580 if (ret) 2581 uptodate = 0; 2582 else 2583 clean_io_failure(inode, start, page, 0); 2584 } 2585 2586 if (likely(uptodate)) 2587 goto readpage_ok; 2588 2589 if (tree->ops && tree->ops->readpage_io_failed_hook) { 2590 ret = tree->ops->readpage_io_failed_hook(page, mirror); 2591 if (!ret && !bio->bi_error) 2592 uptodate = 1; 2593 } else { 2594 /* 2595 * The generic bio_readpage_error handles errors the 2596 * following way: If possible, new read requests are 2597 * created and submitted and will end up in 2598 * end_bio_extent_readpage as well (if we're lucky, not 2599 * in the !uptodate case). In that case it returns 0 and 2600 * we just go on with the next page in our bio. If it 2601 * can't handle the error it will return -EIO and we 2602 * remain responsible for that page. 2603 */ 2604 ret = bio_readpage_error(bio, offset, page, start, end, 2605 mirror); 2606 if (ret == 0) { 2607 uptodate = !bio->bi_error; 2608 offset += len; 2609 continue; 2610 } 2611 } 2612 readpage_ok: 2613 if (likely(uptodate)) { 2614 loff_t i_size = i_size_read(inode); 2615 pgoff_t end_index = i_size >> PAGE_SHIFT; 2616 unsigned off; 2617 2618 /* Zero out the end if this page straddles i_size */ 2619 off = i_size & (PAGE_SIZE-1); 2620 if (page->index == end_index && off) 2621 zero_user_segment(page, off, PAGE_SIZE); 2622 SetPageUptodate(page); 2623 } else { 2624 ClearPageUptodate(page); 2625 SetPageError(page); 2626 } 2627 unlock_page(page); 2628 offset += len; 2629 2630 if (unlikely(!uptodate)) { 2631 if (extent_len) { 2632 endio_readpage_release_extent(tree, 2633 extent_start, 2634 extent_len, 1); 2635 extent_start = 0; 2636 extent_len = 0; 2637 } 2638 endio_readpage_release_extent(tree, start, 2639 end - start + 1, 0); 2640 } else if (!extent_len) { 2641 extent_start = start; 2642 extent_len = end + 1 - start; 2643 } else if (extent_start + extent_len == start) { 2644 extent_len += end + 1 - start; 2645 } else { 2646 endio_readpage_release_extent(tree, extent_start, 2647 extent_len, uptodate); 2648 extent_start = start; 2649 extent_len = end + 1 - start; 2650 } 2651 } 2652 2653 if (extent_len) 2654 endio_readpage_release_extent(tree, extent_start, extent_len, 2655 uptodate); 2656 if (io_bio->end_io) 2657 io_bio->end_io(io_bio, bio->bi_error); 2658 bio_put(bio); 2659 } 2660 2661 /* 2662 * this allocates from the btrfs_bioset. 
We're returning a bio right now 2663 * but you can call btrfs_io_bio for the appropriate container_of magic 2664 */ 2665 struct bio * 2666 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 2667 gfp_t gfp_flags) 2668 { 2669 struct btrfs_io_bio *btrfs_bio; 2670 struct bio *bio; 2671 2672 bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); 2673 2674 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 2675 while (!bio && (nr_vecs /= 2)) { 2676 bio = bio_alloc_bioset(gfp_flags, 2677 nr_vecs, btrfs_bioset); 2678 } 2679 } 2680 2681 if (bio) { 2682 bio->bi_bdev = bdev; 2683 bio->bi_iter.bi_sector = first_sector; 2684 btrfs_bio = btrfs_io_bio(bio); 2685 btrfs_bio->csum = NULL; 2686 btrfs_bio->csum_allocated = NULL; 2687 btrfs_bio->end_io = NULL; 2688 } 2689 return bio; 2690 } 2691 2692 struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) 2693 { 2694 struct btrfs_io_bio *btrfs_bio; 2695 struct bio *new; 2696 2697 new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); 2698 if (new) { 2699 btrfs_bio = btrfs_io_bio(new); 2700 btrfs_bio->csum = NULL; 2701 btrfs_bio->csum_allocated = NULL; 2702 btrfs_bio->end_io = NULL; 2703 } 2704 return new; 2705 } 2706 2707 /* this also allocates from the btrfs_bioset */ 2708 struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) 2709 { 2710 struct btrfs_io_bio *btrfs_bio; 2711 struct bio *bio; 2712 2713 bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); 2714 if (bio) { 2715 btrfs_bio = btrfs_io_bio(bio); 2716 btrfs_bio->csum = NULL; 2717 btrfs_bio->csum_allocated = NULL; 2718 btrfs_bio->end_io = NULL; 2719 } 2720 return bio; 2721 } 2722 2723 2724 static int __must_check submit_one_bio(struct bio *bio, int mirror_num, 2725 unsigned long bio_flags) 2726 { 2727 int ret = 0; 2728 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2729 struct page *page = bvec->bv_page; 2730 struct extent_io_tree *tree = bio->bi_private; 2731 u64 start; 2732 2733 start = page_offset(page) + bvec->bv_offset; 2734 2735 bio->bi_private = NULL; 2736 bio_get(bio); 2737 2738 if (tree->ops && tree->ops->submit_bio_hook) 2739 ret = tree->ops->submit_bio_hook(page->mapping->host, bio, 2740 mirror_num, bio_flags, start); 2741 else 2742 btrfsic_submit_bio(bio); 2743 2744 bio_put(bio); 2745 return ret; 2746 } 2747 2748 static int merge_bio(struct extent_io_tree *tree, struct page *page, 2749 unsigned long offset, size_t size, struct bio *bio, 2750 unsigned long bio_flags) 2751 { 2752 int ret = 0; 2753 if (tree->ops && tree->ops->merge_bio_hook) 2754 ret = tree->ops->merge_bio_hook(page, offset, size, bio, 2755 bio_flags); 2756 return ret; 2757 2758 } 2759 2760 static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, 2761 struct writeback_control *wbc, 2762 struct page *page, sector_t sector, 2763 size_t size, unsigned long offset, 2764 struct block_device *bdev, 2765 struct bio **bio_ret, 2766 unsigned long max_pages, 2767 bio_end_io_t end_io_func, 2768 int mirror_num, 2769 unsigned long prev_bio_flags, 2770 unsigned long bio_flags, 2771 bool force_bio_submit) 2772 { 2773 int ret = 0; 2774 struct bio *bio; 2775 int contig = 0; 2776 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 2777 size_t page_size = min_t(size_t, size, PAGE_SIZE); 2778 2779 if (bio_ret && *bio_ret) { 2780 bio = *bio_ret; 2781 if (old_compressed) 2782 contig = bio->bi_iter.bi_sector == sector; 2783 else 2784 contig = bio_end_sector(bio) == sector; 2785 2786 if (prev_bio_flags != bio_flags || !contig || 2787 force_bio_submit || 
2788 merge_bio(tree, page, offset, page_size, bio, bio_flags) || 2789 bio_add_page(bio, page, page_size, offset) < page_size) { 2790 ret = submit_one_bio(bio, mirror_num, prev_bio_flags); 2791 if (ret < 0) { 2792 *bio_ret = NULL; 2793 return ret; 2794 } 2795 bio = NULL; 2796 } else { 2797 if (wbc) 2798 wbc_account_io(wbc, page, page_size); 2799 return 0; 2800 } 2801 } 2802 2803 bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES, 2804 GFP_NOFS | __GFP_HIGH); 2805 if (!bio) 2806 return -ENOMEM; 2807 2808 bio_add_page(bio, page, page_size, offset); 2809 bio->bi_end_io = end_io_func; 2810 bio->bi_private = tree; 2811 bio_set_op_attrs(bio, op, op_flags); 2812 if (wbc) { 2813 wbc_init_bio(wbc, bio); 2814 wbc_account_io(wbc, page, page_size); 2815 } 2816 2817 if (bio_ret) 2818 *bio_ret = bio; 2819 else 2820 ret = submit_one_bio(bio, mirror_num, bio_flags); 2821 2822 return ret; 2823 } 2824 2825 static void attach_extent_buffer_page(struct extent_buffer *eb, 2826 struct page *page) 2827 { 2828 if (!PagePrivate(page)) { 2829 SetPagePrivate(page); 2830 get_page(page); 2831 set_page_private(page, (unsigned long)eb); 2832 } else { 2833 WARN_ON(page->private != (unsigned long)eb); 2834 } 2835 } 2836 2837 void set_page_extent_mapped(struct page *page) 2838 { 2839 if (!PagePrivate(page)) { 2840 SetPagePrivate(page); 2841 get_page(page); 2842 set_page_private(page, EXTENT_PAGE_PRIVATE); 2843 } 2844 } 2845 2846 static struct extent_map * 2847 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, 2848 u64 start, u64 len, get_extent_t *get_extent, 2849 struct extent_map **em_cached) 2850 { 2851 struct extent_map *em; 2852 2853 if (em_cached && *em_cached) { 2854 em = *em_cached; 2855 if (extent_map_in_tree(em) && start >= em->start && 2856 start < extent_map_end(em)) { 2857 atomic_inc(&em->refs); 2858 return em; 2859 } 2860 2861 free_extent_map(em); 2862 *em_cached = NULL; 2863 } 2864 2865 em = get_extent(inode, page, pg_offset, start, len, 0); 2866 if (em_cached && !IS_ERR_OR_NULL(em)) { 2867 BUG_ON(*em_cached); 2868 atomic_inc(&em->refs); 2869 *em_cached = em; 2870 } 2871 return em; 2872 } 2873 /* 2874 * basic readpage implementation. 
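 * It walks the page one extent map at a time: holes are zeroed in place,
 * ranges already flagged EXTENT_UPTODATE are skipped, inline extents that
 * are not uptodate are treated as errors, and everything else is handed to
 * submit_extent_page() as a read bio.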
Locked extent state structs are inserted 2875 * into the tree that are removed when the IO is done (by the end_io 2876 * handlers) 2877 * XXX JDM: This needs looking at to ensure proper page locking 2878 * return 0 on success, otherwise return error 2879 */ 2880 static int __do_readpage(struct extent_io_tree *tree, 2881 struct page *page, 2882 get_extent_t *get_extent, 2883 struct extent_map **em_cached, 2884 struct bio **bio, int mirror_num, 2885 unsigned long *bio_flags, int read_flags, 2886 u64 *prev_em_start) 2887 { 2888 struct inode *inode = page->mapping->host; 2889 u64 start = page_offset(page); 2890 u64 page_end = start + PAGE_SIZE - 1; 2891 u64 end; 2892 u64 cur = start; 2893 u64 extent_offset; 2894 u64 last_byte = i_size_read(inode); 2895 u64 block_start; 2896 u64 cur_end; 2897 sector_t sector; 2898 struct extent_map *em; 2899 struct block_device *bdev; 2900 int ret = 0; 2901 int nr = 0; 2902 size_t pg_offset = 0; 2903 size_t iosize; 2904 size_t disk_io_size; 2905 size_t blocksize = inode->i_sb->s_blocksize; 2906 unsigned long this_bio_flag = 0; 2907 2908 set_page_extent_mapped(page); 2909 2910 end = page_end; 2911 if (!PageUptodate(page)) { 2912 if (cleancache_get_page(page) == 0) { 2913 BUG_ON(blocksize != PAGE_SIZE); 2914 unlock_extent(tree, start, end); 2915 goto out; 2916 } 2917 } 2918 2919 if (page->index == last_byte >> PAGE_SHIFT) { 2920 char *userpage; 2921 size_t zero_offset = last_byte & (PAGE_SIZE - 1); 2922 2923 if (zero_offset) { 2924 iosize = PAGE_SIZE - zero_offset; 2925 userpage = kmap_atomic(page); 2926 memset(userpage + zero_offset, 0, iosize); 2927 flush_dcache_page(page); 2928 kunmap_atomic(userpage); 2929 } 2930 } 2931 while (cur <= end) { 2932 unsigned long pnr = (last_byte >> PAGE_SHIFT) + 1; 2933 bool force_bio_submit = false; 2934 2935 if (cur >= last_byte) { 2936 char *userpage; 2937 struct extent_state *cached = NULL; 2938 2939 iosize = PAGE_SIZE - pg_offset; 2940 userpage = kmap_atomic(page); 2941 memset(userpage + pg_offset, 0, iosize); 2942 flush_dcache_page(page); 2943 kunmap_atomic(userpage); 2944 set_extent_uptodate(tree, cur, cur + iosize - 1, 2945 &cached, GFP_NOFS); 2946 unlock_extent_cached(tree, cur, 2947 cur + iosize - 1, 2948 &cached, GFP_NOFS); 2949 break; 2950 } 2951 em = __get_extent_map(inode, page, pg_offset, cur, 2952 end - cur + 1, get_extent, em_cached); 2953 if (IS_ERR_OR_NULL(em)) { 2954 SetPageError(page); 2955 unlock_extent(tree, cur, end); 2956 break; 2957 } 2958 extent_offset = cur - em->start; 2959 BUG_ON(extent_map_end(em) <= cur); 2960 BUG_ON(end < cur); 2961 2962 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2963 this_bio_flag |= EXTENT_BIO_COMPRESSED; 2964 extent_set_compress_type(&this_bio_flag, 2965 em->compress_type); 2966 } 2967 2968 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2969 cur_end = min(extent_map_end(em) - 1, end); 2970 iosize = ALIGN(iosize, blocksize); 2971 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2972 disk_io_size = em->block_len; 2973 sector = em->block_start >> 9; 2974 } else { 2975 sector = (em->block_start + extent_offset) >> 9; 2976 disk_io_size = iosize; 2977 } 2978 bdev = em->bdev; 2979 block_start = em->block_start; 2980 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2981 block_start = EXTENT_MAP_HOLE; 2982 2983 /* 2984 * If we have a file range that points to a compressed extent 2985 * and it's followed by a consecutive file range that points to 2986 * to the same compressed extent (possibly with a different 2987 * offset and/or length, so it either points to the whole 
extent 2988 * or only part of it), we must make sure we do not submit a 2989 * single bio to populate the pages for the 2 ranges because 2990 * this makes the compressed extent read zero out the pages 2991 * belonging to the 2nd range. Imagine the following scenario: 2992 * 2993 * File layout 2994 * [0 - 8K] [8K - 24K] 2995 * | | 2996 * | | 2997 * points to extent X, points to extent X, 2998 * offset 4K, length of 8K offset 0, length 16K 2999 * 3000 * [extent X, compressed length = 4K uncompressed length = 16K] 3001 * 3002 * If the bio to read the compressed extent covers both ranges, 3003 * it will decompress extent X into the pages belonging to the 3004 * first range and then it will stop, zeroing out the remaining 3005 * pages that belong to the other range that points to extent X. 3006 * So here we make sure we submit 2 bios, one for the first 3007 * range and another one for the second range. Both will target 3008 * the same physical extent from disk, but we can't currently 3009 * make the compressed bio endio callback populate the pages 3010 * for both ranges because each compressed bio is tightly 3011 * coupled with a single extent map, and each range can have 3012 * an extent map with a different offset value relative to the 3013 * uncompressed data of our extent and different lengths. This 3014 * is a corner case so we prioritize correctness over 3015 * non-optimal behavior (submitting 2 bios for the same extent). 3016 */ 3017 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && 3018 prev_em_start && *prev_em_start != (u64)-1 && 3019 *prev_em_start != em->orig_start) 3020 force_bio_submit = true; 3021 3022 if (prev_em_start) 3023 *prev_em_start = em->orig_start; 3024 3025 free_extent_map(em); 3026 em = NULL; 3027 3028 /* we've found a hole, just zero and go on */ 3029 if (block_start == EXTENT_MAP_HOLE) { 3030 char *userpage; 3031 struct extent_state *cached = NULL; 3032 3033 userpage = kmap_atomic(page); 3034 memset(userpage + pg_offset, 0, iosize); 3035 flush_dcache_page(page); 3036 kunmap_atomic(userpage); 3037 3038 set_extent_uptodate(tree, cur, cur + iosize - 1, 3039 &cached, GFP_NOFS); 3040 unlock_extent_cached(tree, cur, 3041 cur + iosize - 1, 3042 &cached, GFP_NOFS); 3043 cur = cur + iosize; 3044 pg_offset += iosize; 3045 continue; 3046 } 3047 /* the get_extent function already copied into the page */ 3048 if (test_range_bit(tree, cur, cur_end, 3049 EXTENT_UPTODATE, 1, NULL)) { 3050 check_page_uptodate(tree, page); 3051 unlock_extent(tree, cur, cur + iosize - 1); 3052 cur = cur + iosize; 3053 pg_offset += iosize; 3054 continue; 3055 } 3056 /* we have an inline extent but it didn't get marked up 3057 * to date.
Error out 3058 */ 3059 if (block_start == EXTENT_MAP_INLINE) { 3060 SetPageError(page); 3061 unlock_extent(tree, cur, cur + iosize - 1); 3062 cur = cur + iosize; 3063 pg_offset += iosize; 3064 continue; 3065 } 3066 3067 pnr -= page->index; 3068 ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL, 3069 page, sector, disk_io_size, pg_offset, 3070 bdev, bio, pnr, 3071 end_bio_extent_readpage, mirror_num, 3072 *bio_flags, 3073 this_bio_flag, 3074 force_bio_submit); 3075 if (!ret) { 3076 nr++; 3077 *bio_flags = this_bio_flag; 3078 } else { 3079 SetPageError(page); 3080 unlock_extent(tree, cur, cur + iosize - 1); 3081 goto out; 3082 } 3083 cur = cur + iosize; 3084 pg_offset += iosize; 3085 } 3086 out: 3087 if (!nr) { 3088 if (!PageError(page)) 3089 SetPageUptodate(page); 3090 unlock_page(page); 3091 } 3092 return ret; 3093 } 3094 3095 static inline void __do_contiguous_readpages(struct extent_io_tree *tree, 3096 struct page *pages[], int nr_pages, 3097 u64 start, u64 end, 3098 get_extent_t *get_extent, 3099 struct extent_map **em_cached, 3100 struct bio **bio, int mirror_num, 3101 unsigned long *bio_flags, 3102 u64 *prev_em_start) 3103 { 3104 struct inode *inode; 3105 struct btrfs_ordered_extent *ordered; 3106 int index; 3107 3108 inode = pages[0]->mapping->host; 3109 while (1) { 3110 lock_extent(tree, start, end); 3111 ordered = btrfs_lookup_ordered_range(inode, start, 3112 end - start + 1); 3113 if (!ordered) 3114 break; 3115 unlock_extent(tree, start, end); 3116 btrfs_start_ordered_extent(inode, ordered, 1); 3117 btrfs_put_ordered_extent(ordered); 3118 } 3119 3120 for (index = 0; index < nr_pages; index++) { 3121 __do_readpage(tree, pages[index], get_extent, em_cached, bio, 3122 mirror_num, bio_flags, 0, prev_em_start); 3123 put_page(pages[index]); 3124 } 3125 } 3126 3127 static void __extent_readpages(struct extent_io_tree *tree, 3128 struct page *pages[], 3129 int nr_pages, get_extent_t *get_extent, 3130 struct extent_map **em_cached, 3131 struct bio **bio, int mirror_num, 3132 unsigned long *bio_flags, 3133 u64 *prev_em_start) 3134 { 3135 u64 start = 0; 3136 u64 end = 0; 3137 u64 page_start; 3138 int index; 3139 int first_index = 0; 3140 3141 for (index = 0; index < nr_pages; index++) { 3142 page_start = page_offset(pages[index]); 3143 if (!end) { 3144 start = page_start; 3145 end = start + PAGE_SIZE - 1; 3146 first_index = index; 3147 } else if (end + 1 == page_start) { 3148 end += PAGE_SIZE; 3149 } else { 3150 __do_contiguous_readpages(tree, &pages[first_index], 3151 index - first_index, start, 3152 end, get_extent, em_cached, 3153 bio, mirror_num, bio_flags, 3154 prev_em_start); 3155 start = page_start; 3156 end = start + PAGE_SIZE - 1; 3157 first_index = index; 3158 } 3159 } 3160 3161 if (end) 3162 __do_contiguous_readpages(tree, &pages[first_index], 3163 index - first_index, start, 3164 end, get_extent, em_cached, bio, 3165 mirror_num, bio_flags, 3166 prev_em_start); 3167 } 3168 3169 static int __extent_read_full_page(struct extent_io_tree *tree, 3170 struct page *page, 3171 get_extent_t *get_extent, 3172 struct bio **bio, int mirror_num, 3173 unsigned long *bio_flags, int read_flags) 3174 { 3175 struct inode *inode = page->mapping->host; 3176 struct btrfs_ordered_extent *ordered; 3177 u64 start = page_offset(page); 3178 u64 end = start + PAGE_SIZE - 1; 3179 int ret; 3180 3181 while (1) { 3182 lock_extent(tree, start, end); 3183 ordered = btrfs_lookup_ordered_range(inode, start, 3184 PAGE_SIZE); 3185 if (!ordered) 3186 break; 3187 unlock_extent(tree, start, end); 3188 
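/* an ordered extent overlaps this page: wait for it to complete, then retry taking the extent lock */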
btrfs_start_ordered_extent(inode, ordered, 1); 3189 btrfs_put_ordered_extent(ordered); 3190 } 3191 3192 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, 3193 bio_flags, read_flags, NULL); 3194 return ret; 3195 } 3196 3197 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 3198 get_extent_t *get_extent, int mirror_num) 3199 { 3200 struct bio *bio = NULL; 3201 unsigned long bio_flags = 0; 3202 int ret; 3203 3204 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 3205 &bio_flags, 0); 3206 if (bio) 3207 ret = submit_one_bio(bio, mirror_num, bio_flags); 3208 return ret; 3209 } 3210 3211 static void update_nr_written(struct page *page, struct writeback_control *wbc, 3212 unsigned long nr_written) 3213 { 3214 wbc->nr_to_write -= nr_written; 3215 } 3216 3217 /* 3218 * helper for __extent_writepage, doing all of the delayed allocation setup. 3219 * 3220 * This returns 1 if our fill_delalloc function did all the work required 3221 * to write the page (copy into inline extent). In this case the IO has 3222 * been started and the page is already unlocked. 3223 * 3224 * This returns 0 if all went well (page still locked) 3225 * This returns < 0 if there were errors (page still locked) 3226 */ 3227 static noinline_for_stack int writepage_delalloc(struct inode *inode, 3228 struct page *page, struct writeback_control *wbc, 3229 struct extent_page_data *epd, 3230 u64 delalloc_start, 3231 unsigned long *nr_written) 3232 { 3233 struct extent_io_tree *tree = epd->tree; 3234 u64 page_end = delalloc_start + PAGE_SIZE - 1; 3235 u64 nr_delalloc; 3236 u64 delalloc_to_write = 0; 3237 u64 delalloc_end = 0; 3238 int ret; 3239 int page_started = 0; 3240 3241 if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) 3242 return 0; 3243 3244 while (delalloc_end < page_end) { 3245 nr_delalloc = find_lock_delalloc_range(inode, tree, 3246 page, 3247 &delalloc_start, 3248 &delalloc_end, 3249 BTRFS_MAX_EXTENT_SIZE); 3250 if (nr_delalloc == 0) { 3251 delalloc_start = delalloc_end + 1; 3252 continue; 3253 } 3254 ret = tree->ops->fill_delalloc(inode, page, 3255 delalloc_start, 3256 delalloc_end, 3257 &page_started, 3258 nr_written); 3259 /* File system has been set read-only */ 3260 if (ret) { 3261 SetPageError(page); 3262 /* fill_delalloc should be return < 0 for error 3263 * but just in case, we use > 0 here meaning the 3264 * IO is started, so we don't want to return > 0 3265 * unless things are going well. 3266 */ 3267 ret = ret < 0 ? ret : -EIO; 3268 goto done; 3269 } 3270 /* 3271 * delalloc_end is already one less than the total length, so 3272 * we don't subtract one from PAGE_SIZE 3273 */ 3274 delalloc_to_write += (delalloc_end - delalloc_start + 3275 PAGE_SIZE) >> PAGE_SHIFT; 3276 delalloc_start = delalloc_end + 1; 3277 } 3278 if (wbc->nr_to_write < delalloc_to_write) { 3279 int thresh = 8192; 3280 3281 if (delalloc_to_write < thresh * 2) 3282 thresh = delalloc_to_write; 3283 wbc->nr_to_write = min_t(u64, delalloc_to_write, 3284 thresh); 3285 } 3286 3287 /* did the fill delalloc function already unlock and start 3288 * the IO? 3289 */ 3290 if (page_started) { 3291 /* 3292 * we've unlocked the page, so we can't update 3293 * the mapping's writeback index, just update 3294 * nr_to_write. 3295 */ 3296 wbc->nr_to_write -= *nr_written; 3297 return 1; 3298 } 3299 3300 ret = 0; 3301 3302 done: 3303 return ret; 3304 } 3305 3306 /* 3307 * helper for __extent_writepage. 
This calls the writepage start hooks, 3308 * and does the loop to map the page into extents and bios. 3309 * 3310 * We return 1 if the IO is started and the page is unlocked, 3311 * 0 if all went well (page still locked) 3312 * < 0 if there were errors (page still locked) 3313 */ 3314 static noinline_for_stack int __extent_writepage_io(struct inode *inode, 3315 struct page *page, 3316 struct writeback_control *wbc, 3317 struct extent_page_data *epd, 3318 loff_t i_size, 3319 unsigned long nr_written, 3320 int write_flags, int *nr_ret) 3321 { 3322 struct extent_io_tree *tree = epd->tree; 3323 u64 start = page_offset(page); 3324 u64 page_end = start + PAGE_SIZE - 1; 3325 u64 end; 3326 u64 cur = start; 3327 u64 extent_offset; 3328 u64 block_start; 3329 u64 iosize; 3330 sector_t sector; 3331 struct extent_state *cached_state = NULL; 3332 struct extent_map *em; 3333 struct block_device *bdev; 3334 size_t pg_offset = 0; 3335 size_t blocksize; 3336 int ret = 0; 3337 int nr = 0; 3338 bool compressed; 3339 3340 if (tree->ops && tree->ops->writepage_start_hook) { 3341 ret = tree->ops->writepage_start_hook(page, start, 3342 page_end); 3343 if (ret) { 3344 /* Fixup worker will requeue */ 3345 if (ret == -EBUSY) 3346 wbc->pages_skipped++; 3347 else 3348 redirty_page_for_writepage(wbc, page); 3349 3350 update_nr_written(page, wbc, nr_written); 3351 unlock_page(page); 3352 ret = 1; 3353 goto done_unlocked; 3354 } 3355 } 3356 3357 /* 3358 * we don't want to touch the inode after unlocking the page, 3359 * so we update the mapping writeback index now 3360 */ 3361 update_nr_written(page, wbc, nr_written + 1); 3362 3363 end = page_end; 3364 if (i_size <= start) { 3365 if (tree->ops && tree->ops->writepage_end_io_hook) 3366 tree->ops->writepage_end_io_hook(page, start, 3367 page_end, NULL, 1); 3368 goto done; 3369 } 3370 3371 blocksize = inode->i_sb->s_blocksize; 3372 3373 while (cur <= end) { 3374 u64 em_end; 3375 unsigned long max_nr; 3376 3377 if (cur >= i_size) { 3378 if (tree->ops && tree->ops->writepage_end_io_hook) 3379 tree->ops->writepage_end_io_hook(page, cur, 3380 page_end, NULL, 1); 3381 break; 3382 } 3383 em = epd->get_extent(inode, page, pg_offset, cur, 3384 end - cur + 1, 1); 3385 if (IS_ERR_OR_NULL(em)) { 3386 SetPageError(page); 3387 ret = PTR_ERR_OR_ZERO(em); 3388 break; 3389 } 3390 3391 extent_offset = cur - em->start; 3392 em_end = extent_map_end(em); 3393 BUG_ON(em_end <= cur); 3394 BUG_ON(end < cur); 3395 iosize = min(em_end - cur, end - cur + 1); 3396 iosize = ALIGN(iosize, blocksize); 3397 sector = (em->block_start + extent_offset) >> 9; 3398 bdev = em->bdev; 3399 block_start = em->block_start; 3400 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3401 free_extent_map(em); 3402 em = NULL; 3403 3404 /* 3405 * compressed and inline extents are written through other 3406 * paths in the FS 3407 */ 3408 if (compressed || block_start == EXTENT_MAP_HOLE || 3409 block_start == EXTENT_MAP_INLINE) { 3410 /* 3411 * end_io notification does not happen here for 3412 * compressed extents 3413 */ 3414 if (!compressed && tree->ops && 3415 tree->ops->writepage_end_io_hook) 3416 tree->ops->writepage_end_io_hook(page, cur, 3417 cur + iosize - 1, 3418 NULL, 1); 3419 else if (compressed) { 3420 /* we don't want to end_page_writeback on 3421 * a compressed extent. 
this happens 3422 * elsewhere 3423 */ 3424 nr++; 3425 } 3426 3427 cur += iosize; 3428 pg_offset += iosize; 3429 continue; 3430 } 3431 3432 max_nr = (i_size >> PAGE_SHIFT) + 1; 3433 3434 set_range_writeback(tree, cur, cur + iosize - 1); 3435 if (!PageWriteback(page)) { 3436 btrfs_err(BTRFS_I(inode)->root->fs_info, 3437 "page %lu not writeback, cur %llu end %llu", 3438 page->index, cur, end); 3439 } 3440 3441 ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, 3442 page, sector, iosize, pg_offset, 3443 bdev, &epd->bio, max_nr, 3444 end_bio_extent_writepage, 3445 0, 0, 0, false); 3446 if (ret) 3447 SetPageError(page); 3448 3449 cur = cur + iosize; 3450 pg_offset += iosize; 3451 nr++; 3452 } 3453 done: 3454 *nr_ret = nr; 3455 3456 done_unlocked: 3457 3458 /* drop our reference on any cached states */ 3459 free_extent_state(cached_state); 3460 return ret; 3461 } 3462 3463 /* 3464 * the writepage semantics are similar to regular writepage. extent 3465 * records are inserted to lock ranges in the tree, and as dirty areas 3466 * are found, they are marked writeback. Then the lock bits are removed 3467 * and the end_io handler clears the writeback ranges 3468 */ 3469 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 3470 void *data) 3471 { 3472 struct inode *inode = page->mapping->host; 3473 struct extent_page_data *epd = data; 3474 u64 start = page_offset(page); 3475 u64 page_end = start + PAGE_SIZE - 1; 3476 int ret; 3477 int nr = 0; 3478 size_t pg_offset = 0; 3479 loff_t i_size = i_size_read(inode); 3480 unsigned long end_index = i_size >> PAGE_SHIFT; 3481 int write_flags = 0; 3482 unsigned long nr_written = 0; 3483 3484 if (wbc->sync_mode == WB_SYNC_ALL) 3485 write_flags = REQ_SYNC; 3486 3487 trace___extent_writepage(page, inode, wbc); 3488 3489 WARN_ON(!PageLocked(page)); 3490 3491 ClearPageError(page); 3492 3493 pg_offset = i_size & (PAGE_SIZE - 1); 3494 if (page->index > end_index || 3495 (page->index == end_index && !pg_offset)) { 3496 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); 3497 unlock_page(page); 3498 return 0; 3499 } 3500 3501 if (page->index == end_index) { 3502 char *userpage; 3503 3504 userpage = kmap_atomic(page); 3505 memset(userpage + pg_offset, 0, 3506 PAGE_SIZE - pg_offset); 3507 kunmap_atomic(userpage); 3508 flush_dcache_page(page); 3509 } 3510 3511 pg_offset = 0; 3512 3513 set_page_extent_mapped(page); 3514 3515 ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); 3516 if (ret == 1) 3517 goto done_unlocked; 3518 if (ret) 3519 goto done; 3520 3521 ret = __extent_writepage_io(inode, page, wbc, epd, 3522 i_size, nr_written, write_flags, &nr); 3523 if (ret == 1) 3524 goto done_unlocked; 3525 3526 done: 3527 if (nr == 0) { 3528 /* make sure the mapping tag for page dirty gets cleared */ 3529 set_page_writeback(page); 3530 end_page_writeback(page); 3531 } 3532 if (PageError(page)) { 3533 ret = ret < 0 ? 
ret : -EIO; 3534 end_extent_writepage(page, ret, start, page_end); 3535 } 3536 unlock_page(page); 3537 return ret; 3538 3539 done_unlocked: 3540 return 0; 3541 } 3542 3543 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3544 { 3545 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, 3546 TASK_UNINTERRUPTIBLE); 3547 } 3548 3549 static noinline_for_stack int 3550 lock_extent_buffer_for_io(struct extent_buffer *eb, 3551 struct btrfs_fs_info *fs_info, 3552 struct extent_page_data *epd) 3553 { 3554 unsigned long i, num_pages; 3555 int flush = 0; 3556 int ret = 0; 3557 3558 if (!btrfs_try_tree_write_lock(eb)) { 3559 flush = 1; 3560 flush_write_bio(epd); 3561 btrfs_tree_lock(eb); 3562 } 3563 3564 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3565 btrfs_tree_unlock(eb); 3566 if (!epd->sync_io) 3567 return 0; 3568 if (!flush) { 3569 flush_write_bio(epd); 3570 flush = 1; 3571 } 3572 while (1) { 3573 wait_on_extent_buffer_writeback(eb); 3574 btrfs_tree_lock(eb); 3575 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3576 break; 3577 btrfs_tree_unlock(eb); 3578 } 3579 } 3580 3581 /* 3582 * We need to do this to prevent races in people who check if the eb is 3583 * under IO since we can end up having no IO bits set for a short period 3584 * of time. 3585 */ 3586 spin_lock(&eb->refs_lock); 3587 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3588 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3589 spin_unlock(&eb->refs_lock); 3590 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3591 __percpu_counter_add(&fs_info->dirty_metadata_bytes, 3592 -eb->len, 3593 fs_info->dirty_metadata_batch); 3594 ret = 1; 3595 } else { 3596 spin_unlock(&eb->refs_lock); 3597 } 3598 3599 btrfs_tree_unlock(eb); 3600 3601 if (!ret) 3602 return ret; 3603 3604 num_pages = num_extent_pages(eb->start, eb->len); 3605 for (i = 0; i < num_pages; i++) { 3606 struct page *p = eb->pages[i]; 3607 3608 if (!trylock_page(p)) { 3609 if (!flush) { 3610 flush_write_bio(epd); 3611 flush = 1; 3612 } 3613 lock_page(p); 3614 } 3615 } 3616 3617 return ret; 3618 } 3619 3620 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3621 { 3622 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3623 smp_mb__after_atomic(); 3624 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3625 } 3626 3627 static void set_btree_ioerr(struct page *page) 3628 { 3629 struct extent_buffer *eb = (struct extent_buffer *)page->private; 3630 3631 SetPageError(page); 3632 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 3633 return; 3634 3635 /* 3636 * If writeback for a btree extent that doesn't belong to a log tree 3637 * failed, increment the counter transaction->eb_write_errors. 3638 * We do this because while the transaction is running and before it's 3639 * committing (when we call filemap_fdata[write|wait]_range against 3640 * the btree inode), we might have 3641 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 3642 * returns an error or an error happens during writeback, when we're 3643 * committing the transaction we wouldn't know about it, since the pages 3644 * can be no longer dirty nor marked anymore for writeback (if a 3645 * subsequent modification to the extent buffer didn't happen before the 3646 * transaction commit), which makes filemap_fdata[write|wait]_range not 3647 * able to find the pages tagged with SetPageError at transaction 3648 * commit time. 
So if this happens we must abort the transaction, 3649 * otherwise we commit a super block with btree roots that point to 3650 * btree nodes/leafs whose content on disk is invalid - either garbage 3651 * or the content of some node/leaf from a past generation that got 3652 * cowed or deleted and is no longer valid. 3653 * 3654 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 3655 * not be enough - we need to distinguish between log tree extents vs 3656 * non-log tree extents, and the next filemap_fdatawait_range() call 3657 * will catch and clear such errors in the mapping - and that call might 3658 * be from a log sync and not from a transaction commit. Also, checking 3659 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 3660 * not done and would not be reliable - the eb might have been released 3661 * from memory and reading it back again means that flag would not be 3662 * set (since it's a runtime flag, not persisted on disk). 3663 * 3664 * Using the flags below in the btree inode also makes us achieve the 3665 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 3666 * writeback for all dirty pages and before filemap_fdatawait_range() 3667 * is called, the writeback for all dirty pages had already finished 3668 * with errors - because we were not using AS_EIO/AS_ENOSPC, 3669 * filemap_fdatawait_range() would return success, as it could not know 3670 * that writeback errors happened (the pages were no longer tagged for 3671 * writeback). 3672 */ 3673 switch (eb->log_index) { 3674 case -1: 3675 set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags); 3676 break; 3677 case 0: 3678 set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags); 3679 break; 3680 case 1: 3681 set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags); 3682 break; 3683 default: 3684 BUG(); /* unexpected, logic error */ 3685 } 3686 } 3687 3688 static void end_bio_extent_buffer_writepage(struct bio *bio) 3689 { 3690 struct bio_vec *bvec; 3691 struct extent_buffer *eb; 3692 int i, done; 3693 3694 bio_for_each_segment_all(bvec, bio, i) { 3695 struct page *page = bvec->bv_page; 3696 3697 eb = (struct extent_buffer *)page->private; 3698 BUG_ON(!eb); 3699 done = atomic_dec_and_test(&eb->io_pages); 3700 3701 if (bio->bi_error || 3702 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { 3703 ClearPageUptodate(page); 3704 set_btree_ioerr(page); 3705 } 3706 3707 end_page_writeback(page); 3708 3709 if (!done) 3710 continue; 3711 3712 end_extent_buffer_writeback(eb); 3713 } 3714 3715 bio_put(bio); 3716 } 3717 3718 static noinline_for_stack int write_one_eb(struct extent_buffer *eb, 3719 struct btrfs_fs_info *fs_info, 3720 struct writeback_control *wbc, 3721 struct extent_page_data *epd) 3722 { 3723 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3724 struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; 3725 u64 offset = eb->start; 3726 u32 nritems; 3727 unsigned long i, num_pages; 3728 unsigned long bio_flags = 0; 3729 unsigned long start, end; 3730 int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; 3731 int ret = 0; 3732 3733 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 3734 num_pages = num_extent_pages(eb->start, eb->len); 3735 atomic_set(&eb->io_pages, num_pages); 3736 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3737 bio_flags = EXTENT_BIO_TREE_LOG; 3738 3739 /* set btree blocks beyond nritems with 0 to avoid stale content. 
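 * (for a node this is everything past the last key pointer; for a leaf it is
 *  the unused gap between the item headers and the start of the item data,
 *  matching the two memset_extent_buffer() calls below)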
*/ 3740 nritems = btrfs_header_nritems(eb); 3741 if (btrfs_header_level(eb) > 0) { 3742 end = btrfs_node_key_ptr_offset(nritems); 3743 3744 memset_extent_buffer(eb, 0, end, eb->len - end); 3745 } else { 3746 /* 3747 * leaf: 3748 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 3749 */ 3750 start = btrfs_item_nr_offset(nritems); 3751 end = btrfs_leaf_data(eb) + 3752 leaf_data_end(fs_info->tree_root, eb); 3753 memset_extent_buffer(eb, 0, start, end - start); 3754 } 3755 3756 for (i = 0; i < num_pages; i++) { 3757 struct page *p = eb->pages[i]; 3758 3759 clear_page_dirty_for_io(p); 3760 set_page_writeback(p); 3761 ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, 3762 p, offset >> 9, PAGE_SIZE, 0, bdev, 3763 &epd->bio, -1, 3764 end_bio_extent_buffer_writepage, 3765 0, epd->bio_flags, bio_flags, false); 3766 epd->bio_flags = bio_flags; 3767 if (ret) { 3768 set_btree_ioerr(p); 3769 end_page_writeback(p); 3770 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3771 end_extent_buffer_writeback(eb); 3772 ret = -EIO; 3773 break; 3774 } 3775 offset += PAGE_SIZE; 3776 update_nr_written(p, wbc, 1); 3777 unlock_page(p); 3778 } 3779 3780 if (unlikely(ret)) { 3781 for (; i < num_pages; i++) { 3782 struct page *p = eb->pages[i]; 3783 clear_page_dirty_for_io(p); 3784 unlock_page(p); 3785 } 3786 } 3787 3788 return ret; 3789 } 3790 3791 int btree_write_cache_pages(struct address_space *mapping, 3792 struct writeback_control *wbc) 3793 { 3794 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3795 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; 3796 struct extent_buffer *eb, *prev_eb = NULL; 3797 struct extent_page_data epd = { 3798 .bio = NULL, 3799 .tree = tree, 3800 .extent_locked = 0, 3801 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3802 .bio_flags = 0, 3803 }; 3804 int ret = 0; 3805 int done = 0; 3806 int nr_to_write_done = 0; 3807 struct pagevec pvec; 3808 int nr_pages; 3809 pgoff_t index; 3810 pgoff_t end; /* Inclusive */ 3811 int scanned = 0; 3812 int tag; 3813 3814 pagevec_init(&pvec, 0); 3815 if (wbc->range_cyclic) { 3816 index = mapping->writeback_index; /* Start from prev offset */ 3817 end = -1; 3818 } else { 3819 index = wbc->range_start >> PAGE_SHIFT; 3820 end = wbc->range_end >> PAGE_SHIFT; 3821 scanned = 1; 3822 } 3823 if (wbc->sync_mode == WB_SYNC_ALL) 3824 tag = PAGECACHE_TAG_TOWRITE; 3825 else 3826 tag = PAGECACHE_TAG_DIRTY; 3827 retry: 3828 if (wbc->sync_mode == WB_SYNC_ALL) 3829 tag_pages_for_writeback(mapping, index, end); 3830 while (!done && !nr_to_write_done && (index <= end) && 3831 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3832 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3833 unsigned i; 3834 3835 scanned = 1; 3836 for (i = 0; i < nr_pages; i++) { 3837 struct page *page = pvec.pages[i]; 3838 3839 if (!PagePrivate(page)) 3840 continue; 3841 3842 if (!wbc->range_cyclic && page->index > end) { 3843 done = 1; 3844 break; 3845 } 3846 3847 spin_lock(&mapping->private_lock); 3848 if (!PagePrivate(page)) { 3849 spin_unlock(&mapping->private_lock); 3850 continue; 3851 } 3852 3853 eb = (struct extent_buffer *)page->private; 3854 3855 /* 3856 * Shouldn't happen and normally this would be a BUG_ON 3857 * but no sense in crashing the users box for something 3858 * we can survive anyway. 
3859 */ 3860 if (WARN_ON(!eb)) { 3861 spin_unlock(&mapping->private_lock); 3862 continue; 3863 } 3864 3865 if (eb == prev_eb) { 3866 spin_unlock(&mapping->private_lock); 3867 continue; 3868 } 3869 3870 ret = atomic_inc_not_zero(&eb->refs); 3871 spin_unlock(&mapping->private_lock); 3872 if (!ret) 3873 continue; 3874 3875 prev_eb = eb; 3876 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3877 if (!ret) { 3878 free_extent_buffer(eb); 3879 continue; 3880 } 3881 3882 ret = write_one_eb(eb, fs_info, wbc, &epd); 3883 if (ret) { 3884 done = 1; 3885 free_extent_buffer(eb); 3886 break; 3887 } 3888 free_extent_buffer(eb); 3889 3890 /* 3891 * the filesystem may choose to bump up nr_to_write. 3892 * We have to make sure to honor the new nr_to_write 3893 * at any time 3894 */ 3895 nr_to_write_done = wbc->nr_to_write <= 0; 3896 } 3897 pagevec_release(&pvec); 3898 cond_resched(); 3899 } 3900 if (!scanned && !done) { 3901 /* 3902 * We hit the last page and there is more work to be done: wrap 3903 * back to the start of the file 3904 */ 3905 scanned = 1; 3906 index = 0; 3907 goto retry; 3908 } 3909 flush_write_bio(&epd); 3910 return ret; 3911 } 3912 3913 /** 3914 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 3915 * @mapping: address space structure to write 3916 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 3917 * @writepage: function called for each page 3918 * @data: data passed to writepage function 3919 * 3920 * If a page is already under I/O, write_cache_pages() skips it, even 3921 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 3922 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 3923 * and msync() need to guarantee that all the data which was dirty at the time 3924 * the call was made get new I/O started against them. If wbc->sync_mode is 3925 * WB_SYNC_ALL then we were called for data integrity and we must wait for 3926 * existing IO to complete. 3927 */ 3928 static int extent_write_cache_pages(struct extent_io_tree *tree, 3929 struct address_space *mapping, 3930 struct writeback_control *wbc, 3931 writepage_t writepage, void *data, 3932 void (*flush_fn)(void *)) 3933 { 3934 struct inode *inode = mapping->host; 3935 int ret = 0; 3936 int done = 0; 3937 int nr_to_write_done = 0; 3938 struct pagevec pvec; 3939 int nr_pages; 3940 pgoff_t index; 3941 pgoff_t end; /* Inclusive */ 3942 pgoff_t done_index; 3943 int range_whole = 0; 3944 int scanned = 0; 3945 int tag; 3946 3947 /* 3948 * We have to hold onto the inode so that ordered extents can do their 3949 * work when the IO finishes. The alternative to this is failing to add 3950 * an ordered extent if the igrab() fails there and that is a huge pain 3951 * to deal with, so instead just hold onto the inode throughout the 3952 * writepages operation. If it fails here we are freeing up the inode 3953 * anyway and we'd rather not waste our time writing out stuff that is 3954 * going to be truncated anyway. 
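 * (the reference taken by igrab() below is dropped through
 *  btrfs_add_delayed_iput() once the page walk is finished)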
3955 */ 3956 if (!igrab(inode)) 3957 return 0; 3958 3959 pagevec_init(&pvec, 0); 3960 if (wbc->range_cyclic) { 3961 index = mapping->writeback_index; /* Start from prev offset */ 3962 end = -1; 3963 } else { 3964 index = wbc->range_start >> PAGE_SHIFT; 3965 end = wbc->range_end >> PAGE_SHIFT; 3966 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 3967 range_whole = 1; 3968 scanned = 1; 3969 } 3970 if (wbc->sync_mode == WB_SYNC_ALL) 3971 tag = PAGECACHE_TAG_TOWRITE; 3972 else 3973 tag = PAGECACHE_TAG_DIRTY; 3974 retry: 3975 if (wbc->sync_mode == WB_SYNC_ALL) 3976 tag_pages_for_writeback(mapping, index, end); 3977 done_index = index; 3978 while (!done && !nr_to_write_done && (index <= end) && 3979 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3980 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3981 unsigned i; 3982 3983 scanned = 1; 3984 for (i = 0; i < nr_pages; i++) { 3985 struct page *page = pvec.pages[i]; 3986 3987 done_index = page->index; 3988 /* 3989 * At this point we hold neither mapping->tree_lock nor 3990 * lock on the page itself: the page may be truncated or 3991 * invalidated (changing page->mapping to NULL), or even 3992 * swizzled back from swapper_space to tmpfs file 3993 * mapping 3994 */ 3995 if (!trylock_page(page)) { 3996 flush_fn(data); 3997 lock_page(page); 3998 } 3999 4000 if (unlikely(page->mapping != mapping)) { 4001 unlock_page(page); 4002 continue; 4003 } 4004 4005 if (!wbc->range_cyclic && page->index > end) { 4006 done = 1; 4007 unlock_page(page); 4008 continue; 4009 } 4010 4011 if (wbc->sync_mode != WB_SYNC_NONE) { 4012 if (PageWriteback(page)) 4013 flush_fn(data); 4014 wait_on_page_writeback(page); 4015 } 4016 4017 if (PageWriteback(page) || 4018 !clear_page_dirty_for_io(page)) { 4019 unlock_page(page); 4020 continue; 4021 } 4022 4023 ret = (*writepage)(page, wbc, data); 4024 4025 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 4026 unlock_page(page); 4027 ret = 0; 4028 } 4029 if (ret < 0) { 4030 /* 4031 * done_index is set past this page, 4032 * so media errors will not choke 4033 * background writeout for the entire 4034 * file. This has consequences for 4035 * range_cyclic semantics (ie. it may 4036 * not be suitable for data integrity 4037 * writeout). 4038 */ 4039 done_index = page->index + 1; 4040 done = 1; 4041 break; 4042 } 4043 4044 /* 4045 * the filesystem may choose to bump up nr_to_write. 4046 * We have to make sure to honor the new nr_to_write 4047 * at any time 4048 */ 4049 nr_to_write_done = wbc->nr_to_write <= 0; 4050 } 4051 pagevec_release(&pvec); 4052 cond_resched(); 4053 } 4054 if (!scanned && !done) { 4055 /* 4056 * We hit the last page and there is more work to be done: wrap 4057 * back to the start of the file 4058 */ 4059 scanned = 1; 4060 index = 0; 4061 goto retry; 4062 } 4063 4064 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 4065 mapping->writeback_index = done_index; 4066 4067 btrfs_add_delayed_iput(inode); 4068 return ret; 4069 } 4070 4071 static void flush_epd_write_bio(struct extent_page_data *epd) 4072 { 4073 if (epd->bio) { 4074 int ret; 4075 4076 bio_set_op_attrs(epd->bio, REQ_OP_WRITE, 4077 epd->sync_io ? 
REQ_SYNC : 0); 4078 4079 ret = submit_one_bio(epd->bio, 0, epd->bio_flags); 4080 BUG_ON(ret < 0); /* -ENOMEM */ 4081 epd->bio = NULL; 4082 } 4083 } 4084 4085 static noinline void flush_write_bio(void *data) 4086 { 4087 struct extent_page_data *epd = data; 4088 flush_epd_write_bio(epd); 4089 } 4090 4091 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 4092 get_extent_t *get_extent, 4093 struct writeback_control *wbc) 4094 { 4095 int ret; 4096 struct extent_page_data epd = { 4097 .bio = NULL, 4098 .tree = tree, 4099 .get_extent = get_extent, 4100 .extent_locked = 0, 4101 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4102 .bio_flags = 0, 4103 }; 4104 4105 ret = __extent_writepage(page, wbc, &epd); 4106 4107 flush_epd_write_bio(&epd); 4108 return ret; 4109 } 4110 4111 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 4112 u64 start, u64 end, get_extent_t *get_extent, 4113 int mode) 4114 { 4115 int ret = 0; 4116 struct address_space *mapping = inode->i_mapping; 4117 struct page *page; 4118 unsigned long nr_pages = (end - start + PAGE_SIZE) >> 4119 PAGE_SHIFT; 4120 4121 struct extent_page_data epd = { 4122 .bio = NULL, 4123 .tree = tree, 4124 .get_extent = get_extent, 4125 .extent_locked = 1, 4126 .sync_io = mode == WB_SYNC_ALL, 4127 .bio_flags = 0, 4128 }; 4129 struct writeback_control wbc_writepages = { 4130 .sync_mode = mode, 4131 .nr_to_write = nr_pages * 2, 4132 .range_start = start, 4133 .range_end = end + 1, 4134 }; 4135 4136 while (start <= end) { 4137 page = find_get_page(mapping, start >> PAGE_SHIFT); 4138 if (clear_page_dirty_for_io(page)) 4139 ret = __extent_writepage(page, &wbc_writepages, &epd); 4140 else { 4141 if (tree->ops && tree->ops->writepage_end_io_hook) 4142 tree->ops->writepage_end_io_hook(page, start, 4143 start + PAGE_SIZE - 1, 4144 NULL, 1); 4145 unlock_page(page); 4146 } 4147 put_page(page); 4148 start += PAGE_SIZE; 4149 } 4150 4151 flush_epd_write_bio(&epd); 4152 return ret; 4153 } 4154 4155 int extent_writepages(struct extent_io_tree *tree, 4156 struct address_space *mapping, 4157 get_extent_t *get_extent, 4158 struct writeback_control *wbc) 4159 { 4160 int ret = 0; 4161 struct extent_page_data epd = { 4162 .bio = NULL, 4163 .tree = tree, 4164 .get_extent = get_extent, 4165 .extent_locked = 0, 4166 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4167 .bio_flags = 0, 4168 }; 4169 4170 ret = extent_write_cache_pages(tree, mapping, wbc, 4171 __extent_writepage, &epd, 4172 flush_write_bio); 4173 flush_epd_write_bio(&epd); 4174 return ret; 4175 } 4176 4177 int extent_readpages(struct extent_io_tree *tree, 4178 struct address_space *mapping, 4179 struct list_head *pages, unsigned nr_pages, 4180 get_extent_t get_extent) 4181 { 4182 struct bio *bio = NULL; 4183 unsigned page_idx; 4184 unsigned long bio_flags = 0; 4185 struct page *pagepool[16]; 4186 struct page *page; 4187 struct extent_map *em_cached = NULL; 4188 int nr = 0; 4189 u64 prev_em_start = (u64)-1; 4190 4191 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 4192 page = list_entry(pages->prev, struct page, lru); 4193 4194 prefetchw(&page->flags); 4195 list_del(&page->lru); 4196 if (add_to_page_cache_lru(page, mapping, 4197 page->index, 4198 readahead_gfp_mask(mapping))) { 4199 put_page(page); 4200 continue; 4201 } 4202 4203 pagepool[nr++] = page; 4204 if (nr < ARRAY_SIZE(pagepool)) 4205 continue; 4206 __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, 4207 &bio, 0, &bio_flags, &prev_em_start); 4208 nr = 0; 4209 } 4210 if (nr) 4211 
__extent_readpages(tree, pagepool, nr, get_extent, &em_cached, 4212 &bio, 0, &bio_flags, &prev_em_start); 4213 4214 if (em_cached) 4215 free_extent_map(em_cached); 4216 4217 BUG_ON(!list_empty(pages)); 4218 if (bio) 4219 return submit_one_bio(bio, 0, bio_flags); 4220 return 0; 4221 } 4222 4223 /* 4224 * basic invalidatepage code, this waits on any locked or writeback 4225 * ranges corresponding to the page, and then deletes any extent state 4226 * records from the tree 4227 */ 4228 int extent_invalidatepage(struct extent_io_tree *tree, 4229 struct page *page, unsigned long offset) 4230 { 4231 struct extent_state *cached_state = NULL; 4232 u64 start = page_offset(page); 4233 u64 end = start + PAGE_SIZE - 1; 4234 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 4235 4236 start += ALIGN(offset, blocksize); 4237 if (start > end) 4238 return 0; 4239 4240 lock_extent_bits(tree, start, end, &cached_state); 4241 wait_on_page_writeback(page); 4242 clear_extent_bit(tree, start, end, 4243 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 4244 EXTENT_DO_ACCOUNTING, 4245 1, 1, &cached_state, GFP_NOFS); 4246 return 0; 4247 } 4248 4249 /* 4250 * a helper for releasepage, this tests for areas of the page that 4251 * are locked or under IO and drops the related state bits if it is safe 4252 * to drop the page. 4253 */ 4254 static int try_release_extent_state(struct extent_map_tree *map, 4255 struct extent_io_tree *tree, 4256 struct page *page, gfp_t mask) 4257 { 4258 u64 start = page_offset(page); 4259 u64 end = start + PAGE_SIZE - 1; 4260 int ret = 1; 4261 4262 if (test_range_bit(tree, start, end, 4263 EXTENT_IOBITS, 0, NULL)) 4264 ret = 0; 4265 else { 4266 if ((mask & GFP_NOFS) == GFP_NOFS) 4267 mask = GFP_NOFS; 4268 /* 4269 * at this point we can safely clear everything except the 4270 * locked bit and the nodatasum bit 4271 */ 4272 ret = clear_extent_bit(tree, start, end, 4273 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 4274 0, 0, NULL, mask); 4275 4276 /* if clear_extent_bit failed for enomem reasons, 4277 * we can't allow the release to continue. 4278 */ 4279 if (ret < 0) 4280 ret = 0; 4281 else 4282 ret = 1; 4283 } 4284 return ret; 4285 } 4286 4287 /* 4288 * a helper for releasepage. 
As long as there are no locked extents 4289 * in the range corresponding to the page, both state records and extent 4290 * map records are removed 4291 */ 4292 int try_release_extent_mapping(struct extent_map_tree *map, 4293 struct extent_io_tree *tree, struct page *page, 4294 gfp_t mask) 4295 { 4296 struct extent_map *em; 4297 u64 start = page_offset(page); 4298 u64 end = start + PAGE_SIZE - 1; 4299 4300 if (gfpflags_allow_blocking(mask) && 4301 page->mapping->host->i_size > SZ_16M) { 4302 u64 len; 4303 while (start <= end) { 4304 len = end - start + 1; 4305 write_lock(&map->lock); 4306 em = lookup_extent_mapping(map, start, len); 4307 if (!em) { 4308 write_unlock(&map->lock); 4309 break; 4310 } 4311 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 4312 em->start != start) { 4313 write_unlock(&map->lock); 4314 free_extent_map(em); 4315 break; 4316 } 4317 if (!test_range_bit(tree, em->start, 4318 extent_map_end(em) - 1, 4319 EXTENT_LOCKED | EXTENT_WRITEBACK, 4320 0, NULL)) { 4321 remove_extent_mapping(map, em); 4322 /* once for the rb tree */ 4323 free_extent_map(em); 4324 } 4325 start = extent_map_end(em); 4326 write_unlock(&map->lock); 4327 4328 /* once for us */ 4329 free_extent_map(em); 4330 } 4331 } 4332 return try_release_extent_state(map, tree, page, mask); 4333 } 4334 4335 /* 4336 * helper function for fiemap, which doesn't want to see any holes. 4337 * This maps until we find something past 'last' 4338 */ 4339 static struct extent_map *get_extent_skip_holes(struct inode *inode, 4340 u64 offset, 4341 u64 last, 4342 get_extent_t *get_extent) 4343 { 4344 u64 sectorsize = BTRFS_I(inode)->root->sectorsize; 4345 struct extent_map *em; 4346 u64 len; 4347 4348 if (offset >= last) 4349 return NULL; 4350 4351 while (1) { 4352 len = last - offset; 4353 if (len == 0) 4354 break; 4355 len = ALIGN(len, sectorsize); 4356 em = get_extent(inode, NULL, 0, offset, len, 0); 4357 if (IS_ERR_OR_NULL(em)) 4358 return em; 4359 4360 /* if this isn't a hole return it */ 4361 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 4362 em->block_start != EXTENT_MAP_HOLE) { 4363 return em; 4364 } 4365 4366 /* this is a hole, advance to the next extent */ 4367 offset = extent_map_end(em); 4368 free_extent_map(em); 4369 if (offset >= last) 4370 break; 4371 } 4372 return NULL; 4373 } 4374 4375 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4376 __u64 start, __u64 len, get_extent_t *get_extent) 4377 { 4378 int ret = 0; 4379 u64 off = start; 4380 u64 max = start + len; 4381 u32 flags = 0; 4382 u32 found_type; 4383 u64 last; 4384 u64 last_for_get_extent = 0; 4385 u64 disko = 0; 4386 u64 isize = i_size_read(inode); 4387 struct btrfs_key found_key; 4388 struct extent_map *em = NULL; 4389 struct extent_state *cached_state = NULL; 4390 struct btrfs_path *path; 4391 struct btrfs_root *root = BTRFS_I(inode)->root; 4392 int end = 0; 4393 u64 em_start = 0; 4394 u64 em_len = 0; 4395 u64 em_end = 0; 4396 4397 if (len == 0) 4398 return -EINVAL; 4399 4400 path = btrfs_alloc_path(); 4401 if (!path) 4402 return -ENOMEM; 4403 path->leave_spinning = 1; 4404 4405 start = round_down(start, BTRFS_I(inode)->root->sectorsize); 4406 len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start; 4407 4408 /* 4409 * lookup the last file extent. 
We're not using i_size here 4410 * because there might be preallocation past i_size 4411 */ 4412 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, 4413 0); 4414 if (ret < 0) { 4415 btrfs_free_path(path); 4416 return ret; 4417 } else { 4418 WARN_ON(!ret); 4419 if (ret == 1) 4420 ret = 0; 4421 } 4422 4423 path->slots[0]--; 4424 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 4425 found_type = found_key.type; 4426 4427 /* No extents, but there might be delalloc bits */ 4428 if (found_key.objectid != btrfs_ino(inode) || 4429 found_type != BTRFS_EXTENT_DATA_KEY) { 4430 /* have to trust i_size as the end */ 4431 last = (u64)-1; 4432 last_for_get_extent = isize; 4433 } else { 4434 /* 4435 * remember the start of the last extent. There are a 4436 * bunch of different factors that go into the length of the 4437 * extent, so its much less complex to remember where it started 4438 */ 4439 last = found_key.offset; 4440 last_for_get_extent = last + 1; 4441 } 4442 btrfs_release_path(path); 4443 4444 /* 4445 * we might have some extents allocated but more delalloc past those 4446 * extents. so, we trust isize unless the start of the last extent is 4447 * beyond isize 4448 */ 4449 if (last < isize) { 4450 last = (u64)-1; 4451 last_for_get_extent = isize; 4452 } 4453 4454 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4455 &cached_state); 4456 4457 em = get_extent_skip_holes(inode, start, last_for_get_extent, 4458 get_extent); 4459 if (!em) 4460 goto out; 4461 if (IS_ERR(em)) { 4462 ret = PTR_ERR(em); 4463 goto out; 4464 } 4465 4466 while (!end) { 4467 u64 offset_in_extent = 0; 4468 4469 /* break if the extent we found is outside the range */ 4470 if (em->start >= max || extent_map_end(em) < off) 4471 break; 4472 4473 /* 4474 * get_extent may return an extent that starts before our 4475 * requested range. We have to make sure the ranges 4476 * we return to fiemap always move forward and don't 4477 * overlap, so adjust the offsets here 4478 */ 4479 em_start = max(em->start, off); 4480 4481 /* 4482 * record the offset from the start of the extent 4483 * for adjusting the disk offset below. Only do this if the 4484 * extent isn't compressed since our in ram offset may be past 4485 * what we have actually allocated on disk. 
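 * (For an uncompressed extent, the disk offset reported below works out
 * to em->block_start + (em_start - em->start).)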
4486 */ 4487 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4488 offset_in_extent = em_start - em->start; 4489 em_end = extent_map_end(em); 4490 em_len = em_end - em_start; 4491 disko = 0; 4492 flags = 0; 4493 4494 /* 4495 * bump off for our next call to get_extent 4496 */ 4497 off = extent_map_end(em); 4498 if (off >= max) 4499 end = 1; 4500 4501 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 4502 end = 1; 4503 flags |= FIEMAP_EXTENT_LAST; 4504 } else if (em->block_start == EXTENT_MAP_INLINE) { 4505 flags |= (FIEMAP_EXTENT_DATA_INLINE | 4506 FIEMAP_EXTENT_NOT_ALIGNED); 4507 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 4508 flags |= (FIEMAP_EXTENT_DELALLOC | 4509 FIEMAP_EXTENT_UNKNOWN); 4510 } else if (fieinfo->fi_extents_max) { 4511 struct btrfs_trans_handle *trans; 4512 4513 u64 bytenr = em->block_start - 4514 (em->start - em->orig_start); 4515 4516 disko = em->block_start + offset_in_extent; 4517 4518 /* 4519 * We need a trans handle to get delayed refs 4520 */ 4521 trans = btrfs_join_transaction(root); 4522 /* 4523 * It's OK if we can't start a trans we can still check 4524 * from commit_root 4525 */ 4526 if (IS_ERR(trans)) 4527 trans = NULL; 4528 4529 /* 4530 * As btrfs supports shared space, this information 4531 * can be exported to userspace tools via 4532 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 4533 * then we're just getting a count and we can skip the 4534 * lookup stuff. 4535 */ 4536 ret = btrfs_check_shared(trans, root->fs_info, 4537 root->objectid, 4538 btrfs_ino(inode), bytenr); 4539 if (trans) 4540 btrfs_end_transaction(trans, root); 4541 if (ret < 0) 4542 goto out_free; 4543 if (ret) 4544 flags |= FIEMAP_EXTENT_SHARED; 4545 ret = 0; 4546 } 4547 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4548 flags |= FIEMAP_EXTENT_ENCODED; 4549 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4550 flags |= FIEMAP_EXTENT_UNWRITTEN; 4551 4552 free_extent_map(em); 4553 em = NULL; 4554 if ((em_start >= last) || em_len == (u64)-1 || 4555 (last == (u64)-1 && isize <= em_end)) { 4556 flags |= FIEMAP_EXTENT_LAST; 4557 end = 1; 4558 } 4559 4560 /* now scan forward to see if this is really the last extent. */ 4561 em = get_extent_skip_holes(inode, off, last_for_get_extent, 4562 get_extent); 4563 if (IS_ERR(em)) { 4564 ret = PTR_ERR(em); 4565 goto out; 4566 } 4567 if (!em) { 4568 flags |= FIEMAP_EXTENT_LAST; 4569 end = 1; 4570 } 4571 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 4572 em_len, flags); 4573 if (ret) { 4574 if (ret == 1) 4575 ret = 0; 4576 goto out_free; 4577 } 4578 } 4579 out_free: 4580 free_extent_map(em); 4581 out: 4582 btrfs_free_path(path); 4583 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4584 &cached_state, GFP_NOFS); 4585 return ret; 4586 } 4587 4588 static void __free_extent_buffer(struct extent_buffer *eb) 4589 { 4590 btrfs_leak_debug_del(&eb->leak_list); 4591 kmem_cache_free(extent_buffer_cache, eb); 4592 } 4593 4594 int extent_buffer_under_io(struct extent_buffer *eb) 4595 { 4596 return (atomic_read(&eb->io_pages) || 4597 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 4598 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4599 } 4600 4601 /* 4602 * Helper for releasing extent buffer page. 
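 * Detaches every page from @eb and drops the references held on it: one
 * for page->private (if the page is still attached to this eb) and one
 * for the original page allocation.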
4603 */ 4604 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) 4605 { 4606 unsigned long index; 4607 struct page *page; 4608 int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4609 4610 BUG_ON(extent_buffer_under_io(eb)); 4611 4612 index = num_extent_pages(eb->start, eb->len); 4613 if (index == 0) 4614 return; 4615 4616 do { 4617 index--; 4618 page = eb->pages[index]; 4619 if (!page) 4620 continue; 4621 if (mapped) 4622 spin_lock(&page->mapping->private_lock); 4623 /* 4624 * We do this since we'll remove the pages after we've 4625 * removed the eb from the radix tree, so we could race 4626 * and have this page now attached to the new eb. So 4627 * only clear page_private if it's still connected to 4628 * this eb. 4629 */ 4630 if (PagePrivate(page) && 4631 page->private == (unsigned long)eb) { 4632 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4633 BUG_ON(PageDirty(page)); 4634 BUG_ON(PageWriteback(page)); 4635 /* 4636 * We need to make sure we haven't be attached 4637 * to a new eb. 4638 */ 4639 ClearPagePrivate(page); 4640 set_page_private(page, 0); 4641 /* One for the page private */ 4642 put_page(page); 4643 } 4644 4645 if (mapped) 4646 spin_unlock(&page->mapping->private_lock); 4647 4648 /* One for when we allocated the page */ 4649 put_page(page); 4650 } while (index != 0); 4651 } 4652 4653 /* 4654 * Helper for releasing the extent buffer. 4655 */ 4656 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 4657 { 4658 btrfs_release_extent_buffer_page(eb); 4659 __free_extent_buffer(eb); 4660 } 4661 4662 static struct extent_buffer * 4663 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, 4664 unsigned long len) 4665 { 4666 struct extent_buffer *eb = NULL; 4667 4668 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); 4669 eb->start = start; 4670 eb->len = len; 4671 eb->fs_info = fs_info; 4672 eb->bflags = 0; 4673 rwlock_init(&eb->lock); 4674 atomic_set(&eb->write_locks, 0); 4675 atomic_set(&eb->read_locks, 0); 4676 atomic_set(&eb->blocking_readers, 0); 4677 atomic_set(&eb->blocking_writers, 0); 4678 atomic_set(&eb->spinning_readers, 0); 4679 atomic_set(&eb->spinning_writers, 0); 4680 eb->lock_nested = 0; 4681 init_waitqueue_head(&eb->write_lock_wq); 4682 init_waitqueue_head(&eb->read_lock_wq); 4683 4684 btrfs_leak_debug_add(&eb->leak_list, &buffers); 4685 4686 spin_lock_init(&eb->refs_lock); 4687 atomic_set(&eb->refs, 1); 4688 atomic_set(&eb->io_pages, 0); 4689 4690 /* 4691 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages 4692 */ 4693 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE 4694 > MAX_INLINE_EXTENT_BUFFER_SIZE); 4695 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); 4696 4697 return eb; 4698 } 4699 4700 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) 4701 { 4702 unsigned long i; 4703 struct page *p; 4704 struct extent_buffer *new; 4705 unsigned long num_pages = num_extent_pages(src->start, src->len); 4706 4707 new = __alloc_extent_buffer(src->fs_info, src->start, src->len); 4708 if (new == NULL) 4709 return NULL; 4710 4711 for (i = 0; i < num_pages; i++) { 4712 p = alloc_page(GFP_NOFS); 4713 if (!p) { 4714 btrfs_release_extent_buffer(new); 4715 return NULL; 4716 } 4717 attach_extent_buffer_page(new, p); 4718 WARN_ON(PageDirty(p)); 4719 SetPageUptodate(p); 4720 new->pages[i] = p; 4721 } 4722 4723 copy_extent_buffer(new, src, 0, 0, src->len); 4724 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); 4725 set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); 4726 4727 return new; 
4728 } 4729 4730 struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4731 u64 start, unsigned long len) 4732 { 4733 struct extent_buffer *eb; 4734 unsigned long num_pages; 4735 unsigned long i; 4736 4737 num_pages = num_extent_pages(start, len); 4738 4739 eb = __alloc_extent_buffer(fs_info, start, len); 4740 if (!eb) 4741 return NULL; 4742 4743 for (i = 0; i < num_pages; i++) { 4744 eb->pages[i] = alloc_page(GFP_NOFS); 4745 if (!eb->pages[i]) 4746 goto err; 4747 } 4748 set_extent_buffer_uptodate(eb); 4749 btrfs_set_header_nritems(eb, 0); 4750 set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4751 4752 return eb; 4753 err: 4754 for (; i > 0; i--) 4755 __free_page(eb->pages[i - 1]); 4756 __free_extent_buffer(eb); 4757 return NULL; 4758 } 4759 4760 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4761 u64 start, u32 nodesize) 4762 { 4763 unsigned long len; 4764 4765 if (!fs_info) { 4766 /* 4767 * Called only from tests that don't always have a fs_info 4768 * available 4769 */ 4770 len = nodesize; 4771 } else { 4772 len = fs_info->tree_root->nodesize; 4773 } 4774 4775 return __alloc_dummy_extent_buffer(fs_info, start, len); 4776 } 4777 4778 static void check_buffer_tree_ref(struct extent_buffer *eb) 4779 { 4780 int refs; 4781 /* the ref bit is tricky. We have to make sure it is set 4782 * if we have the buffer dirty. Otherwise the 4783 * code to free a buffer can end up dropping a dirty 4784 * page 4785 * 4786 * Once the ref bit is set, it won't go away while the 4787 * buffer is dirty or in writeback, and it also won't 4788 * go away while we have the reference count on the 4789 * eb bumped. 4790 * 4791 * We can't just set the ref bit without bumping the 4792 * ref on the eb because free_extent_buffer might 4793 * see the ref bit and try to clear it. If this happens 4794 * free_extent_buffer might end up dropping our original 4795 * ref by mistake and freeing the page before we are able 4796 * to add one more ref. 4797 * 4798 * So bump the ref count first, then set the bit. If someone 4799 * beat us to it, drop the ref we added. 4800 */ 4801 refs = atomic_read(&eb->refs); 4802 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4803 return; 4804 4805 spin_lock(&eb->refs_lock); 4806 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4807 atomic_inc(&eb->refs); 4808 spin_unlock(&eb->refs_lock); 4809 } 4810 4811 static void mark_extent_buffer_accessed(struct extent_buffer *eb, 4812 struct page *accessed) 4813 { 4814 unsigned long num_pages, i; 4815 4816 check_buffer_tree_ref(eb); 4817 4818 num_pages = num_extent_pages(eb->start, eb->len); 4819 for (i = 0; i < num_pages; i++) { 4820 struct page *p = eb->pages[i]; 4821 4822 if (p != accessed) 4823 mark_page_accessed(p); 4824 } 4825 } 4826 4827 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 4828 u64 start) 4829 { 4830 struct extent_buffer *eb; 4831 4832 rcu_read_lock(); 4833 eb = radix_tree_lookup(&fs_info->buffer_radix, 4834 start >> PAGE_SHIFT); 4835 if (eb && atomic_inc_not_zero(&eb->refs)) { 4836 rcu_read_unlock(); 4837 /* 4838 * Lock our eb's refs_lock to avoid races with 4839 * free_extent_buffer. 
When we get our eb it might be flagged 4840 * with EXTENT_BUFFER_STALE and another task running 4841 * free_extent_buffer might have seen that flag set, 4842 * eb->refs == 2, that the buffer isn't under IO (dirty and 4843 * writeback flags not set) and it's still in the tree (flag 4844 * EXTENT_BUFFER_TREE_REF set), therefore being in the process 4845 * of decrementing the extent buffer's reference count twice. 4846 * So here we could race and increment the eb's reference count, 4847 * clear its stale flag, mark it as dirty and drop our reference 4848 * before the other task finishes executing free_extent_buffer, 4849 * which would later result in an attempt to free an extent 4850 * buffer that is dirty. 4851 */ 4852 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { 4853 spin_lock(&eb->refs_lock); 4854 spin_unlock(&eb->refs_lock); 4855 } 4856 mark_extent_buffer_accessed(eb, NULL); 4857 return eb; 4858 } 4859 rcu_read_unlock(); 4860 4861 return NULL; 4862 } 4863 4864 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4865 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 4866 u64 start, u32 nodesize) 4867 { 4868 struct extent_buffer *eb, *exists = NULL; 4869 int ret; 4870 4871 eb = find_extent_buffer(fs_info, start); 4872 if (eb) 4873 return eb; 4874 eb = alloc_dummy_extent_buffer(fs_info, start, nodesize); 4875 if (!eb) 4876 return NULL; 4877 eb->fs_info = fs_info; 4878 again: 4879 ret = radix_tree_preload(GFP_NOFS); 4880 if (ret) 4881 goto free_eb; 4882 spin_lock(&fs_info->buffer_lock); 4883 ret = radix_tree_insert(&fs_info->buffer_radix, 4884 start >> PAGE_SHIFT, eb); 4885 spin_unlock(&fs_info->buffer_lock); 4886 radix_tree_preload_end(); 4887 if (ret == -EEXIST) { 4888 exists = find_extent_buffer(fs_info, start); 4889 if (exists) 4890 goto free_eb; 4891 else 4892 goto again; 4893 } 4894 check_buffer_tree_ref(eb); 4895 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 4896 4897 /* 4898 * We will free dummy extent buffer's if they come into 4899 * free_extent_buffer with a ref count of 2, but if we are using this we 4900 * want the buffers to stay in memory until we're done with them, so 4901 * bump the ref count again. 
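 * (This pairs with the refs == 2 && EXTENT_BUFFER_DUMMY check in
 * free_extent_buffer() below.)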
4902 */ 4903 atomic_inc(&eb->refs); 4904 return eb; 4905 free_eb: 4906 btrfs_release_extent_buffer(eb); 4907 return exists; 4908 } 4909 #endif 4910 4911 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 4912 u64 start) 4913 { 4914 unsigned long len = fs_info->tree_root->nodesize; 4915 unsigned long num_pages = num_extent_pages(start, len); 4916 unsigned long i; 4917 unsigned long index = start >> PAGE_SHIFT; 4918 struct extent_buffer *eb; 4919 struct extent_buffer *exists = NULL; 4920 struct page *p; 4921 struct address_space *mapping = fs_info->btree_inode->i_mapping; 4922 int uptodate = 1; 4923 int ret; 4924 4925 if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) { 4926 btrfs_err(fs_info, "bad tree block start %llu", start); 4927 return ERR_PTR(-EINVAL); 4928 } 4929 4930 eb = find_extent_buffer(fs_info, start); 4931 if (eb) 4932 return eb; 4933 4934 eb = __alloc_extent_buffer(fs_info, start, len); 4935 if (!eb) 4936 return ERR_PTR(-ENOMEM); 4937 4938 for (i = 0; i < num_pages; i++, index++) { 4939 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); 4940 if (!p) { 4941 exists = ERR_PTR(-ENOMEM); 4942 goto free_eb; 4943 } 4944 4945 spin_lock(&mapping->private_lock); 4946 if (PagePrivate(p)) { 4947 /* 4948 * We could have already allocated an eb for this page 4949 * and attached one so lets see if we can get a ref on 4950 * the existing eb, and if we can we know it's good and 4951 * we can just return that one, else we know we can just 4952 * overwrite page->private. 4953 */ 4954 exists = (struct extent_buffer *)p->private; 4955 if (atomic_inc_not_zero(&exists->refs)) { 4956 spin_unlock(&mapping->private_lock); 4957 unlock_page(p); 4958 put_page(p); 4959 mark_extent_buffer_accessed(exists, p); 4960 goto free_eb; 4961 } 4962 exists = NULL; 4963 4964 /* 4965 * Do this so attach doesn't complain and we need to 4966 * drop the ref the old guy had. 4967 */ 4968 ClearPagePrivate(p); 4969 WARN_ON(PageDirty(p)); 4970 put_page(p); 4971 } 4972 attach_extent_buffer_page(eb, p); 4973 spin_unlock(&mapping->private_lock); 4974 WARN_ON(PageDirty(p)); 4975 eb->pages[i] = p; 4976 if (!PageUptodate(p)) 4977 uptodate = 0; 4978 4979 /* 4980 * see below about how we avoid a nasty race with release page 4981 * and why we unlock later 4982 */ 4983 } 4984 if (uptodate) 4985 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4986 again: 4987 ret = radix_tree_preload(GFP_NOFS); 4988 if (ret) { 4989 exists = ERR_PTR(ret); 4990 goto free_eb; 4991 } 4992 4993 spin_lock(&fs_info->buffer_lock); 4994 ret = radix_tree_insert(&fs_info->buffer_radix, 4995 start >> PAGE_SHIFT, eb); 4996 spin_unlock(&fs_info->buffer_lock); 4997 radix_tree_preload_end(); 4998 if (ret == -EEXIST) { 4999 exists = find_extent_buffer(fs_info, start); 5000 if (exists) 5001 goto free_eb; 5002 else 5003 goto again; 5004 } 5005 /* add one reference for the tree */ 5006 check_buffer_tree_ref(eb); 5007 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 5008 5009 /* 5010 * there is a race where release page may have 5011 * tried to find this extent buffer in the radix 5012 * but failed. It will tell the VM it is safe to 5013 * reclaim the, and it will clear the page private bit. 
5014 * We must make sure to set the page private bit properly 5015 * after the extent buffer is in the radix tree so 5016 * it doesn't get lost 5017 */ 5018 SetPageChecked(eb->pages[0]); 5019 for (i = 1; i < num_pages; i++) { 5020 p = eb->pages[i]; 5021 ClearPageChecked(p); 5022 unlock_page(p); 5023 } 5024 unlock_page(eb->pages[0]); 5025 return eb; 5026 5027 free_eb: 5028 WARN_ON(!atomic_dec_and_test(&eb->refs)); 5029 for (i = 0; i < num_pages; i++) { 5030 if (eb->pages[i]) 5031 unlock_page(eb->pages[i]); 5032 } 5033 5034 btrfs_release_extent_buffer(eb); 5035 return exists; 5036 } 5037 5038 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 5039 { 5040 struct extent_buffer *eb = 5041 container_of(head, struct extent_buffer, rcu_head); 5042 5043 __free_extent_buffer(eb); 5044 } 5045 5046 /* Expects to have eb->eb_lock already held */ 5047 static int release_extent_buffer(struct extent_buffer *eb) 5048 { 5049 WARN_ON(atomic_read(&eb->refs) == 0); 5050 if (atomic_dec_and_test(&eb->refs)) { 5051 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { 5052 struct btrfs_fs_info *fs_info = eb->fs_info; 5053 5054 spin_unlock(&eb->refs_lock); 5055 5056 spin_lock(&fs_info->buffer_lock); 5057 radix_tree_delete(&fs_info->buffer_radix, 5058 eb->start >> PAGE_SHIFT); 5059 spin_unlock(&fs_info->buffer_lock); 5060 } else { 5061 spin_unlock(&eb->refs_lock); 5062 } 5063 5064 /* Should be safe to release our pages at this point */ 5065 btrfs_release_extent_buffer_page(eb); 5066 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 5067 if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) { 5068 __free_extent_buffer(eb); 5069 return 1; 5070 } 5071 #endif 5072 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 5073 return 1; 5074 } 5075 spin_unlock(&eb->refs_lock); 5076 5077 return 0; 5078 } 5079 5080 void free_extent_buffer(struct extent_buffer *eb) 5081 { 5082 int refs; 5083 int old; 5084 if (!eb) 5085 return; 5086 5087 while (1) { 5088 refs = atomic_read(&eb->refs); 5089 if (refs <= 3) 5090 break; 5091 old = atomic_cmpxchg(&eb->refs, refs, refs - 1); 5092 if (old == refs) 5093 return; 5094 } 5095 5096 spin_lock(&eb->refs_lock); 5097 if (atomic_read(&eb->refs) == 2 && 5098 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) 5099 atomic_dec(&eb->refs); 5100 5101 if (atomic_read(&eb->refs) == 2 && 5102 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 5103 !extent_buffer_under_io(eb) && 5104 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5105 atomic_dec(&eb->refs); 5106 5107 /* 5108 * I know this is terrible, but it's temporary until we stop tracking 5109 * the uptodate bits and such for the extent buffers. 
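 * The conditional decrements above drop the extra reference held for
 * in-tree (or dummy) buffers once it is safe to do so, so that
 * release_extent_buffer() below can take the count to zero and free
 * the buffer.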
5110 */ 5111 release_extent_buffer(eb); 5112 } 5113 5114 void free_extent_buffer_stale(struct extent_buffer *eb) 5115 { 5116 if (!eb) 5117 return; 5118 5119 spin_lock(&eb->refs_lock); 5120 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 5121 5122 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 5123 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5124 atomic_dec(&eb->refs); 5125 release_extent_buffer(eb); 5126 } 5127 5128 void clear_extent_buffer_dirty(struct extent_buffer *eb) 5129 { 5130 unsigned long i; 5131 unsigned long num_pages; 5132 struct page *page; 5133 5134 num_pages = num_extent_pages(eb->start, eb->len); 5135 5136 for (i = 0; i < num_pages; i++) { 5137 page = eb->pages[i]; 5138 if (!PageDirty(page)) 5139 continue; 5140 5141 lock_page(page); 5142 WARN_ON(!PagePrivate(page)); 5143 5144 clear_page_dirty_for_io(page); 5145 spin_lock_irq(&page->mapping->tree_lock); 5146 if (!PageDirty(page)) { 5147 radix_tree_tag_clear(&page->mapping->page_tree, 5148 page_index(page), 5149 PAGECACHE_TAG_DIRTY); 5150 } 5151 spin_unlock_irq(&page->mapping->tree_lock); 5152 ClearPageError(page); 5153 unlock_page(page); 5154 } 5155 WARN_ON(atomic_read(&eb->refs) == 0); 5156 } 5157 5158 int set_extent_buffer_dirty(struct extent_buffer *eb) 5159 { 5160 unsigned long i; 5161 unsigned long num_pages; 5162 int was_dirty = 0; 5163 5164 check_buffer_tree_ref(eb); 5165 5166 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 5167 5168 num_pages = num_extent_pages(eb->start, eb->len); 5169 WARN_ON(atomic_read(&eb->refs) == 0); 5170 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 5171 5172 for (i = 0; i < num_pages; i++) 5173 set_page_dirty(eb->pages[i]); 5174 return was_dirty; 5175 } 5176 5177 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 5178 { 5179 unsigned long i; 5180 struct page *page; 5181 unsigned long num_pages; 5182 5183 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5184 num_pages = num_extent_pages(eb->start, eb->len); 5185 for (i = 0; i < num_pages; i++) { 5186 page = eb->pages[i]; 5187 if (page) 5188 ClearPageUptodate(page); 5189 } 5190 } 5191 5192 void set_extent_buffer_uptodate(struct extent_buffer *eb) 5193 { 5194 unsigned long i; 5195 struct page *page; 5196 unsigned long num_pages; 5197 5198 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5199 num_pages = num_extent_pages(eb->start, eb->len); 5200 for (i = 0; i < num_pages; i++) { 5201 page = eb->pages[i]; 5202 SetPageUptodate(page); 5203 } 5204 } 5205 5206 int extent_buffer_uptodate(struct extent_buffer *eb) 5207 { 5208 return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5209 } 5210 5211 int read_extent_buffer_pages(struct extent_io_tree *tree, 5212 struct extent_buffer *eb, int wait, 5213 get_extent_t *get_extent, int mirror_num) 5214 { 5215 unsigned long i; 5216 struct page *page; 5217 int err; 5218 int ret = 0; 5219 int locked_pages = 0; 5220 int all_uptodate = 1; 5221 unsigned long num_pages; 5222 unsigned long num_reads = 0; 5223 struct bio *bio = NULL; 5224 unsigned long bio_flags = 0; 5225 5226 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 5227 return 0; 5228 5229 num_pages = num_extent_pages(eb->start, eb->len); 5230 for (i = 0; i < num_pages; i++) { 5231 page = eb->pages[i]; 5232 if (wait == WAIT_NONE) { 5233 if (!trylock_page(page)) 5234 goto unlock_exit; 5235 } else { 5236 lock_page(page); 5237 } 5238 locked_pages++; 5239 } 5240 /* 5241 * We need to firstly lock all pages to make sure that 5242 * the uptodate bit of our pages won't be affected by 5243 * 
clear_extent_buffer_uptodate(). 5244 */ 5245 for (i = 0; i < num_pages; i++) { 5246 page = eb->pages[i]; 5247 if (!PageUptodate(page)) { 5248 num_reads++; 5249 all_uptodate = 0; 5250 } 5251 } 5252 5253 if (all_uptodate) { 5254 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5255 goto unlock_exit; 5256 } 5257 5258 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 5259 eb->read_mirror = 0; 5260 atomic_set(&eb->io_pages, num_reads); 5261 for (i = 0; i < num_pages; i++) { 5262 page = eb->pages[i]; 5263 5264 if (!PageUptodate(page)) { 5265 if (ret) { 5266 atomic_dec(&eb->io_pages); 5267 unlock_page(page); 5268 continue; 5269 } 5270 5271 ClearPageError(page); 5272 err = __extent_read_full_page(tree, page, 5273 get_extent, &bio, 5274 mirror_num, &bio_flags, 5275 REQ_META); 5276 if (err) { 5277 ret = err; 5278 /* 5279 * We use &bio in above __extent_read_full_page, 5280 * so we ensure that if it returns error, the 5281 * current page fails to add itself to bio and 5282 * it's been unlocked. 5283 * 5284 * We must dec io_pages by ourselves. 5285 */ 5286 atomic_dec(&eb->io_pages); 5287 } 5288 } else { 5289 unlock_page(page); 5290 } 5291 } 5292 5293 if (bio) { 5294 err = submit_one_bio(bio, mirror_num, bio_flags); 5295 if (err) 5296 return err; 5297 } 5298 5299 if (ret || wait != WAIT_COMPLETE) 5300 return ret; 5301 5302 for (i = 0; i < num_pages; i++) { 5303 page = eb->pages[i]; 5304 wait_on_page_locked(page); 5305 if (!PageUptodate(page)) 5306 ret = -EIO; 5307 } 5308 5309 return ret; 5310 5311 unlock_exit: 5312 while (locked_pages > 0) { 5313 locked_pages--; 5314 page = eb->pages[locked_pages]; 5315 unlock_page(page); 5316 } 5317 return ret; 5318 } 5319 5320 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 5321 unsigned long start, 5322 unsigned long len) 5323 { 5324 size_t cur; 5325 size_t offset; 5326 struct page *page; 5327 char *kaddr; 5328 char *dst = (char *)dstv; 5329 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5330 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5331 5332 WARN_ON(start > eb->len); 5333 WARN_ON(start + len > eb->start + eb->len); 5334 5335 offset = (start_offset + start) & (PAGE_SIZE - 1); 5336 5337 while (len > 0) { 5338 page = eb->pages[i]; 5339 5340 cur = min(len, (PAGE_SIZE - offset)); 5341 kaddr = page_address(page); 5342 memcpy(dst, kaddr + offset, cur); 5343 5344 dst += cur; 5345 len -= cur; 5346 offset = 0; 5347 i++; 5348 } 5349 } 5350 5351 int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, 5352 unsigned long start, 5353 unsigned long len) 5354 { 5355 size_t cur; 5356 size_t offset; 5357 struct page *page; 5358 char *kaddr; 5359 char __user *dst = (char __user *)dstv; 5360 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5361 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5362 int ret = 0; 5363 5364 WARN_ON(start > eb->len); 5365 WARN_ON(start + len > eb->start + eb->len); 5366 5367 offset = (start_offset + start) & (PAGE_SIZE - 1); 5368 5369 while (len > 0) { 5370 page = eb->pages[i]; 5371 5372 cur = min(len, (PAGE_SIZE - offset)); 5373 kaddr = page_address(page); 5374 if (copy_to_user(dst, kaddr + offset, cur)) { 5375 ret = -EFAULT; 5376 break; 5377 } 5378 5379 dst += cur; 5380 len -= cur; 5381 offset = 0; 5382 i++; 5383 } 5384 5385 return ret; 5386 } 5387 5388 /* 5389 * return 0 if the item is found within a page. 5390 * return 1 if the item spans two pages. 5391 * return -EINVAL otherwise. 
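 * On a 0 return, *map points at the requested bytes, *map_start is the
 * offset within the extent buffer at which the mapped page begins and
 * *map_len is the number of bytes usable through *map.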
5392 */ 5393 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 5394 unsigned long min_len, char **map, 5395 unsigned long *map_start, 5396 unsigned long *map_len) 5397 { 5398 size_t offset = start & (PAGE_SIZE - 1); 5399 char *kaddr; 5400 struct page *p; 5401 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5402 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5403 unsigned long end_i = (start_offset + start + min_len - 1) >> 5404 PAGE_SHIFT; 5405 5406 if (i != end_i) 5407 return 1; 5408 5409 if (i == 0) { 5410 offset = start_offset; 5411 *map_start = 0; 5412 } else { 5413 offset = 0; 5414 *map_start = ((u64)i << PAGE_SHIFT) - start_offset; 5415 } 5416 5417 if (start + min_len > eb->len) { 5418 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", 5419 eb->start, eb->len, start, min_len); 5420 return -EINVAL; 5421 } 5422 5423 p = eb->pages[i]; 5424 kaddr = page_address(p); 5425 *map = kaddr + offset; 5426 *map_len = PAGE_SIZE - offset; 5427 return 0; 5428 } 5429 5430 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 5431 unsigned long start, 5432 unsigned long len) 5433 { 5434 size_t cur; 5435 size_t offset; 5436 struct page *page; 5437 char *kaddr; 5438 char *ptr = (char *)ptrv; 5439 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5440 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5441 int ret = 0; 5442 5443 WARN_ON(start > eb->len); 5444 WARN_ON(start + len > eb->start + eb->len); 5445 5446 offset = (start_offset + start) & (PAGE_SIZE - 1); 5447 5448 while (len > 0) { 5449 page = eb->pages[i]; 5450 5451 cur = min(len, (PAGE_SIZE - offset)); 5452 5453 kaddr = page_address(page); 5454 ret = memcmp(ptr, kaddr + offset, cur); 5455 if (ret) 5456 break; 5457 5458 ptr += cur; 5459 len -= cur; 5460 offset = 0; 5461 i++; 5462 } 5463 return ret; 5464 } 5465 5466 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 5467 unsigned long start, unsigned long len) 5468 { 5469 size_t cur; 5470 size_t offset; 5471 struct page *page; 5472 char *kaddr; 5473 char *src = (char *)srcv; 5474 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5475 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5476 5477 WARN_ON(start > eb->len); 5478 WARN_ON(start + len > eb->start + eb->len); 5479 5480 offset = (start_offset + start) & (PAGE_SIZE - 1); 5481 5482 while (len > 0) { 5483 page = eb->pages[i]; 5484 WARN_ON(!PageUptodate(page)); 5485 5486 cur = min(len, PAGE_SIZE - offset); 5487 kaddr = page_address(page); 5488 memcpy(kaddr + offset, src, cur); 5489 5490 src += cur; 5491 len -= cur; 5492 offset = 0; 5493 i++; 5494 } 5495 } 5496 5497 void memset_extent_buffer(struct extent_buffer *eb, char c, 5498 unsigned long start, unsigned long len) 5499 { 5500 size_t cur; 5501 size_t offset; 5502 struct page *page; 5503 char *kaddr; 5504 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5505 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5506 5507 WARN_ON(start > eb->len); 5508 WARN_ON(start + len > eb->start + eb->len); 5509 5510 offset = (start_offset + start) & (PAGE_SIZE - 1); 5511 5512 while (len > 0) { 5513 page = eb->pages[i]; 5514 WARN_ON(!PageUptodate(page)); 5515 5516 cur = min(len, PAGE_SIZE - offset); 5517 kaddr = page_address(page); 5518 memset(kaddr + offset, c, cur); 5519 5520 len -= cur; 5521 offset = 0; 5522 i++; 5523 } 5524 } 5525 5526 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 5527 unsigned long dst_offset, unsigned long src_offset, 
5528 unsigned long len) 5529 { 5530 u64 dst_len = dst->len; 5531 size_t cur; 5532 size_t offset; 5533 struct page *page; 5534 char *kaddr; 5535 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5536 unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; 5537 5538 WARN_ON(src->len != dst_len); 5539 5540 offset = (start_offset + dst_offset) & 5541 (PAGE_SIZE - 1); 5542 5543 while (len > 0) { 5544 page = dst->pages[i]; 5545 WARN_ON(!PageUptodate(page)); 5546 5547 cur = min(len, (unsigned long)(PAGE_SIZE - offset)); 5548 5549 kaddr = page_address(page); 5550 read_extent_buffer(src, kaddr + offset, src_offset, cur); 5551 5552 src_offset += cur; 5553 len -= cur; 5554 offset = 0; 5555 i++; 5556 } 5557 } 5558 5559 void le_bitmap_set(u8 *map, unsigned int start, int len) 5560 { 5561 u8 *p = map + BIT_BYTE(start); 5562 const unsigned int size = start + len; 5563 int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); 5564 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); 5565 5566 while (len - bits_to_set >= 0) { 5567 *p |= mask_to_set; 5568 len -= bits_to_set; 5569 bits_to_set = BITS_PER_BYTE; 5570 mask_to_set = ~0; 5571 p++; 5572 } 5573 if (len) { 5574 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 5575 *p |= mask_to_set; 5576 } 5577 } 5578 5579 void le_bitmap_clear(u8 *map, unsigned int start, int len) 5580 { 5581 u8 *p = map + BIT_BYTE(start); 5582 const unsigned int size = start + len; 5583 int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE); 5584 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start); 5585 5586 while (len - bits_to_clear >= 0) { 5587 *p &= ~mask_to_clear; 5588 len -= bits_to_clear; 5589 bits_to_clear = BITS_PER_BYTE; 5590 mask_to_clear = ~0; 5591 p++; 5592 } 5593 if (len) { 5594 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 5595 *p &= ~mask_to_clear; 5596 } 5597 } 5598 5599 /* 5600 * eb_bitmap_offset() - calculate the page and offset of the byte containing the 5601 * given bit number 5602 * @eb: the extent buffer 5603 * @start: offset of the bitmap item in the extent buffer 5604 * @nr: bit number 5605 * @page_index: return index of the page in the extent buffer that contains the 5606 * given bit number 5607 * @page_offset: return offset into the page given by page_index 5608 * 5609 * This helper hides the ugliness of finding the byte in an extent buffer which 5610 * contains a given bit. 5611 */ 5612 static inline void eb_bitmap_offset(struct extent_buffer *eb, 5613 unsigned long start, unsigned long nr, 5614 unsigned long *page_index, 5615 size_t *page_offset) 5616 { 5617 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5618 size_t byte_offset = BIT_BYTE(nr); 5619 size_t offset; 5620 5621 /* 5622 * The byte we want is the offset of the extent buffer + the offset of 5623 * the bitmap item in the extent buffer + the offset of the byte in the 5624 * bitmap item. 
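 * As a worked example (assuming 4k pages and an eb that begins on a page
 * boundary, so start_offset == 0): start == 100 and nr == 70 give
 * byte_offset == 8, hence offset == 108, *page_index == 0 and
 * *page_offset == 108.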
5625 */ 5626 offset = start_offset + start + byte_offset; 5627 5628 *page_index = offset >> PAGE_SHIFT; 5629 *page_offset = offset & (PAGE_SIZE - 1); 5630 } 5631 5632 /** 5633 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set 5634 * @eb: the extent buffer 5635 * @start: offset of the bitmap item in the extent buffer 5636 * @nr: bit number to test 5637 */ 5638 int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, 5639 unsigned long nr) 5640 { 5641 u8 *kaddr; 5642 struct page *page; 5643 unsigned long i; 5644 size_t offset; 5645 5646 eb_bitmap_offset(eb, start, nr, &i, &offset); 5647 page = eb->pages[i]; 5648 WARN_ON(!PageUptodate(page)); 5649 kaddr = page_address(page); 5650 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 5651 } 5652 5653 /** 5654 * extent_buffer_bitmap_set - set an area of a bitmap 5655 * @eb: the extent buffer 5656 * @start: offset of the bitmap item in the extent buffer 5657 * @pos: bit number of the first bit 5658 * @len: number of bits to set 5659 */ 5660 void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, 5661 unsigned long pos, unsigned long len) 5662 { 5663 u8 *kaddr; 5664 struct page *page; 5665 unsigned long i; 5666 size_t offset; 5667 const unsigned int size = pos + len; 5668 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5669 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); 5670 5671 eb_bitmap_offset(eb, start, pos, &i, &offset); 5672 page = eb->pages[i]; 5673 WARN_ON(!PageUptodate(page)); 5674 kaddr = page_address(page); 5675 5676 while (len >= bits_to_set) { 5677 kaddr[offset] |= mask_to_set; 5678 len -= bits_to_set; 5679 bits_to_set = BITS_PER_BYTE; 5680 mask_to_set = ~0; 5681 if (++offset >= PAGE_SIZE && len > 0) { 5682 offset = 0; 5683 page = eb->pages[++i]; 5684 WARN_ON(!PageUptodate(page)); 5685 kaddr = page_address(page); 5686 } 5687 } 5688 if (len) { 5689 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 5690 kaddr[offset] |= mask_to_set; 5691 } 5692 } 5693 5694 5695 /** 5696 * extent_buffer_bitmap_clear - clear an area of a bitmap 5697 * @eb: the extent buffer 5698 * @start: offset of the bitmap item in the extent buffer 5699 * @pos: bit number of the first bit 5700 * @len: number of bits to clear 5701 */ 5702 void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, 5703 unsigned long pos, unsigned long len) 5704 { 5705 u8 *kaddr; 5706 struct page *page; 5707 unsigned long i; 5708 size_t offset; 5709 const unsigned int size = pos + len; 5710 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5711 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); 5712 5713 eb_bitmap_offset(eb, start, pos, &i, &offset); 5714 page = eb->pages[i]; 5715 WARN_ON(!PageUptodate(page)); 5716 kaddr = page_address(page); 5717 5718 while (len >= bits_to_clear) { 5719 kaddr[offset] &= ~mask_to_clear; 5720 len -= bits_to_clear; 5721 bits_to_clear = BITS_PER_BYTE; 5722 mask_to_clear = ~0; 5723 if (++offset >= PAGE_SIZE && len > 0) { 5724 offset = 0; 5725 page = eb->pages[++i]; 5726 WARN_ON(!PageUptodate(page)); 5727 kaddr = page_address(page); 5728 } 5729 } 5730 if (len) { 5731 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 5732 kaddr[offset] &= ~mask_to_clear; 5733 } 5734 } 5735 5736 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 5737 { 5738 unsigned long distance = (src > dst) ? 
src - dst : dst - src; 5739 return distance < len; 5740 } 5741 5742 static void copy_pages(struct page *dst_page, struct page *src_page, 5743 unsigned long dst_off, unsigned long src_off, 5744 unsigned long len) 5745 { 5746 char *dst_kaddr = page_address(dst_page); 5747 char *src_kaddr; 5748 int must_memmove = 0; 5749 5750 if (dst_page != src_page) { 5751 src_kaddr = page_address(src_page); 5752 } else { 5753 src_kaddr = dst_kaddr; 5754 if (areas_overlap(src_off, dst_off, len)) 5755 must_memmove = 1; 5756 } 5757 5758 if (must_memmove) 5759 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); 5760 else 5761 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 5762 } 5763 5764 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5765 unsigned long src_offset, unsigned long len) 5766 { 5767 size_t cur; 5768 size_t dst_off_in_page; 5769 size_t src_off_in_page; 5770 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5771 unsigned long dst_i; 5772 unsigned long src_i; 5773 5774 if (src_offset + len > dst->len) { 5775 btrfs_err(dst->fs_info, 5776 "memmove bogus src_offset %lu move len %lu dst len %lu", 5777 src_offset, len, dst->len); 5778 BUG_ON(1); 5779 } 5780 if (dst_offset + len > dst->len) { 5781 btrfs_err(dst->fs_info, 5782 "memmove bogus dst_offset %lu move len %lu dst len %lu", 5783 dst_offset, len, dst->len); 5784 BUG_ON(1); 5785 } 5786 5787 while (len > 0) { 5788 dst_off_in_page = (start_offset + dst_offset) & 5789 (PAGE_SIZE - 1); 5790 src_off_in_page = (start_offset + src_offset) & 5791 (PAGE_SIZE - 1); 5792 5793 dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; 5794 src_i = (start_offset + src_offset) >> PAGE_SHIFT; 5795 5796 cur = min(len, (unsigned long)(PAGE_SIZE - 5797 src_off_in_page)); 5798 cur = min_t(unsigned long, cur, 5799 (unsigned long)(PAGE_SIZE - dst_off_in_page)); 5800 5801 copy_pages(dst->pages[dst_i], dst->pages[src_i], 5802 dst_off_in_page, src_off_in_page, cur); 5803 5804 src_offset += cur; 5805 dst_offset += cur; 5806 len -= cur; 5807 } 5808 } 5809 5810 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5811 unsigned long src_offset, unsigned long len) 5812 { 5813 size_t cur; 5814 size_t dst_off_in_page; 5815 size_t src_off_in_page; 5816 unsigned long dst_end = dst_offset + len - 1; 5817 unsigned long src_end = src_offset + len - 1; 5818 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5819 unsigned long dst_i; 5820 unsigned long src_i; 5821 5822 if (src_offset + len > dst->len) { 5823 btrfs_err(dst->fs_info, 5824 "memmove bogus src_offset %lu move len %lu len %lu", 5825 src_offset, len, dst->len); 5826 BUG_ON(1); 5827 } 5828 if (dst_offset + len > dst->len) { 5829 btrfs_err(dst->fs_info, 5830 "memmove bogus dst_offset %lu move len %lu len %lu", 5831 dst_offset, len, dst->len); 5832 BUG_ON(1); 5833 } 5834 if (dst_offset < src_offset) { 5835 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 5836 return; 5837 } 5838 while (len > 0) { 5839 dst_i = (start_offset + dst_end) >> PAGE_SHIFT; 5840 src_i = (start_offset + src_end) >> PAGE_SHIFT; 5841 5842 dst_off_in_page = (start_offset + dst_end) & 5843 (PAGE_SIZE - 1); 5844 src_off_in_page = (start_offset + src_end) & 5845 (PAGE_SIZE - 1); 5846 5847 cur = min_t(unsigned long, len, src_off_in_page + 1); 5848 cur = min(cur, dst_off_in_page + 1); 5849 copy_pages(dst->pages[dst_i], dst->pages[src_i], 5850 dst_off_in_page - cur + 1, 5851 src_off_in_page - cur + 1, cur); 5852 5853 dst_end -= cur; 5854 src_end -= cur; 5855 len -= cur; 5856 } 5857 } 
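/*
 * A releasepage helper for pages that carry an extent buffer: returns 1
 * if the attached buffer could be dropped (or no buffer was attached at
 * all), 0 if the buffer still has users or is dirty/under IO.
 */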
5858 5859 int try_release_extent_buffer(struct page *page) 5860 { 5861 struct extent_buffer *eb; 5862 5863 /* 5864 * We need to make sure nobody is attaching this page to an eb right 5865 * now. 5866 */ 5867 spin_lock(&page->mapping->private_lock); 5868 if (!PagePrivate(page)) { 5869 spin_unlock(&page->mapping->private_lock); 5870 return 1; 5871 } 5872 5873 eb = (struct extent_buffer *)page->private; 5874 BUG_ON(!eb); 5875 5876 /* 5877 * This is a little awful but should be ok, we need to make sure that 5878 * the eb doesn't disappear out from under us while we're looking at 5879 * this page. 5880 */ 5881 spin_lock(&eb->refs_lock); 5882 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 5883 spin_unlock(&eb->refs_lock); 5884 spin_unlock(&page->mapping->private_lock); 5885 return 0; 5886 } 5887 spin_unlock(&page->mapping->private_lock); 5888 5889 /* 5890 * If tree ref isn't set then we know the ref on this eb is a real ref, 5891 * so just return, this page will likely be freed soon anyway. 5892 */ 5893 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 5894 spin_unlock(&eb->refs_lock); 5895 return 0; 5896 } 5897 5898 return release_extent_buffer(eb); 5899 } 5900