#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "transaction.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set *btrfs_bioset;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);

static DEFINE_SPINLOCK(leak_lock);

static inline
void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_del(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_check(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		pr_err("BTRFS: buffer leak start %llu len %lu refs %d\n",
		       eb->start, eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode;
	u64 isize;

	if (!tree->mapping)
		return;

	inode = tree->mapping->host;
	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
			"%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(new, head)	do {} while (0)
#define btrfs_leak_debug_del(entry)	do {} while (0)
#define btrfs_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

#define BUFFER_LRU_MAX 64

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};
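
/*
 * Overview (summary only, no new behaviour): each tree_entry / extent_state
 * node covers an inclusive byte range [start, end].  The rb-tree search
 * helpers below compare a byte offset against those bounds, so a lookup
 * lands either on the entry containing the offset or on its in-order
 * neighbours, which is what the prev/next out-parameters report.
 */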
struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	get_extent_t *get_extent;
	unsigned long bio_flags;

	/* tells writepage not to lock the state bits for this
	 * range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};

static void add_extent_changeset(struct extent_state *state, unsigned bits,
				 struct extent_changeset *changeset,
				 int set)
{
	int ret;

	if (!changeset)
		return;
	if (set && (state->state & bits) == bits)
		return;
	if (!set && (state->state & bits) == 0)
		return;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	/* ENOMEM */
	BUG_ON(ret < 0);
}

static noinline void flush_write_bio(void *data);

static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
	if (!tree->mapping)
		return NULL;
	return btrfs_sb(tree->mapping->host->i_sb);
}

int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;

	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
				     offsetof(struct btrfs_io_bio, bio));
	if (!btrfs_bioset)
		goto free_buffer_cache;

	if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_free(btrfs_bioset);
	btrfs_bioset = NULL;

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;

free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	extent_state_cache = NULL;
	return -ENOMEM;
}
void extent_io_exit(void)
{
	btrfs_leak_debug_check();

	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_state_cache);
	kmem_cache_destroy(extent_buffer_cache);
	if (btrfs_bioset)
		bioset_free(btrfs_bioset);
}

void extent_io_tree_init(struct extent_io_tree *tree,
			 struct address_space *mapping)
{
	tree->state = RB_ROOT;
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	tree->mapping = mapping;
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might not be appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->failrec = NULL;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add(&state->leak_list, &states);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del(&state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}
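
/*
 * Reference counting sketch (illustrative only, mirrors the two helpers
 * above): alloc_extent_state() hands back a state with refs == 1; anything
 * that stashes a pointer to it (for example the cached_state arguments used
 * further down) takes an extra reference, and every holder eventually drops
 * its reference through free_extent_state(), which frees the struct once the
 * count hits zero:
 *
 *	struct extent_state *state = alloc_extent_state(GFP_NOFS);
 *
 *	if (state)
 *		free_extent_state(state);	/* drops the last ref, frees */
 */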
static struct rb_node *tree_insert(struct rb_root *root,
				   struct rb_node *search_start,
				   u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	p = search_start ? &search_start : &root->rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

do_insert:
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **prev_ret,
				      struct rb_node **next_ret,
				      struct rb_node ***p_ret,
				      struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **n = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (*n) {
		prev = *n;
		entry = rb_entry(prev, struct tree_entry, rb_node);
		prev_entry = entry;

		if (offset < entry->start)
			n = &(*n)->rb_left;
		else if (offset > entry->end)
			n = &(*n)->rb_right;
		else
			return *n;
	}

	if (p_ret)
		*p_ret = n;
	if (parent_ret)
		*parent_ret = prev;

	if (prev_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *
tree_search_for_insert(struct extent_io_tree *tree,
		       u64 offset,
		       struct rb_node ***p_ret,
		       struct rb_node **parent_ret)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
	if (!ret)
		return prev;
	return ret;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}

static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (tree->ops && tree->ops->merge_extent_hook)
		tree->ops->merge_extent_hook(tree->mapping->host, new,
					     other);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
			struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->end = other->end;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
}

static void set_state_cb(struct extent_io_tree *tree,
			 struct extent_state *state, unsigned *bits)
{
	if (tree->ops && tree->ops->set_bit_hook)
		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
}

static void clear_state_cb(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned *bits)
{
	if (tree->ops && tree->ops->clear_bit_hook)
		tree->ops->clear_bit_hook(BTRFS_I(tree->mapping->host),
				state, bits);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned *bits,
			   struct extent_changeset *changeset);

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			struct rb_node ***p,
			struct rb_node **parent,
			unsigned *bits, struct extent_changeset *changeset)
{
	struct rb_node *node;

	if (end < start)
		WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
		       end, start);
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits, changeset);

	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
	if (node) {
		struct extent_state *found;

		found = rb_entry(node, struct extent_state, rb_node);
		pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n",
		       found->start, found->end, start, end);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}

static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
		     u64 split)
{
	if (tree->ops && tree->ops->split_extent_hook)
		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
}
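
/*
 * Worked example (illustrative numbers only): two neighbouring states
 * [0, 4095] and [4096, 8191] carrying exactly the same bits are collapsed by
 * merge_state() into a single [0, 8191] entry; split_state() below performs
 * the inverse operation when only part of an existing range needs to change.
 */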
/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	split_cb(tree, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}

static struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);

	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    unsigned *bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;

		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}
	clear_state_cb(tree, state, bits);
	add_extent_changeset(state, bits_to_clear, changeset, 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree_fs_info(tree), err,
		    "Locking error: Extent tree was modified by another thread while locked.");
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			      unsigned bits, int wake, int delete,
			      struct extent_state **cached_state,
			      gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	if (delete)
		bits |= ~EXTENT_CTLBITS;
	bits |= EXTENT_FIRST_DELALLOC;

	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake,
						changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

}

static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}
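
/*
 * Usage sketch for the clearing path above (parameters illustrative only):
 * dropping EXTENT_DELALLOC from a byte range, waking any waiters and
 * releasing a cached state reference picked up earlier would look like
 *
 *	clear_extent_bit(tree, start, end, EXTENT_DELALLOC, 1, 0,
 *			 &cached_state, GFP_NOFS);
 *
 * (clear_extent_bit() is the public wrapper defined later in this file.)
 */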
/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    unsigned long bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
process_node:
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (!cond_resched_lock(&tree->lock)) {
			node = rb_next(node);
			goto process_node;
		}
	}
out:
	spin_unlock(&tree->lock);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   unsigned *bits, struct extent_changeset *changeset)
{
	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;

	set_state_cb(tree, state, bits);
	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;

		tree->dirty_bytes += range;
	}
	add_extent_changeset(state, bits_to_set, changeset, 1);
	state->state |= bits_to_set;
}

static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_IOBITS | EXTENT_BOUNDARY);
}

/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive.  This takes the tree lock.
 */

static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		 unsigned bits, unsigned exclusive_bits,
		 u64 *failed_start, struct extent_state **cached_state,
		 gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);

	bits |= EXTENT_FIRST_DELALLOC;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, changeset);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;

		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, changeset);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

}

int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   unsigned bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask)
{
	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
				cached_state, mask, NULL);
}
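
/*
 * Usage sketch (illustrative only): marking a byte range dirty in an io tree
 * while caching the resulting state for a follow-up operation:
 *
 *	err = set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
 *			     &cached_state, GFP_NOFS);
 */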

/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 * 			another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 *
 * All allocations are done with GFP_NOFS.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       unsigned bits, unsigned clear_bits,
		       struct extent_state **cached_state)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;
	bool first_iteration = true;

	btrfs_debug_check_extent_io_range(tree, start, end);

again:
	if (!prealloc) {
		/*
		 * Best effort, don't worry if extent state allocation fails
		 * here for the first iteration. We might have a cached state
		 * that matches exactly the target range, in which case no
		 * extent state allocations are needed. We'll only know this
		 * after locking the tree.
		 */
		prealloc = alloc_extent_state(GFP_NOFS);
		if (!prealloc && !first_iteration)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, &bits, NULL);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, NULL);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, &clear_bits, 0,
						NULL);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;

		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, NULL);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	cond_resched();
	first_iteration = false;
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}

/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   unsigned bits, struct extent_changeset *changeset)
{
	/*
	 * We don't support EXTENT_LOCKED yet, as current changeset will
	 * record any bits changed, so for EXTENT_LOCKED case, it will
	 * either fail with -EEXIST or changeset will record the whole
	 * range.
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
				changeset);
}

int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     unsigned bits, int wake, int delete,
		     struct extent_state **cached, gfp_t mask)
{
	return __clear_extent_bit(tree, start, end, bits, wake, delete,
				  cached, mask, NULL);
}

int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			     unsigned bits, struct extent_changeset *changeset)
{
	/*
	 * Don't support EXTENT_LOCKED case, same reason as
	 * set_record_extent_bits().
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
				  changeset);
}
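
/*
 * Changeset sketch (illustrative only): a caller that needs to know how many
 * bytes really changed state passes an extent_changeset; only bytes whose
 * bits actually flipped are accounted by add_extent_changeset():
 *
 *	err = set_record_extent_bits(tree, start, end, bits, changeset);
 *	// changeset->bytes_changed and ->range_changed describe the delta
 */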
1332 */ 1333 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1334 struct extent_state **cached_state) 1335 { 1336 int err; 1337 u64 failed_start; 1338 1339 while (1) { 1340 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, 1341 EXTENT_LOCKED, &failed_start, 1342 cached_state, GFP_NOFS, NULL); 1343 if (err == -EEXIST) { 1344 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1345 start = failed_start; 1346 } else 1347 break; 1348 WARN_ON(start > end); 1349 } 1350 return err; 1351 } 1352 1353 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1354 { 1355 int err; 1356 u64 failed_start; 1357 1358 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1359 &failed_start, NULL, GFP_NOFS, NULL); 1360 if (err == -EEXIST) { 1361 if (failed_start > start) 1362 clear_extent_bit(tree, start, failed_start - 1, 1363 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 1364 return 0; 1365 } 1366 return 1; 1367 } 1368 1369 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) 1370 { 1371 unsigned long index = start >> PAGE_SHIFT; 1372 unsigned long end_index = end >> PAGE_SHIFT; 1373 struct page *page; 1374 1375 while (index <= end_index) { 1376 page = find_get_page(inode->i_mapping, index); 1377 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1378 clear_page_dirty_for_io(page); 1379 put_page(page); 1380 index++; 1381 } 1382 } 1383 1384 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) 1385 { 1386 unsigned long index = start >> PAGE_SHIFT; 1387 unsigned long end_index = end >> PAGE_SHIFT; 1388 struct page *page; 1389 1390 while (index <= end_index) { 1391 page = find_get_page(inode->i_mapping, index); 1392 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1393 __set_page_dirty_nobuffers(page); 1394 account_page_redirty(page); 1395 put_page(page); 1396 index++; 1397 } 1398 } 1399 1400 /* 1401 * helper function to set both pages and extents in the tree writeback 1402 */ 1403 static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1404 { 1405 unsigned long index = start >> PAGE_SHIFT; 1406 unsigned long end_index = end >> PAGE_SHIFT; 1407 struct page *page; 1408 1409 while (index <= end_index) { 1410 page = find_get_page(tree->mapping, index); 1411 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1412 set_page_writeback(page); 1413 put_page(page); 1414 index++; 1415 } 1416 } 1417 1418 /* find the first state struct with 'bits' set after 'start', and 1419 * return it. tree->lock must be held. NULL will returned if 1420 * nothing was found after 'start' 1421 */ 1422 static struct extent_state * 1423 find_first_extent_bit_state(struct extent_io_tree *tree, 1424 u64 start, unsigned bits) 1425 { 1426 struct rb_node *node; 1427 struct extent_state *state; 1428 1429 /* 1430 * this search will find all the extents that end after 1431 * our range starts. 1432 */ 1433 node = tree_search(tree, start); 1434 if (!node) 1435 goto out; 1436 1437 while (1) { 1438 state = rb_entry(node, struct extent_state, rb_node); 1439 if (state->end >= start && (state->state & bits)) 1440 return state; 1441 1442 node = rb_next(node); 1443 if (!node) 1444 break; 1445 } 1446 out: 1447 return NULL; 1448 } 1449 1450 /* 1451 * find the first offset in the io tree with 'bits' set. zero is 1452 * returned if we find something, and *start_ret and *end_ret are 1453 * set to reflect the state struct that was found. 1454 * 1455 * If nothing was found, 1 is returned. 
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, unsigned bits,
			  struct extent_state **cached_state)
{
	struct extent_state *state;
	struct rb_node *n;
	int ret = 1;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && extent_state_in_tree(state)) {
			n = rb_next(&state->rb_node);
			while (n) {
				state = rb_entry(n, struct extent_state,
						 rb_node);
				if (state->state & bits)
					goto got_it;
				n = rb_next(n);
			}
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state_if_flags(state, cached_state, 0);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
					u64 *start, u64 *end, u64 max_bytes,
					struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 found = 0;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node) {
		if (!found)
			*end = (u64)-1;
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			*start = state->start;
			*cached_state = state;
			refcount_inc(&state->refs);
		}
		found++;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return found;
}

static int __process_pages_contig(struct address_space *mapping,
				  struct page *locked_page,
				  pgoff_t start_index, pgoff_t end_index,
				  unsigned long page_ops, pgoff_t *index_ret);

static noinline void __unlock_for_delalloc(struct inode *inode,
					   struct page *locked_page,
					   u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;

	ASSERT(locked_page);
	if (index == locked_page->index && end_index == index)
		return;

	__process_pages_contig(inode->i_mapping, locked_page, index, end_index,
			       PAGE_UNLOCK, NULL);
}
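
/*
 * Worked example for the delalloc search above (numbers illustrative only):
 * with a single delalloc state covering [4096, 65535] and *start == 8192,
 * find_delalloc_range() reports found > 0 with *start == 4096 and
 * *end == 65535; find_lock_delalloc_range() below then clamps the start back
 * to the caller's offset so that pages before locked_page are never touched.
 */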
static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 delalloc_start,
					u64 delalloc_end)
{
	unsigned long index = delalloc_start >> PAGE_SHIFT;
	unsigned long index_ret = index;
	unsigned long end_index = delalloc_end >> PAGE_SHIFT;
	int ret;

	ASSERT(locked_page);
	if (index == locked_page->index && index == end_index)
		return 0;

	ret = __process_pages_contig(inode->i_mapping, locked_page, index,
				     end_index, PAGE_LOCK, &index_ret);
	if (ret == -EAGAIN)
		__unlock_for_delalloc(inode, locked_page, delalloc_start,
				      (u64)index_ret << PAGE_SHIFT);
	return ret;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
STATIC u64 find_lock_delalloc_range(struct inode *inode,
				    struct extent_io_tree *tree,
				    struct page *locked_page, u64 *start,
				    u64 *end, u64 max_bytes)
{
	u64 delalloc_start;
	u64 delalloc_end;
	u64 found;
	struct extent_state *cached_state = NULL;
	int ret;
	int loops = 0;

again:
	/* step one, find a bunch of delalloc bytes starting at start */
	delalloc_start = *start;
	delalloc_end = 0;
	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
				    max_bytes, &cached_state);
	if (!found || delalloc_end <= *start) {
		*start = delalloc_start;
		*end = delalloc_end;
		free_extent_state(cached_state);
		return 0;
	}

	/*
	 * start comes from the offset of locked_page.  We have to lock
	 * pages in order, so we can't process delalloc bytes before
	 * locked_page
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of pages we try to lock down
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes)
		delalloc_end = delalloc_start + max_bytes - 1;

	/* step two, lock all the pages after the page that has start */
	ret = lock_delalloc_pages(inode, locked_page,
				  delalloc_start, delalloc_end);
	if (ret == -EAGAIN) {
		/* some of the pages are gone, lets avoid looping by
		 * shortening the size of the delalloc range we're searching
		 */
		free_extent_state(cached_state);
		cached_state = NULL;
		if (!loops) {
			max_bytes = PAGE_SIZE;
			loops = 1;
			goto again;
		} else {
			found = 0;
			goto out_failed;
		}
	}
	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */

	/* step three, lock the state bits for the whole range */
	lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);

	/* then test to make sure it is all still delalloc */
	ret = test_range_bit(tree, delalloc_start, delalloc_end,
			     EXTENT_DELALLOC, 1, cached_state);
	if (!ret) {
		unlock_extent_cached(tree, delalloc_start, delalloc_end,
				     &cached_state, GFP_NOFS);
		__unlock_for_delalloc(inode, locked_page,
				      delalloc_start, delalloc_end);
		cond_resched();
		goto again;
	}
	free_extent_state(cached_state);
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}

static int __process_pages_contig(struct address_space *mapping,
				  struct page *locked_page,
				  pgoff_t start_index, pgoff_t end_index,
				  unsigned long page_ops, pgoff_t *index_ret)
{
	unsigned long nr_pages = end_index - start_index + 1;
	unsigned long pages_locked = 0;
	pgoff_t index = start_index;
	struct page *pages[16];
	unsigned ret;
	int err = 0;
	int i;

	if (page_ops & PAGE_LOCK) {
		ASSERT(page_ops == PAGE_LOCK);
		ASSERT(index_ret && *index_ret == start_index);
	}

	if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
		mapping_set_error(mapping, -EIO);

	while (nr_pages > 0) {
		ret = find_get_pages_contig(mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			/*
			 * Only if we're going to lock these pages,
			 * can we find nothing at @index.
			 */
			ASSERT(page_ops & PAGE_LOCK);
			err = -EAGAIN;
			goto out;
		}

		for (i = 0; i < ret; i++) {
			if (page_ops & PAGE_SET_PRIVATE2)
				SetPagePrivate2(pages[i]);

			if (pages[i] == locked_page) {
				put_page(pages[i]);
				pages_locked++;
				continue;
			}
			if (page_ops & PAGE_CLEAR_DIRTY)
				clear_page_dirty_for_io(pages[i]);
			if (page_ops & PAGE_SET_WRITEBACK)
				set_page_writeback(pages[i]);
			if (page_ops & PAGE_SET_ERROR)
				SetPageError(pages[i]);
			if (page_ops & PAGE_END_WRITEBACK)
				end_page_writeback(pages[i]);
			if (page_ops & PAGE_UNLOCK)
				unlock_page(pages[i]);
			if (page_ops & PAGE_LOCK) {
				lock_page(pages[i]);
				if (!PageDirty(pages[i]) ||
				    pages[i]->mapping != mapping) {
					unlock_page(pages[i]);
					put_page(pages[i]);
					err = -EAGAIN;
					goto out;
				}
			}
			put_page(pages[i]);
			pages_locked++;
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
out:
	if (err && index_ret)
		*index_ret = start_index + pages_locked - 1;
	return err;
}

void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
				  u64 delalloc_end, struct page *locked_page,
				  unsigned clear_bits,
				  unsigned long page_ops)
{
	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
			 NULL, GFP_NOFS);

	__process_pages_contig(inode->i_mapping, locked_page,
			       start >> PAGE_SHIFT, end >> PAGE_SHIFT,
			       page_ops, NULL);
}
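
/*
 * page_ops sketch (illustrative combination, not taken from this file): a
 * typical error-path cleanup clears the delalloc state and finishes the
 * pages in one call, e.g.
 *
 *	extent_clear_unlock_delalloc(inode, start, end, end, locked_page,
 *				     EXTENT_LOCKED | EXTENT_DELALLOC,
 *				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
 *				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
 *				     PAGE_SET_ERROR);
 */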

/*
 * count the number of bytes in the tree that have a given bit(s)
 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
 * cached.  The total number found is returned.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     unsigned bits, int contig)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	u64 last = 0;
	int found = 0;

	if (WARN_ON(search_end <= cur_start))
		return 0;

	spin_lock(&tree->lock);
	if (cur_start == 0 && bits == EXTENT_DIRTY) {
		total_bytes = tree->dirty_bytes;
		goto out;
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > search_end)
			break;
		if (contig && found && state->start > last + 1)
			break;
		if (state->end >= cur_start && (state->state & bits) == bits) {
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = max(cur_start, state->start);
				found = 1;
			}
			last = state->end;
		} else if (contig && found) {
			break;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return total_bytes;
}

/*
 * set the private field for a given byte offset in the tree.  If there isn't
 * an extent_state there already, this does nothing.
 */
static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start,
				      struct io_failure_record *failrec)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	state->failrec = failrec;
out:
	spin_unlock(&tree->lock);
	return ret;
}

static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start,
				      struct io_failure_record **failrec)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	*failrec = state->failrec;
out:
	spin_unlock(&tree->lock);
	return ret;
}

/*
 * searches a range in the state tree for a given mask.
 * If 'filled' == 1, this returns 1 only if every extent in the tree
 * has the bits set.  Otherwise, 1 is returned if any bit in the
 * range is found set.
 */
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   unsigned bits, int filled, struct extent_state *cached)
{
	struct extent_state *state = NULL;
	struct rb_node *node;
	int bitset = 0;

	spin_lock(&tree->lock);
	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
	    cached->end > start)
		node = &cached->rb_node;
	else
		node = tree_search(tree, start);
	while (node && start <= end) {
		state = rb_entry(node, struct extent_state, rb_node);

		if (filled && state->start > start) {
			bitset = 0;
			break;
		}

		if (state->start > end)
			break;

		if (state->state & bits) {
			bitset = 1;
			if (!filled)
				break;
		} else if (filled) {
			bitset = 0;
			break;
		}

		if (state->end == (u64)-1)
			break;

		start = state->end + 1;
		if (start > end)
			break;
		node = rb_next(node);
		if (!node) {
			if (filled)
				bitset = 0;
			break;
		}
	}
	spin_unlock(&tree->lock);
	return bitset;
}

/*
 * helper function to set a given page up to date if all the
 * extents in the tree for that page are up to date
 */
static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
{
	u64 start = page_offset(page);
	u64 end = start + PAGE_SIZE - 1;

	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
		SetPageUptodate(page);
}

int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec)
{
	int ret;
	int err = 0;
	struct extent_io_tree *failure_tree = &inode->io_failure_tree;

	set_state_failrec(failure_tree, rec->start, NULL);
	ret = clear_extent_bits(failure_tree, rec->start,
				rec->start + rec->len - 1,
				EXTENT_LOCKED | EXTENT_DIRTY);
	if (ret)
		err = ret;

	ret = clear_extent_bits(&inode->io_tree, rec->start,
				rec->start + rec->len - 1,
				EXTENT_DAMAGED);
	if (ret && !err)
		err = ret;

	kfree(rec);
	return err;
}
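
/*
 * Quick illustration of test_range_bit() above: with filled == 1 it answers
 * "does every byte in [start, end] have the bit set?", with filled == 0 it
 * answers "does any byte have it?".  Checking that a whole page is still
 * delalloc, for example (parameters illustrative):
 *
 *	if (test_range_bit(tree, page_start, page_start + PAGE_SIZE - 1,
 *			   EXTENT_DELALLOC, 1, cached_state))
 *		...
 */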

/*
 * this bypasses the standard btrfs submit functions deliberately, as
 * the standard behavior is to write all copies in a raid setup. here we only
 * want to write the one bad copy. so we do the mapping for ourselves and issue
 * submit_bio directly.
 * to avoid any synchronization issues, wait for the data after writing, which
 * actually prevents the read that triggered the error from finishing.
 * currently, there can be no more than two copies of every data bit. thus,
 * exactly one rewrite is required.
 */
int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
		      u64 logical, struct page *page,
		      unsigned int pg_offset, int mirror_num)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio *bio;
	struct btrfs_device *dev;
	u64 map_length = 0;
	u64 sector;
	struct btrfs_bio *bbio = NULL;
	int ret;

	ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
	BUG_ON(!mirror_num);

	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
	if (!bio)
		return -EIO;
	bio->bi_iter.bi_size = 0;
	map_length = length;

	/*
	 * Avoid races with device replace and make sure our bbio has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) {
		/*
		 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
		 * to update all raid stripes, but here we just want to correct
		 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
		 * stripe's dev and sector.
		 */
		ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
				      &map_length, &bbio, 0);
		if (ret) {
			btrfs_bio_counter_dec(fs_info);
			bio_put(bio);
			return -EIO;
		}
		ASSERT(bbio->mirror_num == 1);
	} else {
		ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
				      &map_length, &bbio, mirror_num);
		if (ret) {
			btrfs_bio_counter_dec(fs_info);
			bio_put(bio);
			return -EIO;
		}
		BUG_ON(mirror_num != bbio->mirror_num);
	}

	sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
	bio->bi_iter.bi_sector = sector;
	dev = bbio->stripes[bbio->mirror_num - 1].dev;
	btrfs_put_bbio(bbio);
	if (!dev || !dev->bdev || !dev->writeable) {
		btrfs_bio_counter_dec(fs_info);
		bio_put(bio);
		return -EIO;
	}
	bio->bi_bdev = dev->bdev;
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
	bio_add_page(bio, page, length, pg_offset);

	if (btrfsic_submit_bio_wait(bio)) {
		/* try to remap that extent elsewhere? */
		btrfs_bio_counter_dec(fs_info);
		bio_put(bio);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
		return -EIO;
	}

	btrfs_info_rl_in_rcu(fs_info,
		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
				  btrfs_ino(inode), start,
				  rcu_str_deref(dev->name), sector);
	btrfs_bio_counter_dec(fs_info);
	bio_put(bio);
	return 0;
}

int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int mirror_num)
{
	u64 start = eb->start;
	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
	int ret = 0;

	if (fs_info->sb->s_flags & MS_RDONLY)
		return -EROFS;

	for (i = 0; i < num_pages; i++) {
		struct page *p = eb->pages[i];

		ret = repair_io_failure(BTRFS_I(fs_info->btree_inode), start,
					PAGE_SIZE, start, p,
					start - page_offset(p), mirror_num);
		if (ret)
			break;
		start += PAGE_SIZE;
	}

	return ret;
}

/*
 * each time an IO finishes, we do a fast check in the IO failure tree
 * to see if we need to process or clean up an io_failure_record
 */
int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page,
		     unsigned int pg_offset)
{
	u64 private;
	struct io_failure_record *failrec;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_state *state;
	int num_copies;
	int ret;

	private = 0;
	ret = count_range_bits(&inode->io_failure_tree, &private,
			       (u64)-1, 1, EXTENT_DIRTY, 0);
	if (!ret)
		return 0;

	ret = get_state_failrec(&inode->io_failure_tree, start,
				&failrec);
	if (ret)
		return 0;

	BUG_ON(!failrec->this_mirror);

	if (failrec->in_validation) {
		/* there was no real error, just free the record */
		btrfs_debug(fs_info,
			"clean_io_failure: freeing dummy error at %llu",
			failrec->start);
		goto out;
	}
	if (fs_info->sb->s_flags & MS_RDONLY)
		goto out;

	spin_lock(&inode->io_tree.lock);
find_first_extent_bit_state(&inode->io_tree, 2143 failrec->start, 2144 EXTENT_LOCKED); 2145 spin_unlock(&inode->io_tree.lock); 2146 2147 if (state && state->start <= failrec->start && 2148 state->end >= failrec->start + failrec->len - 1) { 2149 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2150 failrec->len); 2151 if (num_copies > 1) { 2152 repair_io_failure(inode, start, failrec->len, 2153 failrec->logical, page, 2154 pg_offset, failrec->failed_mirror); 2155 } 2156 } 2157 2158 out: 2159 free_io_failure(inode, failrec); 2160 2161 return 0; 2162 } 2163 2164 /* 2165 * Can be called when 2166 * - hold extent lock 2167 * - under ordered extent 2168 * - the inode is freeing 2169 */ 2170 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) 2171 { 2172 struct extent_io_tree *failure_tree = &inode->io_failure_tree; 2173 struct io_failure_record *failrec; 2174 struct extent_state *state, *next; 2175 2176 if (RB_EMPTY_ROOT(&failure_tree->state)) 2177 return; 2178 2179 spin_lock(&failure_tree->lock); 2180 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); 2181 while (state) { 2182 if (state->start > end) 2183 break; 2184 2185 ASSERT(state->end <= end); 2186 2187 next = next_state(state); 2188 2189 failrec = state->failrec; 2190 free_extent_state(state); 2191 kfree(failrec); 2192 2193 state = next; 2194 } 2195 spin_unlock(&failure_tree->lock); 2196 } 2197 2198 int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, 2199 struct io_failure_record **failrec_ret) 2200 { 2201 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2202 struct io_failure_record *failrec; 2203 struct extent_map *em; 2204 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2205 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2206 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2207 int ret; 2208 u64 logical; 2209 2210 ret = get_state_failrec(failure_tree, start, &failrec); 2211 if (ret) { 2212 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2213 if (!failrec) 2214 return -ENOMEM; 2215 2216 failrec->start = start; 2217 failrec->len = end - start + 1; 2218 failrec->this_mirror = 0; 2219 failrec->bio_flags = 0; 2220 failrec->in_validation = 0; 2221 2222 read_lock(&em_tree->lock); 2223 em = lookup_extent_mapping(em_tree, start, failrec->len); 2224 if (!em) { 2225 read_unlock(&em_tree->lock); 2226 kfree(failrec); 2227 return -EIO; 2228 } 2229 2230 if (em->start > start || em->start + em->len <= start) { 2231 free_extent_map(em); 2232 em = NULL; 2233 } 2234 read_unlock(&em_tree->lock); 2235 if (!em) { 2236 kfree(failrec); 2237 return -EIO; 2238 } 2239 2240 logical = start - em->start; 2241 logical = em->block_start + logical; 2242 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2243 logical = em->block_start; 2244 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2245 extent_set_compress_type(&failrec->bio_flags, 2246 em->compress_type); 2247 } 2248 2249 btrfs_debug(fs_info, 2250 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", 2251 logical, start, failrec->len); 2252 2253 failrec->logical = logical; 2254 free_extent_map(em); 2255 2256 /* set the bits in the private failure tree */ 2257 ret = set_extent_bits(failure_tree, start, end, 2258 EXTENT_LOCKED | EXTENT_DIRTY); 2259 if (ret >= 0) 2260 ret = set_state_failrec(failure_tree, start, failrec); 2261 /* set the bits in the inode's tree */ 2262 if (ret >= 0) 2263 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); 2264 if (ret < 0) { 2265 
kfree(failrec); 2266 return ret; 2267 } 2268 } else { 2269 btrfs_debug(fs_info, 2270 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", 2271 failrec->logical, failrec->start, failrec->len, 2272 failrec->in_validation); 2273 /* 2274 * when data can be on disk more than twice, add to failrec here 2275 * (e.g. with a list for failed_mirror) to make 2276 * clean_io_failure() clean all those errors at once. 2277 */ 2278 } 2279 2280 *failrec_ret = failrec; 2281 2282 return 0; 2283 } 2284 2285 int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, 2286 struct io_failure_record *failrec, int failed_mirror) 2287 { 2288 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2289 int num_copies; 2290 2291 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); 2292 if (num_copies == 1) { 2293 /* 2294 * we only have a single copy of the data, so don't bother with 2295 * all the retry and error correction code that follows. no 2296 * matter what the error is, it is very likely to persist. 2297 */ 2298 btrfs_debug(fs_info, 2299 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", 2300 num_copies, failrec->this_mirror, failed_mirror); 2301 return 0; 2302 } 2303 2304 /* 2305 * there are two premises: 2306 * a) deliver good data to the caller 2307 * b) correct the bad sectors on disk 2308 */ 2309 if (failed_bio->bi_vcnt > 1) { 2310 /* 2311 * to fulfill b), we need to know the exact failing sectors, as 2312 * we don't want to rewrite any more than the failed ones. thus, 2313 * we need separate read requests for the failed bio 2314 * 2315 * if the following BUG_ON triggers, our validation request got 2316 * merged. we need separate requests for our algorithm to work. 2317 */ 2318 BUG_ON(failrec->in_validation); 2319 failrec->in_validation = 1; 2320 failrec->this_mirror = failed_mirror; 2321 } else { 2322 /* 2323 * we're ready to fulfill a) and b) alongside. get a good copy 2324 * of the failed sector and if we succeed, we have setup 2325 * everything for repair_io_failure to do the rest for us. 
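		 *
		 * For example, on the first repair attempt with num_copies == 2
		 * and failed_mirror == 1, this_mirror starts at 0, is bumped to
		 * 1, matches the failed mirror and is bumped again to 2, so the
		 * retry is sent to the remaining good copy.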
2326 */ 2327 if (failrec->in_validation) { 2328 BUG_ON(failrec->this_mirror != failed_mirror); 2329 failrec->in_validation = 0; 2330 failrec->this_mirror = 0; 2331 } 2332 failrec->failed_mirror = failed_mirror; 2333 failrec->this_mirror++; 2334 if (failrec->this_mirror == failed_mirror) 2335 failrec->this_mirror++; 2336 } 2337 2338 if (failrec->this_mirror > num_copies) { 2339 btrfs_debug(fs_info, 2340 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", 2341 num_copies, failrec->this_mirror, failed_mirror); 2342 return 0; 2343 } 2344 2345 return 1; 2346 } 2347 2348 2349 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, 2350 struct io_failure_record *failrec, 2351 struct page *page, int pg_offset, int icsum, 2352 bio_end_io_t *endio_func, void *data) 2353 { 2354 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2355 struct bio *bio; 2356 struct btrfs_io_bio *btrfs_failed_bio; 2357 struct btrfs_io_bio *btrfs_bio; 2358 2359 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2360 if (!bio) 2361 return NULL; 2362 2363 bio->bi_end_io = endio_func; 2364 bio->bi_iter.bi_sector = failrec->logical >> 9; 2365 bio->bi_bdev = fs_info->fs_devices->latest_bdev; 2366 bio->bi_iter.bi_size = 0; 2367 bio->bi_private = data; 2368 2369 btrfs_failed_bio = btrfs_io_bio(failed_bio); 2370 if (btrfs_failed_bio->csum) { 2371 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 2372 2373 btrfs_bio = btrfs_io_bio(bio); 2374 btrfs_bio->csum = btrfs_bio->csum_inline; 2375 icsum *= csum_size; 2376 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum, 2377 csum_size); 2378 } 2379 2380 bio_add_page(bio, page, failrec->len, pg_offset); 2381 2382 return bio; 2383 } 2384 2385 /* 2386 * this is a generic handler for readpage errors (default 2387 * readpage_io_failed_hook). if other copies exist, read those and write back 2388 * good data to the failed position. 
does not investigate in remapping the 2389 * failed extent elsewhere, hoping the device will be smart enough to do this as 2390 * needed 2391 */ 2392 2393 static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, 2394 struct page *page, u64 start, u64 end, 2395 int failed_mirror) 2396 { 2397 struct io_failure_record *failrec; 2398 struct inode *inode = page->mapping->host; 2399 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2400 struct bio *bio; 2401 int read_mode = 0; 2402 int ret; 2403 2404 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2405 2406 ret = btrfs_get_io_failure_record(inode, start, end, &failrec); 2407 if (ret) 2408 return ret; 2409 2410 ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror); 2411 if (!ret) { 2412 free_io_failure(BTRFS_I(inode), failrec); 2413 return -EIO; 2414 } 2415 2416 if (failed_bio->bi_vcnt > 1) 2417 read_mode |= REQ_FAILFAST_DEV; 2418 2419 phy_offset >>= inode->i_sb->s_blocksize_bits; 2420 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 2421 start - page_offset(page), 2422 (int)phy_offset, failed_bio->bi_end_io, 2423 NULL); 2424 if (!bio) { 2425 free_io_failure(BTRFS_I(inode), failrec); 2426 return -EIO; 2427 } 2428 bio_set_op_attrs(bio, REQ_OP_READ, read_mode); 2429 2430 btrfs_debug(btrfs_sb(inode->i_sb), 2431 "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", 2432 read_mode, failrec->this_mirror, failrec->in_validation); 2433 2434 ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, 2435 failrec->bio_flags, 0); 2436 if (ret) { 2437 free_io_failure(BTRFS_I(inode), failrec); 2438 bio_put(bio); 2439 } 2440 2441 return ret; 2442 } 2443 2444 /* lots and lots of room for performance fixes in the end_bio funcs */ 2445 2446 void end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2447 { 2448 int uptodate = (err == 0); 2449 struct extent_io_tree *tree; 2450 int ret = 0; 2451 2452 tree = &BTRFS_I(page->mapping->host)->io_tree; 2453 2454 if (tree->ops && tree->ops->writepage_end_io_hook) 2455 tree->ops->writepage_end_io_hook(page, start, end, NULL, 2456 uptodate); 2457 2458 if (!uptodate) { 2459 ClearPageUptodate(page); 2460 SetPageError(page); 2461 ret = ret < 0 ? ret : -EIO; 2462 mapping_set_error(page->mapping, ret); 2463 } 2464 } 2465 2466 /* 2467 * after a writepage IO is done, we need to: 2468 * clear the uptodate bits on error 2469 * clear the writeback bits in the extent tree for this IO 2470 * end_page_writeback if the page has no more pending IO 2471 * 2472 * Scheduling is not allowed, so the extent state tree is expected 2473 * to have one and only one object corresponding to this IO. 2474 */ 2475 static void end_bio_extent_writepage(struct bio *bio) 2476 { 2477 struct bio_vec *bvec; 2478 u64 start; 2479 u64 end; 2480 int i; 2481 2482 bio_for_each_segment_all(bvec, bio, i) { 2483 struct page *page = bvec->bv_page; 2484 struct inode *inode = page->mapping->host; 2485 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2486 2487 /* We always issue full-page reads, but if some block 2488 * in a page fails to read, blk_update_request() will 2489 * advance bv_offset and adjust bv_len to compensate. 2490 * Print a warning for nonzero offsets, and an error 2491 * if they don't add up to a full page. 
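		 *
		 * For example, on a 4K page a bvec with offset 0 and length
		 * 2048 is reported as a partial page write, while offset 2048
		 * with length 2048 still adds up to PAGE_SIZE and only logs
		 * the informational "incomplete page write" message.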
*/ 2492 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2493 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2494 btrfs_err(fs_info, 2495 "partial page write in btrfs with offset %u and length %u", 2496 bvec->bv_offset, bvec->bv_len); 2497 else 2498 btrfs_info(fs_info, 2499 "incomplete page write in btrfs with offset %u and length %u", 2500 bvec->bv_offset, bvec->bv_len); 2501 } 2502 2503 start = page_offset(page); 2504 end = start + bvec->bv_offset + bvec->bv_len - 1; 2505 2506 end_extent_writepage(page, bio->bi_error, start, end); 2507 end_page_writeback(page); 2508 } 2509 2510 bio_put(bio); 2511 } 2512 2513 static void 2514 endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, 2515 int uptodate) 2516 { 2517 struct extent_state *cached = NULL; 2518 u64 end = start + len - 1; 2519 2520 if (uptodate && tree->track_uptodate) 2521 set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); 2522 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2523 } 2524 2525 /* 2526 * after a readpage IO is done, we need to: 2527 * clear the uptodate bits on error 2528 * set the uptodate bits if things worked 2529 * set the page up to date if all extents in the tree are uptodate 2530 * clear the lock bit in the extent tree 2531 * unlock the page if there are no other extents locked for it 2532 * 2533 * Scheduling is not allowed, so the extent state tree is expected 2534 * to have one and only one object corresponding to this IO. 2535 */ 2536 static void end_bio_extent_readpage(struct bio *bio) 2537 { 2538 struct bio_vec *bvec; 2539 int uptodate = !bio->bi_error; 2540 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2541 struct extent_io_tree *tree; 2542 u64 offset = 0; 2543 u64 start; 2544 u64 end; 2545 u64 len; 2546 u64 extent_start = 0; 2547 u64 extent_len = 0; 2548 int mirror; 2549 int ret; 2550 int i; 2551 2552 bio_for_each_segment_all(bvec, bio, i) { 2553 struct page *page = bvec->bv_page; 2554 struct inode *inode = page->mapping->host; 2555 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2556 2557 btrfs_debug(fs_info, 2558 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", 2559 (u64)bio->bi_iter.bi_sector, bio->bi_error, 2560 io_bio->mirror_num); 2561 tree = &BTRFS_I(inode)->io_tree; 2562 2563 /* We always issue full-page reads, but if some block 2564 * in a page fails to read, blk_update_request() will 2565 * advance bv_offset and adjust bv_len to compensate. 2566 * Print a warning for nonzero offsets, and an error 2567 * if they don't add up to a full page. 
*/ 2568 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2569 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2570 btrfs_err(fs_info, 2571 "partial page read in btrfs with offset %u and length %u", 2572 bvec->bv_offset, bvec->bv_len); 2573 else 2574 btrfs_info(fs_info, 2575 "incomplete page read in btrfs with offset %u and length %u", 2576 bvec->bv_offset, bvec->bv_len); 2577 } 2578 2579 start = page_offset(page); 2580 end = start + bvec->bv_offset + bvec->bv_len - 1; 2581 len = bvec->bv_len; 2582 2583 mirror = io_bio->mirror_num; 2584 if (likely(uptodate && tree->ops)) { 2585 ret = tree->ops->readpage_end_io_hook(io_bio, offset, 2586 page, start, end, 2587 mirror); 2588 if (ret) 2589 uptodate = 0; 2590 else 2591 clean_io_failure(BTRFS_I(inode), start, 2592 page, 0); 2593 } 2594 2595 if (likely(uptodate)) 2596 goto readpage_ok; 2597 2598 if (tree->ops) { 2599 ret = tree->ops->readpage_io_failed_hook(page, mirror); 2600 if (ret == -EAGAIN) { 2601 /* 2602 * Data inode's readpage_io_failed_hook() always 2603 * returns -EAGAIN. 2604 * 2605 * The generic bio_readpage_error handles errors 2606 * the following way: If possible, new read 2607 * requests are created and submitted and will 2608 * end up in end_bio_extent_readpage as well (if 2609 * we're lucky, not in the !uptodate case). In 2610 * that case it returns 0 and we just go on with 2611 * the next page in our bio. If it can't handle 2612 * the error it will return -EIO and we remain 2613 * responsible for that page. 2614 */ 2615 ret = bio_readpage_error(bio, offset, page, 2616 start, end, mirror); 2617 if (ret == 0) { 2618 uptodate = !bio->bi_error; 2619 offset += len; 2620 continue; 2621 } 2622 } 2623 2624 /* 2625 * metadata's readpage_io_failed_hook() always returns 2626 * -EIO and fixes nothing. -EIO is also returned if 2627 * data inode error could not be fixed. 2628 */ 2629 ASSERT(ret == -EIO); 2630 } 2631 readpage_ok: 2632 if (likely(uptodate)) { 2633 loff_t i_size = i_size_read(inode); 2634 pgoff_t end_index = i_size >> PAGE_SHIFT; 2635 unsigned off; 2636 2637 /* Zero out the end if this page straddles i_size */ 2638 off = i_size & (PAGE_SIZE-1); 2639 if (page->index == end_index && off) 2640 zero_user_segment(page, off, PAGE_SIZE); 2641 SetPageUptodate(page); 2642 } else { 2643 ClearPageUptodate(page); 2644 SetPageError(page); 2645 } 2646 unlock_page(page); 2647 offset += len; 2648 2649 if (unlikely(!uptodate)) { 2650 if (extent_len) { 2651 endio_readpage_release_extent(tree, 2652 extent_start, 2653 extent_len, 1); 2654 extent_start = 0; 2655 extent_len = 0; 2656 } 2657 endio_readpage_release_extent(tree, start, 2658 end - start + 1, 0); 2659 } else if (!extent_len) { 2660 extent_start = start; 2661 extent_len = end + 1 - start; 2662 } else if (extent_start + extent_len == start) { 2663 extent_len += end + 1 - start; 2664 } else { 2665 endio_readpage_release_extent(tree, extent_start, 2666 extent_len, uptodate); 2667 extent_start = start; 2668 extent_len = end + 1 - start; 2669 } 2670 } 2671 2672 if (extent_len) 2673 endio_readpage_release_extent(tree, extent_start, extent_len, 2674 uptodate); 2675 if (io_bio->end_io) 2676 io_bio->end_io(io_bio, bio->bi_error); 2677 bio_put(bio); 2678 } 2679 2680 /* 2681 * this allocates from the btrfs_bioset. 
We're returning a bio right now 2682 * but you can call btrfs_io_bio for the appropriate container_of magic 2683 */ 2684 struct bio * 2685 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 2686 gfp_t gfp_flags) 2687 { 2688 struct btrfs_io_bio *btrfs_bio; 2689 struct bio *bio; 2690 2691 bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); 2692 2693 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 2694 while (!bio && (nr_vecs /= 2)) { 2695 bio = bio_alloc_bioset(gfp_flags, 2696 nr_vecs, btrfs_bioset); 2697 } 2698 } 2699 2700 if (bio) { 2701 bio->bi_bdev = bdev; 2702 bio->bi_iter.bi_sector = first_sector; 2703 btrfs_bio = btrfs_io_bio(bio); 2704 btrfs_bio->csum = NULL; 2705 btrfs_bio->csum_allocated = NULL; 2706 btrfs_bio->end_io = NULL; 2707 } 2708 return bio; 2709 } 2710 2711 struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) 2712 { 2713 struct btrfs_io_bio *btrfs_bio; 2714 struct bio *new; 2715 2716 new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); 2717 if (new) { 2718 btrfs_bio = btrfs_io_bio(new); 2719 btrfs_bio->csum = NULL; 2720 btrfs_bio->csum_allocated = NULL; 2721 btrfs_bio->end_io = NULL; 2722 } 2723 return new; 2724 } 2725 2726 /* this also allocates from the btrfs_bioset */ 2727 struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) 2728 { 2729 struct btrfs_io_bio *btrfs_bio; 2730 struct bio *bio; 2731 2732 bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); 2733 if (bio) { 2734 btrfs_bio = btrfs_io_bio(bio); 2735 btrfs_bio->csum = NULL; 2736 btrfs_bio->csum_allocated = NULL; 2737 btrfs_bio->end_io = NULL; 2738 } 2739 return bio; 2740 } 2741 2742 2743 static int __must_check submit_one_bio(struct bio *bio, int mirror_num, 2744 unsigned long bio_flags) 2745 { 2746 int ret = 0; 2747 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2748 struct page *page = bvec->bv_page; 2749 struct extent_io_tree *tree = bio->bi_private; 2750 u64 start; 2751 2752 start = page_offset(page) + bvec->bv_offset; 2753 2754 bio->bi_private = NULL; 2755 bio_get(bio); 2756 2757 if (tree->ops) 2758 ret = tree->ops->submit_bio_hook(page->mapping->host, bio, 2759 mirror_num, bio_flags, start); 2760 else 2761 btrfsic_submit_bio(bio); 2762 2763 bio_put(bio); 2764 return ret; 2765 } 2766 2767 static int merge_bio(struct extent_io_tree *tree, struct page *page, 2768 unsigned long offset, size_t size, struct bio *bio, 2769 unsigned long bio_flags) 2770 { 2771 int ret = 0; 2772 if (tree->ops) 2773 ret = tree->ops->merge_bio_hook(page, offset, size, bio, 2774 bio_flags); 2775 return ret; 2776 2777 } 2778 2779 static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, 2780 struct writeback_control *wbc, 2781 struct page *page, sector_t sector, 2782 size_t size, unsigned long offset, 2783 struct block_device *bdev, 2784 struct bio **bio_ret, 2785 bio_end_io_t end_io_func, 2786 int mirror_num, 2787 unsigned long prev_bio_flags, 2788 unsigned long bio_flags, 2789 bool force_bio_submit) 2790 { 2791 int ret = 0; 2792 struct bio *bio; 2793 int contig = 0; 2794 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 2795 size_t page_size = min_t(size_t, size, PAGE_SIZE); 2796 2797 if (bio_ret && *bio_ret) { 2798 bio = *bio_ret; 2799 if (old_compressed) 2800 contig = bio->bi_iter.bi_sector == sector; 2801 else 2802 contig = bio_end_sector(bio) == sector; 2803 2804 if (prev_bio_flags != bio_flags || !contig || 2805 force_bio_submit || 2806 merge_bio(tree, page, offset, page_size, bio, bio_flags) || 2807 bio_add_page(bio, 
page, page_size, offset) < page_size) { 2808 ret = submit_one_bio(bio, mirror_num, prev_bio_flags); 2809 if (ret < 0) { 2810 *bio_ret = NULL; 2811 return ret; 2812 } 2813 bio = NULL; 2814 } else { 2815 if (wbc) 2816 wbc_account_io(wbc, page, page_size); 2817 return 0; 2818 } 2819 } 2820 2821 bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES, 2822 GFP_NOFS | __GFP_HIGH); 2823 if (!bio) 2824 return -ENOMEM; 2825 2826 bio_add_page(bio, page, page_size, offset); 2827 bio->bi_end_io = end_io_func; 2828 bio->bi_private = tree; 2829 bio_set_op_attrs(bio, op, op_flags); 2830 if (wbc) { 2831 wbc_init_bio(wbc, bio); 2832 wbc_account_io(wbc, page, page_size); 2833 } 2834 2835 if (bio_ret) 2836 *bio_ret = bio; 2837 else 2838 ret = submit_one_bio(bio, mirror_num, bio_flags); 2839 2840 return ret; 2841 } 2842 2843 static void attach_extent_buffer_page(struct extent_buffer *eb, 2844 struct page *page) 2845 { 2846 if (!PagePrivate(page)) { 2847 SetPagePrivate(page); 2848 get_page(page); 2849 set_page_private(page, (unsigned long)eb); 2850 } else { 2851 WARN_ON(page->private != (unsigned long)eb); 2852 } 2853 } 2854 2855 void set_page_extent_mapped(struct page *page) 2856 { 2857 if (!PagePrivate(page)) { 2858 SetPagePrivate(page); 2859 get_page(page); 2860 set_page_private(page, EXTENT_PAGE_PRIVATE); 2861 } 2862 } 2863 2864 static struct extent_map * 2865 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, 2866 u64 start, u64 len, get_extent_t *get_extent, 2867 struct extent_map **em_cached) 2868 { 2869 struct extent_map *em; 2870 2871 if (em_cached && *em_cached) { 2872 em = *em_cached; 2873 if (extent_map_in_tree(em) && start >= em->start && 2874 start < extent_map_end(em)) { 2875 refcount_inc(&em->refs); 2876 return em; 2877 } 2878 2879 free_extent_map(em); 2880 *em_cached = NULL; 2881 } 2882 2883 em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); 2884 if (em_cached && !IS_ERR_OR_NULL(em)) { 2885 BUG_ON(*em_cached); 2886 refcount_inc(&em->refs); 2887 *em_cached = em; 2888 } 2889 return em; 2890 } 2891 /* 2892 * basic readpage implementation. 
Locked extent state structs are inserted 2893 * into the tree that are removed when the IO is done (by the end_io 2894 * handlers) 2895 * XXX JDM: This needs looking at to ensure proper page locking 2896 * return 0 on success, otherwise return error 2897 */ 2898 static int __do_readpage(struct extent_io_tree *tree, 2899 struct page *page, 2900 get_extent_t *get_extent, 2901 struct extent_map **em_cached, 2902 struct bio **bio, int mirror_num, 2903 unsigned long *bio_flags, int read_flags, 2904 u64 *prev_em_start) 2905 { 2906 struct inode *inode = page->mapping->host; 2907 u64 start = page_offset(page); 2908 u64 page_end = start + PAGE_SIZE - 1; 2909 u64 end; 2910 u64 cur = start; 2911 u64 extent_offset; 2912 u64 last_byte = i_size_read(inode); 2913 u64 block_start; 2914 u64 cur_end; 2915 sector_t sector; 2916 struct extent_map *em; 2917 struct block_device *bdev; 2918 int ret = 0; 2919 int nr = 0; 2920 size_t pg_offset = 0; 2921 size_t iosize; 2922 size_t disk_io_size; 2923 size_t blocksize = inode->i_sb->s_blocksize; 2924 unsigned long this_bio_flag = 0; 2925 2926 set_page_extent_mapped(page); 2927 2928 end = page_end; 2929 if (!PageUptodate(page)) { 2930 if (cleancache_get_page(page) == 0) { 2931 BUG_ON(blocksize != PAGE_SIZE); 2932 unlock_extent(tree, start, end); 2933 goto out; 2934 } 2935 } 2936 2937 if (page->index == last_byte >> PAGE_SHIFT) { 2938 char *userpage; 2939 size_t zero_offset = last_byte & (PAGE_SIZE - 1); 2940 2941 if (zero_offset) { 2942 iosize = PAGE_SIZE - zero_offset; 2943 userpage = kmap_atomic(page); 2944 memset(userpage + zero_offset, 0, iosize); 2945 flush_dcache_page(page); 2946 kunmap_atomic(userpage); 2947 } 2948 } 2949 while (cur <= end) { 2950 bool force_bio_submit = false; 2951 2952 if (cur >= last_byte) { 2953 char *userpage; 2954 struct extent_state *cached = NULL; 2955 2956 iosize = PAGE_SIZE - pg_offset; 2957 userpage = kmap_atomic(page); 2958 memset(userpage + pg_offset, 0, iosize); 2959 flush_dcache_page(page); 2960 kunmap_atomic(userpage); 2961 set_extent_uptodate(tree, cur, cur + iosize - 1, 2962 &cached, GFP_NOFS); 2963 unlock_extent_cached(tree, cur, 2964 cur + iosize - 1, 2965 &cached, GFP_NOFS); 2966 break; 2967 } 2968 em = __get_extent_map(inode, page, pg_offset, cur, 2969 end - cur + 1, get_extent, em_cached); 2970 if (IS_ERR_OR_NULL(em)) { 2971 SetPageError(page); 2972 unlock_extent(tree, cur, end); 2973 break; 2974 } 2975 extent_offset = cur - em->start; 2976 BUG_ON(extent_map_end(em) <= cur); 2977 BUG_ON(end < cur); 2978 2979 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2980 this_bio_flag |= EXTENT_BIO_COMPRESSED; 2981 extent_set_compress_type(&this_bio_flag, 2982 em->compress_type); 2983 } 2984 2985 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2986 cur_end = min(extent_map_end(em) - 1, end); 2987 iosize = ALIGN(iosize, blocksize); 2988 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2989 disk_io_size = em->block_len; 2990 sector = em->block_start >> 9; 2991 } else { 2992 sector = (em->block_start + extent_offset) >> 9; 2993 disk_io_size = iosize; 2994 } 2995 bdev = em->bdev; 2996 block_start = em->block_start; 2997 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2998 block_start = EXTENT_MAP_HOLE; 2999 3000 /* 3001 * If we have a file range that points to a compressed extent 3002 * and it's followed by a consecutive file range that points to 3003 * to the same compressed extent (possibly with a different 3004 * offset and/or length, so it either points to the whole extent 3005 * or only part of it), we must make sure we do 
not submit a
		 * single bio to populate the pages for the 2 ranges because
		 * this makes the compressed extent read zero out the pages
		 * belonging to the 2nd range. Imagine the following scenario:
		 *
		 *  File layout
		 *  [0 - 8K]                     [8K - 24K]
		 *    |                               |
		 *    |                               |
		 * points to extent X,         points to extent X,
		 * offset 4K, length of 8K     offset 0, length 16K
		 *
		 * [extent X, compressed length = 4K uncompressed length = 16K]
		 *
		 * If the bio to read the compressed extent covers both ranges,
		 * it will decompress extent X into the pages belonging to the
		 * first range and then it will stop, zeroing out the remaining
		 * pages that belong to the other range that points to extent X.
		 * So here we make sure we submit 2 bios, one for the first
		 * range and another one for the 2nd range. Both will target
		 * the same physical extent from disk, but we can't currently
		 * make the compressed bio endio callback populate the pages
		 * for both ranges because each compressed bio is tightly
		 * coupled with a single extent map, and each range can have
		 * an extent map with a different offset value relative to the
		 * uncompressed data of our extent and different lengths. This
		 * is a corner case so we prioritize correctness over
		 * non-optimal behavior (submitting 2 bios for the same extent).
		 */
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
		    prev_em_start && *prev_em_start != (u64)-1 &&
		    *prev_em_start != em->orig_start)
			force_bio_submit = true;

		if (prev_em_start)
			*prev_em_start = em->orig_start;

		free_extent_map(em);
		em = NULL;

		/* we've found a hole, just zero and go on */
		if (block_start == EXTENT_MAP_HOLE) {
			char *userpage;
			struct extent_state *cached = NULL;

			userpage = kmap_atomic(page);
			memset(userpage + pg_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage);

			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    &cached, GFP_NOFS);
			unlock_extent_cached(tree, cur,
					     cur + iosize - 1,
					     &cached, GFP_NOFS);
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}
		/* the get_extent function already copied into the page */
		if (test_range_bit(tree, cur, cur_end,
				   EXTENT_UPTODATE, 1, NULL)) {
			check_page_uptodate(tree, page);
			unlock_extent(tree, cur, cur + iosize - 1);
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}
		/* we have an inline extent but it didn't get marked up
		 * to date.
Error out 3075 */ 3076 if (block_start == EXTENT_MAP_INLINE) { 3077 SetPageError(page); 3078 unlock_extent(tree, cur, cur + iosize - 1); 3079 cur = cur + iosize; 3080 pg_offset += iosize; 3081 continue; 3082 } 3083 3084 ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL, 3085 page, sector, disk_io_size, pg_offset, 3086 bdev, bio, 3087 end_bio_extent_readpage, mirror_num, 3088 *bio_flags, 3089 this_bio_flag, 3090 force_bio_submit); 3091 if (!ret) { 3092 nr++; 3093 *bio_flags = this_bio_flag; 3094 } else { 3095 SetPageError(page); 3096 unlock_extent(tree, cur, cur + iosize - 1); 3097 goto out; 3098 } 3099 cur = cur + iosize; 3100 pg_offset += iosize; 3101 } 3102 out: 3103 if (!nr) { 3104 if (!PageError(page)) 3105 SetPageUptodate(page); 3106 unlock_page(page); 3107 } 3108 return ret; 3109 } 3110 3111 static inline void __do_contiguous_readpages(struct extent_io_tree *tree, 3112 struct page *pages[], int nr_pages, 3113 u64 start, u64 end, 3114 get_extent_t *get_extent, 3115 struct extent_map **em_cached, 3116 struct bio **bio, int mirror_num, 3117 unsigned long *bio_flags, 3118 u64 *prev_em_start) 3119 { 3120 struct inode *inode; 3121 struct btrfs_ordered_extent *ordered; 3122 int index; 3123 3124 inode = pages[0]->mapping->host; 3125 while (1) { 3126 lock_extent(tree, start, end); 3127 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, 3128 end - start + 1); 3129 if (!ordered) 3130 break; 3131 unlock_extent(tree, start, end); 3132 btrfs_start_ordered_extent(inode, ordered, 1); 3133 btrfs_put_ordered_extent(ordered); 3134 } 3135 3136 for (index = 0; index < nr_pages; index++) { 3137 __do_readpage(tree, pages[index], get_extent, em_cached, bio, 3138 mirror_num, bio_flags, 0, prev_em_start); 3139 put_page(pages[index]); 3140 } 3141 } 3142 3143 static void __extent_readpages(struct extent_io_tree *tree, 3144 struct page *pages[], 3145 int nr_pages, get_extent_t *get_extent, 3146 struct extent_map **em_cached, 3147 struct bio **bio, int mirror_num, 3148 unsigned long *bio_flags, 3149 u64 *prev_em_start) 3150 { 3151 u64 start = 0; 3152 u64 end = 0; 3153 u64 page_start; 3154 int index; 3155 int first_index = 0; 3156 3157 for (index = 0; index < nr_pages; index++) { 3158 page_start = page_offset(pages[index]); 3159 if (!end) { 3160 start = page_start; 3161 end = start + PAGE_SIZE - 1; 3162 first_index = index; 3163 } else if (end + 1 == page_start) { 3164 end += PAGE_SIZE; 3165 } else { 3166 __do_contiguous_readpages(tree, &pages[first_index], 3167 index - first_index, start, 3168 end, get_extent, em_cached, 3169 bio, mirror_num, bio_flags, 3170 prev_em_start); 3171 start = page_start; 3172 end = start + PAGE_SIZE - 1; 3173 first_index = index; 3174 } 3175 } 3176 3177 if (end) 3178 __do_contiguous_readpages(tree, &pages[first_index], 3179 index - first_index, start, 3180 end, get_extent, em_cached, bio, 3181 mirror_num, bio_flags, 3182 prev_em_start); 3183 } 3184 3185 static int __extent_read_full_page(struct extent_io_tree *tree, 3186 struct page *page, 3187 get_extent_t *get_extent, 3188 struct bio **bio, int mirror_num, 3189 unsigned long *bio_flags, int read_flags) 3190 { 3191 struct inode *inode = page->mapping->host; 3192 struct btrfs_ordered_extent *ordered; 3193 u64 start = page_offset(page); 3194 u64 end = start + PAGE_SIZE - 1; 3195 int ret; 3196 3197 while (1) { 3198 lock_extent(tree, start, end); 3199 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, 3200 PAGE_SIZE); 3201 if (!ordered) 3202 break; 3203 unlock_extent(tree, start, end); 3204 
btrfs_start_ordered_extent(inode, ordered, 1); 3205 btrfs_put_ordered_extent(ordered); 3206 } 3207 3208 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, 3209 bio_flags, read_flags, NULL); 3210 return ret; 3211 } 3212 3213 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 3214 get_extent_t *get_extent, int mirror_num) 3215 { 3216 struct bio *bio = NULL; 3217 unsigned long bio_flags = 0; 3218 int ret; 3219 3220 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 3221 &bio_flags, 0); 3222 if (bio) 3223 ret = submit_one_bio(bio, mirror_num, bio_flags); 3224 return ret; 3225 } 3226 3227 static void update_nr_written(struct writeback_control *wbc, 3228 unsigned long nr_written) 3229 { 3230 wbc->nr_to_write -= nr_written; 3231 } 3232 3233 /* 3234 * helper for __extent_writepage, doing all of the delayed allocation setup. 3235 * 3236 * This returns 1 if our fill_delalloc function did all the work required 3237 * to write the page (copy into inline extent). In this case the IO has 3238 * been started and the page is already unlocked. 3239 * 3240 * This returns 0 if all went well (page still locked) 3241 * This returns < 0 if there were errors (page still locked) 3242 */ 3243 static noinline_for_stack int writepage_delalloc(struct inode *inode, 3244 struct page *page, struct writeback_control *wbc, 3245 struct extent_page_data *epd, 3246 u64 delalloc_start, 3247 unsigned long *nr_written) 3248 { 3249 struct extent_io_tree *tree = epd->tree; 3250 u64 page_end = delalloc_start + PAGE_SIZE - 1; 3251 u64 nr_delalloc; 3252 u64 delalloc_to_write = 0; 3253 u64 delalloc_end = 0; 3254 int ret; 3255 int page_started = 0; 3256 3257 if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) 3258 return 0; 3259 3260 while (delalloc_end < page_end) { 3261 nr_delalloc = find_lock_delalloc_range(inode, tree, 3262 page, 3263 &delalloc_start, 3264 &delalloc_end, 3265 BTRFS_MAX_EXTENT_SIZE); 3266 if (nr_delalloc == 0) { 3267 delalloc_start = delalloc_end + 1; 3268 continue; 3269 } 3270 ret = tree->ops->fill_delalloc(inode, page, 3271 delalloc_start, 3272 delalloc_end, 3273 &page_started, 3274 nr_written); 3275 /* File system has been set read-only */ 3276 if (ret) { 3277 SetPageError(page); 3278 /* fill_delalloc should be return < 0 for error 3279 * but just in case, we use > 0 here meaning the 3280 * IO is started, so we don't want to return > 0 3281 * unless things are going well. 3282 */ 3283 ret = ret < 0 ? ret : -EIO; 3284 goto done; 3285 } 3286 /* 3287 * delalloc_end is already one less than the total length, so 3288 * we don't subtract one from PAGE_SIZE 3289 */ 3290 delalloc_to_write += (delalloc_end - delalloc_start + 3291 PAGE_SIZE) >> PAGE_SHIFT; 3292 delalloc_start = delalloc_end + 1; 3293 } 3294 if (wbc->nr_to_write < delalloc_to_write) { 3295 int thresh = 8192; 3296 3297 if (delalloc_to_write < thresh * 2) 3298 thresh = delalloc_to_write; 3299 wbc->nr_to_write = min_t(u64, delalloc_to_write, 3300 thresh); 3301 } 3302 3303 /* did the fill delalloc function already unlock and start 3304 * the IO? 3305 */ 3306 if (page_started) { 3307 /* 3308 * we've unlocked the page, so we can't update 3309 * the mapping's writeback index, just update 3310 * nr_to_write. 3311 */ 3312 wbc->nr_to_write -= *nr_written; 3313 return 1; 3314 } 3315 3316 ret = 0; 3317 3318 done: 3319 return ret; 3320 } 3321 3322 /* 3323 * helper for __extent_writepage. This calls the writepage start hooks, 3324 * and does the loop to map the page into extents and bios. 
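 *
 * Bios built here are accumulated in epd->bio; they are only handed to
 * the block layer once the next range can no longer be merged (not
 * contiguous, different flags, or the bio is full) or when the caller
 * flushes them via flush_epd_write_bio().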
3325 * 3326 * We return 1 if the IO is started and the page is unlocked, 3327 * 0 if all went well (page still locked) 3328 * < 0 if there were errors (page still locked) 3329 */ 3330 static noinline_for_stack int __extent_writepage_io(struct inode *inode, 3331 struct page *page, 3332 struct writeback_control *wbc, 3333 struct extent_page_data *epd, 3334 loff_t i_size, 3335 unsigned long nr_written, 3336 int write_flags, int *nr_ret) 3337 { 3338 struct extent_io_tree *tree = epd->tree; 3339 u64 start = page_offset(page); 3340 u64 page_end = start + PAGE_SIZE - 1; 3341 u64 end; 3342 u64 cur = start; 3343 u64 extent_offset; 3344 u64 block_start; 3345 u64 iosize; 3346 sector_t sector; 3347 struct extent_map *em; 3348 struct block_device *bdev; 3349 size_t pg_offset = 0; 3350 size_t blocksize; 3351 int ret = 0; 3352 int nr = 0; 3353 bool compressed; 3354 3355 if (tree->ops && tree->ops->writepage_start_hook) { 3356 ret = tree->ops->writepage_start_hook(page, start, 3357 page_end); 3358 if (ret) { 3359 /* Fixup worker will requeue */ 3360 if (ret == -EBUSY) 3361 wbc->pages_skipped++; 3362 else 3363 redirty_page_for_writepage(wbc, page); 3364 3365 update_nr_written(wbc, nr_written); 3366 unlock_page(page); 3367 return 1; 3368 } 3369 } 3370 3371 /* 3372 * we don't want to touch the inode after unlocking the page, 3373 * so we update the mapping writeback index now 3374 */ 3375 update_nr_written(wbc, nr_written + 1); 3376 3377 end = page_end; 3378 if (i_size <= start) { 3379 if (tree->ops && tree->ops->writepage_end_io_hook) 3380 tree->ops->writepage_end_io_hook(page, start, 3381 page_end, NULL, 1); 3382 goto done; 3383 } 3384 3385 blocksize = inode->i_sb->s_blocksize; 3386 3387 while (cur <= end) { 3388 u64 em_end; 3389 3390 if (cur >= i_size) { 3391 if (tree->ops && tree->ops->writepage_end_io_hook) 3392 tree->ops->writepage_end_io_hook(page, cur, 3393 page_end, NULL, 1); 3394 break; 3395 } 3396 em = epd->get_extent(BTRFS_I(inode), page, pg_offset, cur, 3397 end - cur + 1, 1); 3398 if (IS_ERR_OR_NULL(em)) { 3399 SetPageError(page); 3400 ret = PTR_ERR_OR_ZERO(em); 3401 break; 3402 } 3403 3404 extent_offset = cur - em->start; 3405 em_end = extent_map_end(em); 3406 BUG_ON(em_end <= cur); 3407 BUG_ON(end < cur); 3408 iosize = min(em_end - cur, end - cur + 1); 3409 iosize = ALIGN(iosize, blocksize); 3410 sector = (em->block_start + extent_offset) >> 9; 3411 bdev = em->bdev; 3412 block_start = em->block_start; 3413 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3414 free_extent_map(em); 3415 em = NULL; 3416 3417 /* 3418 * compressed and inline extents are written through other 3419 * paths in the FS 3420 */ 3421 if (compressed || block_start == EXTENT_MAP_HOLE || 3422 block_start == EXTENT_MAP_INLINE) { 3423 /* 3424 * end_io notification does not happen here for 3425 * compressed extents 3426 */ 3427 if (!compressed && tree->ops && 3428 tree->ops->writepage_end_io_hook) 3429 tree->ops->writepage_end_io_hook(page, cur, 3430 cur + iosize - 1, 3431 NULL, 1); 3432 else if (compressed) { 3433 /* we don't want to end_page_writeback on 3434 * a compressed extent. 
this happens 3435 * elsewhere 3436 */ 3437 nr++; 3438 } 3439 3440 cur += iosize; 3441 pg_offset += iosize; 3442 continue; 3443 } 3444 3445 set_range_writeback(tree, cur, cur + iosize - 1); 3446 if (!PageWriteback(page)) { 3447 btrfs_err(BTRFS_I(inode)->root->fs_info, 3448 "page %lu not writeback, cur %llu end %llu", 3449 page->index, cur, end); 3450 } 3451 3452 ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, 3453 page, sector, iosize, pg_offset, 3454 bdev, &epd->bio, 3455 end_bio_extent_writepage, 3456 0, 0, 0, false); 3457 if (ret) { 3458 SetPageError(page); 3459 if (PageWriteback(page)) 3460 end_page_writeback(page); 3461 } 3462 3463 cur = cur + iosize; 3464 pg_offset += iosize; 3465 nr++; 3466 } 3467 done: 3468 *nr_ret = nr; 3469 return ret; 3470 } 3471 3472 /* 3473 * the writepage semantics are similar to regular writepage. extent 3474 * records are inserted to lock ranges in the tree, and as dirty areas 3475 * are found, they are marked writeback. Then the lock bits are removed 3476 * and the end_io handler clears the writeback ranges 3477 */ 3478 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 3479 void *data) 3480 { 3481 struct inode *inode = page->mapping->host; 3482 struct extent_page_data *epd = data; 3483 u64 start = page_offset(page); 3484 u64 page_end = start + PAGE_SIZE - 1; 3485 int ret; 3486 int nr = 0; 3487 size_t pg_offset = 0; 3488 loff_t i_size = i_size_read(inode); 3489 unsigned long end_index = i_size >> PAGE_SHIFT; 3490 int write_flags = 0; 3491 unsigned long nr_written = 0; 3492 3493 if (wbc->sync_mode == WB_SYNC_ALL) 3494 write_flags = REQ_SYNC; 3495 3496 trace___extent_writepage(page, inode, wbc); 3497 3498 WARN_ON(!PageLocked(page)); 3499 3500 ClearPageError(page); 3501 3502 pg_offset = i_size & (PAGE_SIZE - 1); 3503 if (page->index > end_index || 3504 (page->index == end_index && !pg_offset)) { 3505 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); 3506 unlock_page(page); 3507 return 0; 3508 } 3509 3510 if (page->index == end_index) { 3511 char *userpage; 3512 3513 userpage = kmap_atomic(page); 3514 memset(userpage + pg_offset, 0, 3515 PAGE_SIZE - pg_offset); 3516 kunmap_atomic(userpage); 3517 flush_dcache_page(page); 3518 } 3519 3520 pg_offset = 0; 3521 3522 set_page_extent_mapped(page); 3523 3524 ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); 3525 if (ret == 1) 3526 goto done_unlocked; 3527 if (ret) 3528 goto done; 3529 3530 ret = __extent_writepage_io(inode, page, wbc, epd, 3531 i_size, nr_written, write_flags, &nr); 3532 if (ret == 1) 3533 goto done_unlocked; 3534 3535 done: 3536 if (nr == 0) { 3537 /* make sure the mapping tag for page dirty gets cleared */ 3538 set_page_writeback(page); 3539 end_page_writeback(page); 3540 } 3541 if (PageError(page)) { 3542 ret = ret < 0 ? 
ret : -EIO; 3543 end_extent_writepage(page, ret, start, page_end); 3544 } 3545 unlock_page(page); 3546 return ret; 3547 3548 done_unlocked: 3549 return 0; 3550 } 3551 3552 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3553 { 3554 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, 3555 TASK_UNINTERRUPTIBLE); 3556 } 3557 3558 static noinline_for_stack int 3559 lock_extent_buffer_for_io(struct extent_buffer *eb, 3560 struct btrfs_fs_info *fs_info, 3561 struct extent_page_data *epd) 3562 { 3563 unsigned long i, num_pages; 3564 int flush = 0; 3565 int ret = 0; 3566 3567 if (!btrfs_try_tree_write_lock(eb)) { 3568 flush = 1; 3569 flush_write_bio(epd); 3570 btrfs_tree_lock(eb); 3571 } 3572 3573 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3574 btrfs_tree_unlock(eb); 3575 if (!epd->sync_io) 3576 return 0; 3577 if (!flush) { 3578 flush_write_bio(epd); 3579 flush = 1; 3580 } 3581 while (1) { 3582 wait_on_extent_buffer_writeback(eb); 3583 btrfs_tree_lock(eb); 3584 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3585 break; 3586 btrfs_tree_unlock(eb); 3587 } 3588 } 3589 3590 /* 3591 * We need to do this to prevent races in people who check if the eb is 3592 * under IO since we can end up having no IO bits set for a short period 3593 * of time. 3594 */ 3595 spin_lock(&eb->refs_lock); 3596 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3597 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3598 spin_unlock(&eb->refs_lock); 3599 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3600 __percpu_counter_add(&fs_info->dirty_metadata_bytes, 3601 -eb->len, 3602 fs_info->dirty_metadata_batch); 3603 ret = 1; 3604 } else { 3605 spin_unlock(&eb->refs_lock); 3606 } 3607 3608 btrfs_tree_unlock(eb); 3609 3610 if (!ret) 3611 return ret; 3612 3613 num_pages = num_extent_pages(eb->start, eb->len); 3614 for (i = 0; i < num_pages; i++) { 3615 struct page *p = eb->pages[i]; 3616 3617 if (!trylock_page(p)) { 3618 if (!flush) { 3619 flush_write_bio(epd); 3620 flush = 1; 3621 } 3622 lock_page(p); 3623 } 3624 } 3625 3626 return ret; 3627 } 3628 3629 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3630 { 3631 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3632 smp_mb__after_atomic(); 3633 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3634 } 3635 3636 static void set_btree_ioerr(struct page *page) 3637 { 3638 struct extent_buffer *eb = (struct extent_buffer *)page->private; 3639 3640 SetPageError(page); 3641 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 3642 return; 3643 3644 /* 3645 * If writeback for a btree extent that doesn't belong to a log tree 3646 * failed, increment the counter transaction->eb_write_errors. 3647 * We do this because while the transaction is running and before it's 3648 * committing (when we call filemap_fdata[write|wait]_range against 3649 * the btree inode), we might have 3650 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 3651 * returns an error or an error happens during writeback, when we're 3652 * committing the transaction we wouldn't know about it, since the pages 3653 * can be no longer dirty nor marked anymore for writeback (if a 3654 * subsequent modification to the extent buffer didn't happen before the 3655 * transaction commit), which makes filemap_fdata[write|wait]_range not 3656 * able to find the pages tagged with SetPageError at transaction 3657 * commit time. 
So if this happens we must abort the transaction, 3658 * otherwise we commit a super block with btree roots that point to 3659 * btree nodes/leafs whose content on disk is invalid - either garbage 3660 * or the content of some node/leaf from a past generation that got 3661 * cowed or deleted and is no longer valid. 3662 * 3663 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 3664 * not be enough - we need to distinguish between log tree extents vs 3665 * non-log tree extents, and the next filemap_fdatawait_range() call 3666 * will catch and clear such errors in the mapping - and that call might 3667 * be from a log sync and not from a transaction commit. Also, checking 3668 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 3669 * not done and would not be reliable - the eb might have been released 3670 * from memory and reading it back again means that flag would not be 3671 * set (since it's a runtime flag, not persisted on disk). 3672 * 3673 * Using the flags below in the btree inode also makes us achieve the 3674 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 3675 * writeback for all dirty pages and before filemap_fdatawait_range() 3676 * is called, the writeback for all dirty pages had already finished 3677 * with errors - because we were not using AS_EIO/AS_ENOSPC, 3678 * filemap_fdatawait_range() would return success, as it could not know 3679 * that writeback errors happened (the pages were no longer tagged for 3680 * writeback). 3681 */ 3682 switch (eb->log_index) { 3683 case -1: 3684 set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags); 3685 break; 3686 case 0: 3687 set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags); 3688 break; 3689 case 1: 3690 set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags); 3691 break; 3692 default: 3693 BUG(); /* unexpected, logic error */ 3694 } 3695 } 3696 3697 static void end_bio_extent_buffer_writepage(struct bio *bio) 3698 { 3699 struct bio_vec *bvec; 3700 struct extent_buffer *eb; 3701 int i, done; 3702 3703 bio_for_each_segment_all(bvec, bio, i) { 3704 struct page *page = bvec->bv_page; 3705 3706 eb = (struct extent_buffer *)page->private; 3707 BUG_ON(!eb); 3708 done = atomic_dec_and_test(&eb->io_pages); 3709 3710 if (bio->bi_error || 3711 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { 3712 ClearPageUptodate(page); 3713 set_btree_ioerr(page); 3714 } 3715 3716 end_page_writeback(page); 3717 3718 if (!done) 3719 continue; 3720 3721 end_extent_buffer_writeback(eb); 3722 } 3723 3724 bio_put(bio); 3725 } 3726 3727 static noinline_for_stack int write_one_eb(struct extent_buffer *eb, 3728 struct btrfs_fs_info *fs_info, 3729 struct writeback_control *wbc, 3730 struct extent_page_data *epd) 3731 { 3732 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3733 struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; 3734 u64 offset = eb->start; 3735 u32 nritems; 3736 unsigned long i, num_pages; 3737 unsigned long bio_flags = 0; 3738 unsigned long start, end; 3739 int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; 3740 int ret = 0; 3741 3742 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 3743 num_pages = num_extent_pages(eb->start, eb->len); 3744 atomic_set(&eb->io_pages, num_pages); 3745 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3746 bio_flags = EXTENT_BIO_TREE_LOG; 3747 3748 /* set btree blocks beyond nritems with 0 to avoid stale content. 
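	 * For a node this zeroes from the end of the key pointers to the end
	 * of the buffer; for a leaf it zeroes the gap between the last item
	 * header and the start of the item data (see the layout sketch below).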
*/ 3749 nritems = btrfs_header_nritems(eb); 3750 if (btrfs_header_level(eb) > 0) { 3751 end = btrfs_node_key_ptr_offset(nritems); 3752 3753 memzero_extent_buffer(eb, end, eb->len - end); 3754 } else { 3755 /* 3756 * leaf: 3757 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 3758 */ 3759 start = btrfs_item_nr_offset(nritems); 3760 end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb); 3761 memzero_extent_buffer(eb, start, end - start); 3762 } 3763 3764 for (i = 0; i < num_pages; i++) { 3765 struct page *p = eb->pages[i]; 3766 3767 clear_page_dirty_for_io(p); 3768 set_page_writeback(p); 3769 ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, 3770 p, offset >> 9, PAGE_SIZE, 0, bdev, 3771 &epd->bio, 3772 end_bio_extent_buffer_writepage, 3773 0, epd->bio_flags, bio_flags, false); 3774 epd->bio_flags = bio_flags; 3775 if (ret) { 3776 set_btree_ioerr(p); 3777 if (PageWriteback(p)) 3778 end_page_writeback(p); 3779 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3780 end_extent_buffer_writeback(eb); 3781 ret = -EIO; 3782 break; 3783 } 3784 offset += PAGE_SIZE; 3785 update_nr_written(wbc, 1); 3786 unlock_page(p); 3787 } 3788 3789 if (unlikely(ret)) { 3790 for (; i < num_pages; i++) { 3791 struct page *p = eb->pages[i]; 3792 clear_page_dirty_for_io(p); 3793 unlock_page(p); 3794 } 3795 } 3796 3797 return ret; 3798 } 3799 3800 int btree_write_cache_pages(struct address_space *mapping, 3801 struct writeback_control *wbc) 3802 { 3803 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3804 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; 3805 struct extent_buffer *eb, *prev_eb = NULL; 3806 struct extent_page_data epd = { 3807 .bio = NULL, 3808 .tree = tree, 3809 .extent_locked = 0, 3810 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3811 .bio_flags = 0, 3812 }; 3813 int ret = 0; 3814 int done = 0; 3815 int nr_to_write_done = 0; 3816 struct pagevec pvec; 3817 int nr_pages; 3818 pgoff_t index; 3819 pgoff_t end; /* Inclusive */ 3820 int scanned = 0; 3821 int tag; 3822 3823 pagevec_init(&pvec, 0); 3824 if (wbc->range_cyclic) { 3825 index = mapping->writeback_index; /* Start from prev offset */ 3826 end = -1; 3827 } else { 3828 index = wbc->range_start >> PAGE_SHIFT; 3829 end = wbc->range_end >> PAGE_SHIFT; 3830 scanned = 1; 3831 } 3832 if (wbc->sync_mode == WB_SYNC_ALL) 3833 tag = PAGECACHE_TAG_TOWRITE; 3834 else 3835 tag = PAGECACHE_TAG_DIRTY; 3836 retry: 3837 if (wbc->sync_mode == WB_SYNC_ALL) 3838 tag_pages_for_writeback(mapping, index, end); 3839 while (!done && !nr_to_write_done && (index <= end) && 3840 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3841 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3842 unsigned i; 3843 3844 scanned = 1; 3845 for (i = 0; i < nr_pages; i++) { 3846 struct page *page = pvec.pages[i]; 3847 3848 if (!PagePrivate(page)) 3849 continue; 3850 3851 if (!wbc->range_cyclic && page->index > end) { 3852 done = 1; 3853 break; 3854 } 3855 3856 spin_lock(&mapping->private_lock); 3857 if (!PagePrivate(page)) { 3858 spin_unlock(&mapping->private_lock); 3859 continue; 3860 } 3861 3862 eb = (struct extent_buffer *)page->private; 3863 3864 /* 3865 * Shouldn't happen and normally this would be a BUG_ON 3866 * but no sense in crashing the users box for something 3867 * we can survive anyway. 
3868 */ 3869 if (WARN_ON(!eb)) { 3870 spin_unlock(&mapping->private_lock); 3871 continue; 3872 } 3873 3874 if (eb == prev_eb) { 3875 spin_unlock(&mapping->private_lock); 3876 continue; 3877 } 3878 3879 ret = atomic_inc_not_zero(&eb->refs); 3880 spin_unlock(&mapping->private_lock); 3881 if (!ret) 3882 continue; 3883 3884 prev_eb = eb; 3885 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3886 if (!ret) { 3887 free_extent_buffer(eb); 3888 continue; 3889 } 3890 3891 ret = write_one_eb(eb, fs_info, wbc, &epd); 3892 if (ret) { 3893 done = 1; 3894 free_extent_buffer(eb); 3895 break; 3896 } 3897 free_extent_buffer(eb); 3898 3899 /* 3900 * the filesystem may choose to bump up nr_to_write. 3901 * We have to make sure to honor the new nr_to_write 3902 * at any time 3903 */ 3904 nr_to_write_done = wbc->nr_to_write <= 0; 3905 } 3906 pagevec_release(&pvec); 3907 cond_resched(); 3908 } 3909 if (!scanned && !done) { 3910 /* 3911 * We hit the last page and there is more work to be done: wrap 3912 * back to the start of the file 3913 */ 3914 scanned = 1; 3915 index = 0; 3916 goto retry; 3917 } 3918 flush_write_bio(&epd); 3919 return ret; 3920 } 3921 3922 /** 3923 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 3924 * @mapping: address space structure to write 3925 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 3926 * @writepage: function called for each page 3927 * @data: data passed to writepage function 3928 * 3929 * If a page is already under I/O, write_cache_pages() skips it, even 3930 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 3931 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 3932 * and msync() need to guarantee that all the data which was dirty at the time 3933 * the call was made get new I/O started against them. If wbc->sync_mode is 3934 * WB_SYNC_ALL then we were called for data integrity and we must wait for 3935 * existing IO to complete. 3936 */ 3937 static int extent_write_cache_pages(struct address_space *mapping, 3938 struct writeback_control *wbc, 3939 writepage_t writepage, void *data, 3940 void (*flush_fn)(void *)) 3941 { 3942 struct inode *inode = mapping->host; 3943 int ret = 0; 3944 int done = 0; 3945 int nr_to_write_done = 0; 3946 struct pagevec pvec; 3947 int nr_pages; 3948 pgoff_t index; 3949 pgoff_t end; /* Inclusive */ 3950 pgoff_t done_index; 3951 int range_whole = 0; 3952 int scanned = 0; 3953 int tag; 3954 3955 /* 3956 * We have to hold onto the inode so that ordered extents can do their 3957 * work when the IO finishes. The alternative to this is failing to add 3958 * an ordered extent if the igrab() fails there and that is a huge pain 3959 * to deal with, so instead just hold onto the inode throughout the 3960 * writepages operation. If it fails here we are freeing up the inode 3961 * anyway and we'd rather not waste our time writing out stuff that is 3962 * going to be truncated anyway. 
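	 *
	 * The reference taken here is dropped at the bottom of this function
	 * via btrfs_add_delayed_iput() rather than a plain iput(), so any
	 * final iput work is deferred out of the writeback path.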
3963 */ 3964 if (!igrab(inode)) 3965 return 0; 3966 3967 pagevec_init(&pvec, 0); 3968 if (wbc->range_cyclic) { 3969 index = mapping->writeback_index; /* Start from prev offset */ 3970 end = -1; 3971 } else { 3972 index = wbc->range_start >> PAGE_SHIFT; 3973 end = wbc->range_end >> PAGE_SHIFT; 3974 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 3975 range_whole = 1; 3976 scanned = 1; 3977 } 3978 if (wbc->sync_mode == WB_SYNC_ALL) 3979 tag = PAGECACHE_TAG_TOWRITE; 3980 else 3981 tag = PAGECACHE_TAG_DIRTY; 3982 retry: 3983 if (wbc->sync_mode == WB_SYNC_ALL) 3984 tag_pages_for_writeback(mapping, index, end); 3985 done_index = index; 3986 while (!done && !nr_to_write_done && (index <= end) && 3987 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3988 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3989 unsigned i; 3990 3991 scanned = 1; 3992 for (i = 0; i < nr_pages; i++) { 3993 struct page *page = pvec.pages[i]; 3994 3995 done_index = page->index; 3996 /* 3997 * At this point we hold neither mapping->tree_lock nor 3998 * lock on the page itself: the page may be truncated or 3999 * invalidated (changing page->mapping to NULL), or even 4000 * swizzled back from swapper_space to tmpfs file 4001 * mapping 4002 */ 4003 if (!trylock_page(page)) { 4004 flush_fn(data); 4005 lock_page(page); 4006 } 4007 4008 if (unlikely(page->mapping != mapping)) { 4009 unlock_page(page); 4010 continue; 4011 } 4012 4013 if (!wbc->range_cyclic && page->index > end) { 4014 done = 1; 4015 unlock_page(page); 4016 continue; 4017 } 4018 4019 if (wbc->sync_mode != WB_SYNC_NONE) { 4020 if (PageWriteback(page)) 4021 flush_fn(data); 4022 wait_on_page_writeback(page); 4023 } 4024 4025 if (PageWriteback(page) || 4026 !clear_page_dirty_for_io(page)) { 4027 unlock_page(page); 4028 continue; 4029 } 4030 4031 ret = (*writepage)(page, wbc, data); 4032 4033 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 4034 unlock_page(page); 4035 ret = 0; 4036 } 4037 if (ret < 0) { 4038 /* 4039 * done_index is set past this page, 4040 * so media errors will not choke 4041 * background writeout for the entire 4042 * file. This has consequences for 4043 * range_cyclic semantics (ie. it may 4044 * not be suitable for data integrity 4045 * writeout). 4046 */ 4047 done_index = page->index + 1; 4048 done = 1; 4049 break; 4050 } 4051 4052 /* 4053 * the filesystem may choose to bump up nr_to_write. 4054 * We have to make sure to honor the new nr_to_write 4055 * at any time 4056 */ 4057 nr_to_write_done = wbc->nr_to_write <= 0; 4058 } 4059 pagevec_release(&pvec); 4060 cond_resched(); 4061 } 4062 if (!scanned && !done) { 4063 /* 4064 * We hit the last page and there is more work to be done: wrap 4065 * back to the start of the file 4066 */ 4067 scanned = 1; 4068 index = 0; 4069 goto retry; 4070 } 4071 4072 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 4073 mapping->writeback_index = done_index; 4074 4075 btrfs_add_delayed_iput(inode); 4076 return ret; 4077 } 4078 4079 static void flush_epd_write_bio(struct extent_page_data *epd) 4080 { 4081 if (epd->bio) { 4082 int ret; 4083 4084 bio_set_op_attrs(epd->bio, REQ_OP_WRITE, 4085 epd->sync_io ? 
REQ_SYNC : 0); 4086 4087 ret = submit_one_bio(epd->bio, 0, epd->bio_flags); 4088 BUG_ON(ret < 0); /* -ENOMEM */ 4089 epd->bio = NULL; 4090 } 4091 } 4092 4093 static noinline void flush_write_bio(void *data) 4094 { 4095 struct extent_page_data *epd = data; 4096 flush_epd_write_bio(epd); 4097 } 4098 4099 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 4100 get_extent_t *get_extent, 4101 struct writeback_control *wbc) 4102 { 4103 int ret; 4104 struct extent_page_data epd = { 4105 .bio = NULL, 4106 .tree = tree, 4107 .get_extent = get_extent, 4108 .extent_locked = 0, 4109 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4110 .bio_flags = 0, 4111 }; 4112 4113 ret = __extent_writepage(page, wbc, &epd); 4114 4115 flush_epd_write_bio(&epd); 4116 return ret; 4117 } 4118 4119 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 4120 u64 start, u64 end, get_extent_t *get_extent, 4121 int mode) 4122 { 4123 int ret = 0; 4124 struct address_space *mapping = inode->i_mapping; 4125 struct page *page; 4126 unsigned long nr_pages = (end - start + PAGE_SIZE) >> 4127 PAGE_SHIFT; 4128 4129 struct extent_page_data epd = { 4130 .bio = NULL, 4131 .tree = tree, 4132 .get_extent = get_extent, 4133 .extent_locked = 1, 4134 .sync_io = mode == WB_SYNC_ALL, 4135 .bio_flags = 0, 4136 }; 4137 struct writeback_control wbc_writepages = { 4138 .sync_mode = mode, 4139 .nr_to_write = nr_pages * 2, 4140 .range_start = start, 4141 .range_end = end + 1, 4142 }; 4143 4144 while (start <= end) { 4145 page = find_get_page(mapping, start >> PAGE_SHIFT); 4146 if (clear_page_dirty_for_io(page)) 4147 ret = __extent_writepage(page, &wbc_writepages, &epd); 4148 else { 4149 if (tree->ops && tree->ops->writepage_end_io_hook) 4150 tree->ops->writepage_end_io_hook(page, start, 4151 start + PAGE_SIZE - 1, 4152 NULL, 1); 4153 unlock_page(page); 4154 } 4155 put_page(page); 4156 start += PAGE_SIZE; 4157 } 4158 4159 flush_epd_write_bio(&epd); 4160 return ret; 4161 } 4162 4163 int extent_writepages(struct extent_io_tree *tree, 4164 struct address_space *mapping, 4165 get_extent_t *get_extent, 4166 struct writeback_control *wbc) 4167 { 4168 int ret = 0; 4169 struct extent_page_data epd = { 4170 .bio = NULL, 4171 .tree = tree, 4172 .get_extent = get_extent, 4173 .extent_locked = 0, 4174 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4175 .bio_flags = 0, 4176 }; 4177 4178 ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd, 4179 flush_write_bio); 4180 flush_epd_write_bio(&epd); 4181 return ret; 4182 } 4183 4184 int extent_readpages(struct extent_io_tree *tree, 4185 struct address_space *mapping, 4186 struct list_head *pages, unsigned nr_pages, 4187 get_extent_t get_extent) 4188 { 4189 struct bio *bio = NULL; 4190 unsigned page_idx; 4191 unsigned long bio_flags = 0; 4192 struct page *pagepool[16]; 4193 struct page *page; 4194 struct extent_map *em_cached = NULL; 4195 int nr = 0; 4196 u64 prev_em_start = (u64)-1; 4197 4198 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 4199 page = list_entry(pages->prev, struct page, lru); 4200 4201 prefetchw(&page->flags); 4202 list_del(&page->lru); 4203 if (add_to_page_cache_lru(page, mapping, 4204 page->index, 4205 readahead_gfp_mask(mapping))) { 4206 put_page(page); 4207 continue; 4208 } 4209 4210 pagepool[nr++] = page; 4211 if (nr < ARRAY_SIZE(pagepool)) 4212 continue; 4213 __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, 4214 &bio, 0, &bio_flags, &prev_em_start); 4215 nr = 0; 4216 } 4217 if (nr) 4218 __extent_readpages(tree, 
pagepool, nr, get_extent, &em_cached, 4219 &bio, 0, &bio_flags, &prev_em_start); 4220 4221 if (em_cached) 4222 free_extent_map(em_cached); 4223 4224 BUG_ON(!list_empty(pages)); 4225 if (bio) 4226 return submit_one_bio(bio, 0, bio_flags); 4227 return 0; 4228 } 4229 4230 /* 4231 * basic invalidatepage code, this waits on any locked or writeback 4232 * ranges corresponding to the page, and then deletes any extent state 4233 * records from the tree 4234 */ 4235 int extent_invalidatepage(struct extent_io_tree *tree, 4236 struct page *page, unsigned long offset) 4237 { 4238 struct extent_state *cached_state = NULL; 4239 u64 start = page_offset(page); 4240 u64 end = start + PAGE_SIZE - 1; 4241 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 4242 4243 start += ALIGN(offset, blocksize); 4244 if (start > end) 4245 return 0; 4246 4247 lock_extent_bits(tree, start, end, &cached_state); 4248 wait_on_page_writeback(page); 4249 clear_extent_bit(tree, start, end, 4250 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 4251 EXTENT_DO_ACCOUNTING, 4252 1, 1, &cached_state, GFP_NOFS); 4253 return 0; 4254 } 4255 4256 /* 4257 * a helper for releasepage, this tests for areas of the page that 4258 * are locked or under IO and drops the related state bits if it is safe 4259 * to drop the page. 4260 */ 4261 static int try_release_extent_state(struct extent_map_tree *map, 4262 struct extent_io_tree *tree, 4263 struct page *page, gfp_t mask) 4264 { 4265 u64 start = page_offset(page); 4266 u64 end = start + PAGE_SIZE - 1; 4267 int ret = 1; 4268 4269 if (test_range_bit(tree, start, end, 4270 EXTENT_IOBITS, 0, NULL)) 4271 ret = 0; 4272 else { 4273 /* 4274 * at this point we can safely clear everything except the 4275 * locked bit and the nodatasum bit 4276 */ 4277 ret = clear_extent_bit(tree, start, end, 4278 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 4279 0, 0, NULL, mask); 4280 4281 /* if clear_extent_bit failed for enomem reasons, 4282 * we can't allow the release to continue. 4283 */ 4284 if (ret < 0) 4285 ret = 0; 4286 else 4287 ret = 1; 4288 } 4289 return ret; 4290 } 4291 4292 /* 4293 * a helper for releasepage. As long as there are no locked extents 4294 * in the range corresponding to the page, both state records and extent 4295 * map records are removed 4296 */ 4297 int try_release_extent_mapping(struct extent_map_tree *map, 4298 struct extent_io_tree *tree, struct page *page, 4299 gfp_t mask) 4300 { 4301 struct extent_map *em; 4302 u64 start = page_offset(page); 4303 u64 end = start + PAGE_SIZE - 1; 4304 4305 if (gfpflags_allow_blocking(mask) && 4306 page->mapping->host->i_size > SZ_16M) { 4307 u64 len; 4308 while (start <= end) { 4309 len = end - start + 1; 4310 write_lock(&map->lock); 4311 em = lookup_extent_mapping(map, start, len); 4312 if (!em) { 4313 write_unlock(&map->lock); 4314 break; 4315 } 4316 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 4317 em->start != start) { 4318 write_unlock(&map->lock); 4319 free_extent_map(em); 4320 break; 4321 } 4322 if (!test_range_bit(tree, em->start, 4323 extent_map_end(em) - 1, 4324 EXTENT_LOCKED | EXTENT_WRITEBACK, 4325 0, NULL)) { 4326 remove_extent_mapping(map, em); 4327 /* once for the rb tree */ 4328 free_extent_map(em); 4329 } 4330 start = extent_map_end(em); 4331 write_unlock(&map->lock); 4332 4333 /* once for us */ 4334 free_extent_map(em); 4335 } 4336 } 4337 return try_release_extent_state(map, tree, page, mask); 4338 } 4339 4340 /* 4341 * helper function for fiemap, which doesn't want to see any holes. 
4342 * This maps until we find something past 'last' 4343 */ 4344 static struct extent_map *get_extent_skip_holes(struct inode *inode, 4345 u64 offset, 4346 u64 last, 4347 get_extent_t *get_extent) 4348 { 4349 u64 sectorsize = btrfs_inode_sectorsize(inode); 4350 struct extent_map *em; 4351 u64 len; 4352 4353 if (offset >= last) 4354 return NULL; 4355 4356 while (1) { 4357 len = last - offset; 4358 if (len == 0) 4359 break; 4360 len = ALIGN(len, sectorsize); 4361 em = get_extent(BTRFS_I(inode), NULL, 0, offset, len, 0); 4362 if (IS_ERR_OR_NULL(em)) 4363 return em; 4364 4365 /* if this isn't a hole return it */ 4366 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 4367 em->block_start != EXTENT_MAP_HOLE) { 4368 return em; 4369 } 4370 4371 /* this is a hole, advance to the next extent */ 4372 offset = extent_map_end(em); 4373 free_extent_map(em); 4374 if (offset >= last) 4375 break; 4376 } 4377 return NULL; 4378 } 4379 4380 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4381 __u64 start, __u64 len, get_extent_t *get_extent) 4382 { 4383 int ret = 0; 4384 u64 off = start; 4385 u64 max = start + len; 4386 u32 flags = 0; 4387 u32 found_type; 4388 u64 last; 4389 u64 last_for_get_extent = 0; 4390 u64 disko = 0; 4391 u64 isize = i_size_read(inode); 4392 struct btrfs_key found_key; 4393 struct extent_map *em = NULL; 4394 struct extent_state *cached_state = NULL; 4395 struct btrfs_path *path; 4396 struct btrfs_root *root = BTRFS_I(inode)->root; 4397 int end = 0; 4398 u64 em_start = 0; 4399 u64 em_len = 0; 4400 u64 em_end = 0; 4401 4402 if (len == 0) 4403 return -EINVAL; 4404 4405 path = btrfs_alloc_path(); 4406 if (!path) 4407 return -ENOMEM; 4408 path->leave_spinning = 1; 4409 4410 start = round_down(start, btrfs_inode_sectorsize(inode)); 4411 len = round_up(max, btrfs_inode_sectorsize(inode)) - start; 4412 4413 /* 4414 * lookup the last file extent. We're not using i_size here 4415 * because there might be preallocation past i_size 4416 */ 4417 ret = btrfs_lookup_file_extent(NULL, root, path, 4418 btrfs_ino(BTRFS_I(inode)), -1, 0); 4419 if (ret < 0) { 4420 btrfs_free_path(path); 4421 return ret; 4422 } else { 4423 WARN_ON(!ret); 4424 if (ret == 1) 4425 ret = 0; 4426 } 4427 4428 path->slots[0]--; 4429 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 4430 found_type = found_key.type; 4431 4432 /* No extents, but there might be delalloc bits */ 4433 if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || 4434 found_type != BTRFS_EXTENT_DATA_KEY) { 4435 /* have to trust i_size as the end */ 4436 last = (u64)-1; 4437 last_for_get_extent = isize; 4438 } else { 4439 /* 4440 * remember the start of the last extent. There are a 4441 * bunch of different factors that go into the length of the 4442 * extent, so its much less complex to remember where it started 4443 */ 4444 last = found_key.offset; 4445 last_for_get_extent = last + 1; 4446 } 4447 btrfs_release_path(path); 4448 4449 /* 4450 * we might have some extents allocated but more delalloc past those 4451 * extents. 
so, we trust isize unless the start of the last extent is 4452 * beyond isize 4453 */ 4454 if (last < isize) { 4455 last = (u64)-1; 4456 last_for_get_extent = isize; 4457 } 4458 4459 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4460 &cached_state); 4461 4462 em = get_extent_skip_holes(inode, start, last_for_get_extent, 4463 get_extent); 4464 if (!em) 4465 goto out; 4466 if (IS_ERR(em)) { 4467 ret = PTR_ERR(em); 4468 goto out; 4469 } 4470 4471 while (!end) { 4472 u64 offset_in_extent = 0; 4473 4474 /* break if the extent we found is outside the range */ 4475 if (em->start >= max || extent_map_end(em) < off) 4476 break; 4477 4478 /* 4479 * get_extent may return an extent that starts before our 4480 * requested range. We have to make sure the ranges 4481 * we return to fiemap always move forward and don't 4482 * overlap, so adjust the offsets here 4483 */ 4484 em_start = max(em->start, off); 4485 4486 /* 4487 * record the offset from the start of the extent 4488 * for adjusting the disk offset below. Only do this if the 4489 * extent isn't compressed since our in ram offset may be past 4490 * what we have actually allocated on disk. 4491 */ 4492 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4493 offset_in_extent = em_start - em->start; 4494 em_end = extent_map_end(em); 4495 em_len = em_end - em_start; 4496 disko = 0; 4497 flags = 0; 4498 4499 /* 4500 * bump off for our next call to get_extent 4501 */ 4502 off = extent_map_end(em); 4503 if (off >= max) 4504 end = 1; 4505 4506 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 4507 end = 1; 4508 flags |= FIEMAP_EXTENT_LAST; 4509 } else if (em->block_start == EXTENT_MAP_INLINE) { 4510 flags |= (FIEMAP_EXTENT_DATA_INLINE | 4511 FIEMAP_EXTENT_NOT_ALIGNED); 4512 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 4513 flags |= (FIEMAP_EXTENT_DELALLOC | 4514 FIEMAP_EXTENT_UNKNOWN); 4515 } else if (fieinfo->fi_extents_max) { 4516 struct btrfs_trans_handle *trans; 4517 4518 u64 bytenr = em->block_start - 4519 (em->start - em->orig_start); 4520 4521 disko = em->block_start + offset_in_extent; 4522 4523 /* 4524 * We need a trans handle to get delayed refs 4525 */ 4526 trans = btrfs_join_transaction(root); 4527 /* 4528 * It's OK if we can't start a trans we can still check 4529 * from commit_root 4530 */ 4531 if (IS_ERR(trans)) 4532 trans = NULL; 4533 4534 /* 4535 * As btrfs supports shared space, this information 4536 * can be exported to userspace tools via 4537 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 4538 * then we're just getting a count and we can skip the 4539 * lookup stuff. 4540 */ 4541 ret = btrfs_check_shared(trans, root->fs_info, 4542 root->objectid, 4543 btrfs_ino(BTRFS_I(inode)), bytenr); 4544 if (trans) 4545 btrfs_end_transaction(trans); 4546 if (ret < 0) 4547 goto out_free; 4548 if (ret) 4549 flags |= FIEMAP_EXTENT_SHARED; 4550 ret = 0; 4551 } 4552 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4553 flags |= FIEMAP_EXTENT_ENCODED; 4554 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4555 flags |= FIEMAP_EXTENT_UNWRITTEN; 4556 4557 free_extent_map(em); 4558 em = NULL; 4559 if ((em_start >= last) || em_len == (u64)-1 || 4560 (last == (u64)-1 && isize <= em_end)) { 4561 flags |= FIEMAP_EXTENT_LAST; 4562 end = 1; 4563 } 4564 4565 /* now scan forward to see if this is really the last extent. 
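* If get_extent_skip_holes() finds nothing past 'off', the extent recorded above was the last one, so FIEMAP_EXTENT_LAST is set before it is handed to fiemap_fill_next_extent().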
*/ 4566 em = get_extent_skip_holes(inode, off, last_for_get_extent, 4567 get_extent); 4568 if (IS_ERR(em)) { 4569 ret = PTR_ERR(em); 4570 goto out; 4571 } 4572 if (!em) { 4573 flags |= FIEMAP_EXTENT_LAST; 4574 end = 1; 4575 } 4576 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 4577 em_len, flags); 4578 if (ret) { 4579 if (ret == 1) 4580 ret = 0; 4581 goto out_free; 4582 } 4583 } 4584 out_free: 4585 free_extent_map(em); 4586 out: 4587 btrfs_free_path(path); 4588 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4589 &cached_state, GFP_NOFS); 4590 return ret; 4591 } 4592 4593 static void __free_extent_buffer(struct extent_buffer *eb) 4594 { 4595 btrfs_leak_debug_del(&eb->leak_list); 4596 kmem_cache_free(extent_buffer_cache, eb); 4597 } 4598 4599 int extent_buffer_under_io(struct extent_buffer *eb) 4600 { 4601 return (atomic_read(&eb->io_pages) || 4602 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 4603 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4604 } 4605 4606 /* 4607 * Helper for releasing the extent buffer's pages. 4608 */ 4609 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) 4610 { 4611 unsigned long index; 4612 struct page *page; 4613 int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4614 4615 BUG_ON(extent_buffer_under_io(eb)); 4616 4617 index = num_extent_pages(eb->start, eb->len); 4618 if (index == 0) 4619 return; 4620 4621 do { 4622 index--; 4623 page = eb->pages[index]; 4624 if (!page) 4625 continue; 4626 if (mapped) 4627 spin_lock(&page->mapping->private_lock); 4628 /* 4629 * We do this since we'll remove the pages after we've 4630 * removed the eb from the radix tree, so we could race 4631 * and have this page now attached to the new eb. So 4632 * only clear page_private if it's still connected to 4633 * this eb. 4634 */ 4635 if (PagePrivate(page) && 4636 page->private == (unsigned long)eb) { 4637 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4638 BUG_ON(PageDirty(page)); 4639 BUG_ON(PageWriteback(page)); 4640 /* 4641 * We need to make sure we haven't been attached 4642 * to a new eb. 4643 */ 4644 ClearPagePrivate(page); 4645 set_page_private(page, 0); 4646 /* One for the page private */ 4647 put_page(page); 4648 } 4649 4650 if (mapped) 4651 spin_unlock(&page->mapping->private_lock); 4652 4653 /* One for when we allocated the page */ 4654 put_page(page); 4655 } while (index != 0); 4656 } 4657 4658 /* 4659 * Helper for releasing the extent buffer.
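* It drops all pages attached to the buffer and then frees the extent_buffer structure itself.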
4660 */ 4661 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 4662 { 4663 btrfs_release_extent_buffer_page(eb); 4664 __free_extent_buffer(eb); 4665 } 4666 4667 static struct extent_buffer * 4668 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, 4669 unsigned long len) 4670 { 4671 struct extent_buffer *eb = NULL; 4672 4673 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); 4674 eb->start = start; 4675 eb->len = len; 4676 eb->fs_info = fs_info; 4677 eb->bflags = 0; 4678 rwlock_init(&eb->lock); 4679 atomic_set(&eb->write_locks, 0); 4680 atomic_set(&eb->read_locks, 0); 4681 atomic_set(&eb->blocking_readers, 0); 4682 atomic_set(&eb->blocking_writers, 0); 4683 atomic_set(&eb->spinning_readers, 0); 4684 atomic_set(&eb->spinning_writers, 0); 4685 eb->lock_nested = 0; 4686 init_waitqueue_head(&eb->write_lock_wq); 4687 init_waitqueue_head(&eb->read_lock_wq); 4688 4689 btrfs_leak_debug_add(&eb->leak_list, &buffers); 4690 4691 spin_lock_init(&eb->refs_lock); 4692 atomic_set(&eb->refs, 1); 4693 atomic_set(&eb->io_pages, 0); 4694 4695 /* 4696 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages 4697 */ 4698 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE 4699 > MAX_INLINE_EXTENT_BUFFER_SIZE); 4700 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); 4701 4702 return eb; 4703 } 4704 4705 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) 4706 { 4707 unsigned long i; 4708 struct page *p; 4709 struct extent_buffer *new; 4710 unsigned long num_pages = num_extent_pages(src->start, src->len); 4711 4712 new = __alloc_extent_buffer(src->fs_info, src->start, src->len); 4713 if (new == NULL) 4714 return NULL; 4715 4716 for (i = 0; i < num_pages; i++) { 4717 p = alloc_page(GFP_NOFS); 4718 if (!p) { 4719 btrfs_release_extent_buffer(new); 4720 return NULL; 4721 } 4722 attach_extent_buffer_page(new, p); 4723 WARN_ON(PageDirty(p)); 4724 SetPageUptodate(p); 4725 new->pages[i] = p; 4726 copy_page(page_address(p), page_address(src->pages[i])); 4727 } 4728 4729 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); 4730 set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); 4731 4732 return new; 4733 } 4734 4735 struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4736 u64 start, unsigned long len) 4737 { 4738 struct extent_buffer *eb; 4739 unsigned long num_pages; 4740 unsigned long i; 4741 4742 num_pages = num_extent_pages(start, len); 4743 4744 eb = __alloc_extent_buffer(fs_info, start, len); 4745 if (!eb) 4746 return NULL; 4747 4748 for (i = 0; i < num_pages; i++) { 4749 eb->pages[i] = alloc_page(GFP_NOFS); 4750 if (!eb->pages[i]) 4751 goto err; 4752 } 4753 set_extent_buffer_uptodate(eb); 4754 btrfs_set_header_nritems(eb, 0); 4755 set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4756 4757 return eb; 4758 err: 4759 for (; i > 0; i--) 4760 __free_page(eb->pages[i - 1]); 4761 __free_extent_buffer(eb); 4762 return NULL; 4763 } 4764 4765 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4766 u64 start) 4767 { 4768 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); 4769 } 4770 4771 static void check_buffer_tree_ref(struct extent_buffer *eb) 4772 { 4773 int refs; 4774 /* the ref bit is tricky. We have to make sure it is set 4775 * if we have the buffer dirty. 
Otherwise the 4776 * code to free a buffer can end up dropping a dirty 4777 * page 4778 * 4779 * Once the ref bit is set, it won't go away while the 4780 * buffer is dirty or in writeback, and it also won't 4781 * go away while we have the reference count on the 4782 * eb bumped. 4783 * 4784 * We can't just set the ref bit without bumping the 4785 * ref on the eb because free_extent_buffer might 4786 * see the ref bit and try to clear it. If this happens 4787 * free_extent_buffer might end up dropping our original 4788 * ref by mistake and freeing the page before we are able 4789 * to add one more ref. 4790 * 4791 * So bump the ref count first, then set the bit. If someone 4792 * beat us to it, drop the ref we added. 4793 */ 4794 refs = atomic_read(&eb->refs); 4795 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4796 return; 4797 4798 spin_lock(&eb->refs_lock); 4799 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4800 atomic_inc(&eb->refs); 4801 spin_unlock(&eb->refs_lock); 4802 } 4803 4804 static void mark_extent_buffer_accessed(struct extent_buffer *eb, 4805 struct page *accessed) 4806 { 4807 unsigned long num_pages, i; 4808 4809 check_buffer_tree_ref(eb); 4810 4811 num_pages = num_extent_pages(eb->start, eb->len); 4812 for (i = 0; i < num_pages; i++) { 4813 struct page *p = eb->pages[i]; 4814 4815 if (p != accessed) 4816 mark_page_accessed(p); 4817 } 4818 } 4819 4820 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 4821 u64 start) 4822 { 4823 struct extent_buffer *eb; 4824 4825 rcu_read_lock(); 4826 eb = radix_tree_lookup(&fs_info->buffer_radix, 4827 start >> PAGE_SHIFT); 4828 if (eb && atomic_inc_not_zero(&eb->refs)) { 4829 rcu_read_unlock(); 4830 /* 4831 * Lock our eb's refs_lock to avoid races with 4832 * free_extent_buffer. When we get our eb it might be flagged 4833 * with EXTENT_BUFFER_STALE and another task running 4834 * free_extent_buffer might have seen that flag set, 4835 * eb->refs == 2, that the buffer isn't under IO (dirty and 4836 * writeback flags not set) and it's still in the tree (flag 4837 * EXTENT_BUFFER_TREE_REF set), therefore being in the process 4838 * of decrementing the extent buffer's reference count twice. 4839 * So here we could race and increment the eb's reference count, 4840 * clear its stale flag, mark it as dirty and drop our reference 4841 * before the other task finishes executing free_extent_buffer, 4842 * which would later result in an attempt to free an extent 4843 * buffer that is dirty. 
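* That is why, when the stale flag is seen below, refs_lock is taken and immediately released before the buffer is returned.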
4844 */ 4845 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { 4846 spin_lock(&eb->refs_lock); 4847 spin_unlock(&eb->refs_lock); 4848 } 4849 mark_extent_buffer_accessed(eb, NULL); 4850 return eb; 4851 } 4852 rcu_read_unlock(); 4853 4854 return NULL; 4855 } 4856 4857 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4858 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 4859 u64 start) 4860 { 4861 struct extent_buffer *eb, *exists = NULL; 4862 int ret; 4863 4864 eb = find_extent_buffer(fs_info, start); 4865 if (eb) 4866 return eb; 4867 eb = alloc_dummy_extent_buffer(fs_info, start); 4868 if (!eb) 4869 return NULL; 4870 eb->fs_info = fs_info; 4871 again: 4872 ret = radix_tree_preload(GFP_NOFS); 4873 if (ret) 4874 goto free_eb; 4875 spin_lock(&fs_info->buffer_lock); 4876 ret = radix_tree_insert(&fs_info->buffer_radix, 4877 start >> PAGE_SHIFT, eb); 4878 spin_unlock(&fs_info->buffer_lock); 4879 radix_tree_preload_end(); 4880 if (ret == -EEXIST) { 4881 exists = find_extent_buffer(fs_info, start); 4882 if (exists) 4883 goto free_eb; 4884 else 4885 goto again; 4886 } 4887 check_buffer_tree_ref(eb); 4888 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 4889 4890 /* 4891 * We will free dummy extent buffers if they come into 4892 * free_extent_buffer with a ref count of 2, but if we are using this we 4893 * want the buffers to stay in memory until we're done with them, so 4894 * bump the ref count again. 4895 */ 4896 atomic_inc(&eb->refs); 4897 return eb; 4898 free_eb: 4899 btrfs_release_extent_buffer(eb); 4900 return exists; 4901 } 4902 #endif 4903 4904 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 4905 u64 start) 4906 { 4907 unsigned long len = fs_info->nodesize; 4908 unsigned long num_pages = num_extent_pages(start, len); 4909 unsigned long i; 4910 unsigned long index = start >> PAGE_SHIFT; 4911 struct extent_buffer *eb; 4912 struct extent_buffer *exists = NULL; 4913 struct page *p; 4914 struct address_space *mapping = fs_info->btree_inode->i_mapping; 4915 int uptodate = 1; 4916 int ret; 4917 4918 if (!IS_ALIGNED(start, fs_info->sectorsize)) { 4919 btrfs_err(fs_info, "bad tree block start %llu", start); 4920 return ERR_PTR(-EINVAL); 4921 } 4922 4923 eb = find_extent_buffer(fs_info, start); 4924 if (eb) 4925 return eb; 4926 4927 eb = __alloc_extent_buffer(fs_info, start, len); 4928 if (!eb) 4929 return ERR_PTR(-ENOMEM); 4930 4931 for (i = 0; i < num_pages; i++, index++) { 4932 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); 4933 if (!p) { 4934 exists = ERR_PTR(-ENOMEM); 4935 goto free_eb; 4936 } 4937 4938 spin_lock(&mapping->private_lock); 4939 if (PagePrivate(p)) { 4940 /* 4941 * We could have already allocated an eb for this page 4942 * and attached one, so let's see if we can get a ref on 4943 * the existing eb, and if we can we know it's good and 4944 * we can just return that one, else we know we can just 4945 * overwrite page->private. 4946 */ 4947 exists = (struct extent_buffer *)p->private; 4948 if (atomic_inc_not_zero(&exists->refs)) { 4949 spin_unlock(&mapping->private_lock); 4950 unlock_page(p); 4951 put_page(p); 4952 mark_extent_buffer_accessed(exists, p); 4953 goto free_eb; 4954 } 4955 exists = NULL; 4956 4957 /* 4958 * Do this so attach doesn't complain and we need to 4959 * drop the ref the old guy had.
4960 */ 4961 ClearPagePrivate(p); 4962 WARN_ON(PageDirty(p)); 4963 put_page(p); 4964 } 4965 attach_extent_buffer_page(eb, p); 4966 spin_unlock(&mapping->private_lock); 4967 WARN_ON(PageDirty(p)); 4968 eb->pages[i] = p; 4969 if (!PageUptodate(p)) 4970 uptodate = 0; 4971 4972 /* 4973 * see below about how we avoid a nasty race with release page 4974 * and why we unlock later 4975 */ 4976 } 4977 if (uptodate) 4978 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4979 again: 4980 ret = radix_tree_preload(GFP_NOFS); 4981 if (ret) { 4982 exists = ERR_PTR(ret); 4983 goto free_eb; 4984 } 4985 4986 spin_lock(&fs_info->buffer_lock); 4987 ret = radix_tree_insert(&fs_info->buffer_radix, 4988 start >> PAGE_SHIFT, eb); 4989 spin_unlock(&fs_info->buffer_lock); 4990 radix_tree_preload_end(); 4991 if (ret == -EEXIST) { 4992 exists = find_extent_buffer(fs_info, start); 4993 if (exists) 4994 goto free_eb; 4995 else 4996 goto again; 4997 } 4998 /* add one reference for the tree */ 4999 check_buffer_tree_ref(eb); 5000 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 5001 5002 /* 5003 * there is a race where release page may have 5004 * tried to find this extent buffer in the radix 5005 * but failed. It will tell the VM it is safe to 5006 * reclaim the page, and it will clear the page private bit. 5007 * We must make sure to set the page private bit properly 5008 * after the extent buffer is in the radix tree so 5009 * it doesn't get lost 5010 */ 5011 SetPageChecked(eb->pages[0]); 5012 for (i = 1; i < num_pages; i++) { 5013 p = eb->pages[i]; 5014 ClearPageChecked(p); 5015 unlock_page(p); 5016 } 5017 unlock_page(eb->pages[0]); 5018 return eb; 5019 5020 free_eb: 5021 WARN_ON(!atomic_dec_and_test(&eb->refs)); 5022 for (i = 0; i < num_pages; i++) { 5023 if (eb->pages[i]) 5024 unlock_page(eb->pages[i]); 5025 } 5026 5027 btrfs_release_extent_buffer(eb); 5028 return exists; 5029 } 5030 5031 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 5032 { 5033 struct extent_buffer *eb = 5034 container_of(head, struct extent_buffer, rcu_head); 5035 5036 __free_extent_buffer(eb); 5037 } 5038 5039 /* Expects to have eb->refs_lock already held */ 5040 static int release_extent_buffer(struct extent_buffer *eb) 5041 { 5042 WARN_ON(atomic_read(&eb->refs) == 0); 5043 if (atomic_dec_and_test(&eb->refs)) { 5044 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { 5045 struct btrfs_fs_info *fs_info = eb->fs_info; 5046 5047 spin_unlock(&eb->refs_lock); 5048 5049 spin_lock(&fs_info->buffer_lock); 5050 radix_tree_delete(&fs_info->buffer_radix, 5051 eb->start >> PAGE_SHIFT); 5052 spin_unlock(&fs_info->buffer_lock); 5053 } else { 5054 spin_unlock(&eb->refs_lock); 5055 } 5056 5057 /* Should be safe to release our pages at this point */ 5058 btrfs_release_extent_buffer_page(eb); 5059 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 5060 if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) { 5061 __free_extent_buffer(eb); 5062 return 1; 5063 } 5064 #endif 5065 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 5066 return 1; 5067 } 5068 spin_unlock(&eb->refs_lock); 5069 5070 return 0; 5071 } 5072 5073 void free_extent_buffer(struct extent_buffer *eb) 5074 { 5075 int refs; 5076 int old; 5077 if (!eb) 5078 return; 5079 5080 while (1) { 5081 refs = atomic_read(&eb->refs); 5082 if (refs <= 3) 5083 break; 5084 old = atomic_cmpxchg(&eb->refs, refs, refs - 1); 5085 if (old == refs) 5086 return; 5087 } 5088 5089 spin_lock(&eb->refs_lock); 5090 if (atomic_read(&eb->refs) == 2 && 5091 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) 5092
atomic_dec(&eb->refs); 5093 5094 if (atomic_read(&eb->refs) == 2 && 5095 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 5096 !extent_buffer_under_io(eb) && 5097 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5098 atomic_dec(&eb->refs); 5099 5100 /* 5101 * I know this is terrible, but it's temporary until we stop tracking 5102 * the uptodate bits and such for the extent buffers. 5103 */ 5104 release_extent_buffer(eb); 5105 } 5106 5107 void free_extent_buffer_stale(struct extent_buffer *eb) 5108 { 5109 if (!eb) 5110 return; 5111 5112 spin_lock(&eb->refs_lock); 5113 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 5114 5115 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 5116 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5117 atomic_dec(&eb->refs); 5118 release_extent_buffer(eb); 5119 } 5120 5121 void clear_extent_buffer_dirty(struct extent_buffer *eb) 5122 { 5123 unsigned long i; 5124 unsigned long num_pages; 5125 struct page *page; 5126 5127 num_pages = num_extent_pages(eb->start, eb->len); 5128 5129 for (i = 0; i < num_pages; i++) { 5130 page = eb->pages[i]; 5131 if (!PageDirty(page)) 5132 continue; 5133 5134 lock_page(page); 5135 WARN_ON(!PagePrivate(page)); 5136 5137 clear_page_dirty_for_io(page); 5138 spin_lock_irq(&page->mapping->tree_lock); 5139 if (!PageDirty(page)) { 5140 radix_tree_tag_clear(&page->mapping->page_tree, 5141 page_index(page), 5142 PAGECACHE_TAG_DIRTY); 5143 } 5144 spin_unlock_irq(&page->mapping->tree_lock); 5145 ClearPageError(page); 5146 unlock_page(page); 5147 } 5148 WARN_ON(atomic_read(&eb->refs) == 0); 5149 } 5150 5151 int set_extent_buffer_dirty(struct extent_buffer *eb) 5152 { 5153 unsigned long i; 5154 unsigned long num_pages; 5155 int was_dirty = 0; 5156 5157 check_buffer_tree_ref(eb); 5158 5159 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 5160 5161 num_pages = num_extent_pages(eb->start, eb->len); 5162 WARN_ON(atomic_read(&eb->refs) == 0); 5163 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 5164 5165 for (i = 0; i < num_pages; i++) 5166 set_page_dirty(eb->pages[i]); 5167 return was_dirty; 5168 } 5169 5170 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 5171 { 5172 unsigned long i; 5173 struct page *page; 5174 unsigned long num_pages; 5175 5176 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5177 num_pages = num_extent_pages(eb->start, eb->len); 5178 for (i = 0; i < num_pages; i++) { 5179 page = eb->pages[i]; 5180 if (page) 5181 ClearPageUptodate(page); 5182 } 5183 } 5184 5185 void set_extent_buffer_uptodate(struct extent_buffer *eb) 5186 { 5187 unsigned long i; 5188 struct page *page; 5189 unsigned long num_pages; 5190 5191 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5192 num_pages = num_extent_pages(eb->start, eb->len); 5193 for (i = 0; i < num_pages; i++) { 5194 page = eb->pages[i]; 5195 SetPageUptodate(page); 5196 } 5197 } 5198 5199 int extent_buffer_uptodate(struct extent_buffer *eb) 5200 { 5201 return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5202 } 5203 5204 int read_extent_buffer_pages(struct extent_io_tree *tree, 5205 struct extent_buffer *eb, int wait, 5206 get_extent_t *get_extent, int mirror_num) 5207 { 5208 unsigned long i; 5209 struct page *page; 5210 int err; 5211 int ret = 0; 5212 int locked_pages = 0; 5213 int all_uptodate = 1; 5214 unsigned long num_pages; 5215 unsigned long num_reads = 0; 5216 struct bio *bio = NULL; 5217 unsigned long bio_flags = 0; 5218 5219 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 5220 return 0; 5221 5222 num_pages = 
num_extent_pages(eb->start, eb->len); 5223 for (i = 0; i < num_pages; i++) { 5224 page = eb->pages[i]; 5225 if (wait == WAIT_NONE) { 5226 if (!trylock_page(page)) 5227 goto unlock_exit; 5228 } else { 5229 lock_page(page); 5230 } 5231 locked_pages++; 5232 } 5233 /* 5234 * We need to firstly lock all pages to make sure that 5235 * the uptodate bit of our pages won't be affected by 5236 * clear_extent_buffer_uptodate(). 5237 */ 5238 for (i = 0; i < num_pages; i++) { 5239 page = eb->pages[i]; 5240 if (!PageUptodate(page)) { 5241 num_reads++; 5242 all_uptodate = 0; 5243 } 5244 } 5245 5246 if (all_uptodate) { 5247 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5248 goto unlock_exit; 5249 } 5250 5251 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 5252 eb->read_mirror = 0; 5253 atomic_set(&eb->io_pages, num_reads); 5254 for (i = 0; i < num_pages; i++) { 5255 page = eb->pages[i]; 5256 5257 if (!PageUptodate(page)) { 5258 if (ret) { 5259 atomic_dec(&eb->io_pages); 5260 unlock_page(page); 5261 continue; 5262 } 5263 5264 ClearPageError(page); 5265 err = __extent_read_full_page(tree, page, 5266 get_extent, &bio, 5267 mirror_num, &bio_flags, 5268 REQ_META); 5269 if (err) { 5270 ret = err; 5271 /* 5272 * We use &bio in above __extent_read_full_page, 5273 * so we ensure that if it returns error, the 5274 * current page fails to add itself to bio and 5275 * it's been unlocked. 5276 * 5277 * We must dec io_pages by ourselves. 5278 */ 5279 atomic_dec(&eb->io_pages); 5280 } 5281 } else { 5282 unlock_page(page); 5283 } 5284 } 5285 5286 if (bio) { 5287 err = submit_one_bio(bio, mirror_num, bio_flags); 5288 if (err) 5289 return err; 5290 } 5291 5292 if (ret || wait != WAIT_COMPLETE) 5293 return ret; 5294 5295 for (i = 0; i < num_pages; i++) { 5296 page = eb->pages[i]; 5297 wait_on_page_locked(page); 5298 if (!PageUptodate(page)) 5299 ret = -EIO; 5300 } 5301 5302 return ret; 5303 5304 unlock_exit: 5305 while (locked_pages > 0) { 5306 locked_pages--; 5307 page = eb->pages[locked_pages]; 5308 unlock_page(page); 5309 } 5310 return ret; 5311 } 5312 5313 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 5314 unsigned long start, 5315 unsigned long len) 5316 { 5317 size_t cur; 5318 size_t offset; 5319 struct page *page; 5320 char *kaddr; 5321 char *dst = (char *)dstv; 5322 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5323 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5324 5325 WARN_ON(start > eb->len); 5326 WARN_ON(start + len > eb->start + eb->len); 5327 5328 offset = (start_offset + start) & (PAGE_SIZE - 1); 5329 5330 while (len > 0) { 5331 page = eb->pages[i]; 5332 5333 cur = min(len, (PAGE_SIZE - offset)); 5334 kaddr = page_address(page); 5335 memcpy(dst, kaddr + offset, cur); 5336 5337 dst += cur; 5338 len -= cur; 5339 offset = 0; 5340 i++; 5341 } 5342 } 5343 5344 int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, 5345 unsigned long start, 5346 unsigned long len) 5347 { 5348 size_t cur; 5349 size_t offset; 5350 struct page *page; 5351 char *kaddr; 5352 char __user *dst = (char __user *)dstv; 5353 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5354 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5355 int ret = 0; 5356 5357 WARN_ON(start > eb->len); 5358 WARN_ON(start + len > eb->start + eb->len); 5359 5360 offset = (start_offset + start) & (PAGE_SIZE - 1); 5361 5362 while (len > 0) { 5363 page = eb->pages[i]; 5364 5365 cur = min(len, (PAGE_SIZE - offset)); 5366 kaddr = page_address(page); 5367 if (copy_to_user(dst, kaddr + offset, cur)) { 
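/* a fault while copying into the user buffer; give up on the rest of the range */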
5368 ret = -EFAULT; 5369 break; 5370 } 5371 5372 dst += cur; 5373 len -= cur; 5374 offset = 0; 5375 i++; 5376 } 5377 5378 return ret; 5379 } 5380 5381 /* 5382 * return 0 if the item is found within a page. 5383 * return 1 if the item spans two pages. 5384 * return -EINVAL otherwise. 5385 */ 5386 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 5387 unsigned long min_len, char **map, 5388 unsigned long *map_start, 5389 unsigned long *map_len) 5390 { 5391 size_t offset = start & (PAGE_SIZE - 1); 5392 char *kaddr; 5393 struct page *p; 5394 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5395 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5396 unsigned long end_i = (start_offset + start + min_len - 1) >> 5397 PAGE_SHIFT; 5398 5399 if (i != end_i) 5400 return 1; 5401 5402 if (i == 0) { 5403 offset = start_offset; 5404 *map_start = 0; 5405 } else { 5406 offset = 0; 5407 *map_start = ((u64)i << PAGE_SHIFT) - start_offset; 5408 } 5409 5410 if (start + min_len > eb->len) { 5411 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", 5412 eb->start, eb->len, start, min_len); 5413 return -EINVAL; 5414 } 5415 5416 p = eb->pages[i]; 5417 kaddr = page_address(p); 5418 *map = kaddr + offset; 5419 *map_len = PAGE_SIZE - offset; 5420 return 0; 5421 } 5422 5423 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 5424 unsigned long start, 5425 unsigned long len) 5426 { 5427 size_t cur; 5428 size_t offset; 5429 struct page *page; 5430 char *kaddr; 5431 char *ptr = (char *)ptrv; 5432 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5433 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5434 int ret = 0; 5435 5436 WARN_ON(start > eb->len); 5437 WARN_ON(start + len > eb->start + eb->len); 5438 5439 offset = (start_offset + start) & (PAGE_SIZE - 1); 5440 5441 while (len > 0) { 5442 page = eb->pages[i]; 5443 5444 cur = min(len, (PAGE_SIZE - offset)); 5445 5446 kaddr = page_address(page); 5447 ret = memcmp(ptr, kaddr + offset, cur); 5448 if (ret) 5449 break; 5450 5451 ptr += cur; 5452 len -= cur; 5453 offset = 0; 5454 i++; 5455 } 5456 return ret; 5457 } 5458 5459 void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, 5460 const void *srcv) 5461 { 5462 char *kaddr; 5463 5464 WARN_ON(!PageUptodate(eb->pages[0])); 5465 kaddr = page_address(eb->pages[0]); 5466 memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, 5467 BTRFS_FSID_SIZE); 5468 } 5469 5470 void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv) 5471 { 5472 char *kaddr; 5473 5474 WARN_ON(!PageUptodate(eb->pages[0])); 5475 kaddr = page_address(eb->pages[0]); 5476 memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, 5477 BTRFS_FSID_SIZE); 5478 } 5479 5480 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 5481 unsigned long start, unsigned long len) 5482 { 5483 size_t cur; 5484 size_t offset; 5485 struct page *page; 5486 char *kaddr; 5487 char *src = (char *)srcv; 5488 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5489 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5490 5491 WARN_ON(start > eb->len); 5492 WARN_ON(start + len > eb->start + eb->len); 5493 5494 offset = (start_offset + start) & (PAGE_SIZE - 1); 5495 5496 while (len > 0) { 5497 page = eb->pages[i]; 5498 WARN_ON(!PageUptodate(page)); 5499 5500 cur = min(len, PAGE_SIZE - offset); 5501 kaddr = page_address(page); 5502 memcpy(kaddr + offset, src, cur); 5503 5504 src += cur; 5505 len -= cur; 5506 offset = 0; 5507 i++; 
5508 } 5509 } 5510 5511 void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, 5512 unsigned long len) 5513 { 5514 size_t cur; 5515 size_t offset; 5516 struct page *page; 5517 char *kaddr; 5518 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5519 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5520 5521 WARN_ON(start > eb->len); 5522 WARN_ON(start + len > eb->start + eb->len); 5523 5524 offset = (start_offset + start) & (PAGE_SIZE - 1); 5525 5526 while (len > 0) { 5527 page = eb->pages[i]; 5528 WARN_ON(!PageUptodate(page)); 5529 5530 cur = min(len, PAGE_SIZE - offset); 5531 kaddr = page_address(page); 5532 memset(kaddr + offset, 0, cur); 5533 5534 len -= cur; 5535 offset = 0; 5536 i++; 5537 } 5538 } 5539 5540 void copy_extent_buffer_full(struct extent_buffer *dst, 5541 struct extent_buffer *src) 5542 { 5543 int i; 5544 unsigned num_pages; 5545 5546 ASSERT(dst->len == src->len); 5547 5548 num_pages = num_extent_pages(dst->start, dst->len); 5549 for (i = 0; i < num_pages; i++) 5550 copy_page(page_address(dst->pages[i]), 5551 page_address(src->pages[i])); 5552 } 5553 5554 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 5555 unsigned long dst_offset, unsigned long src_offset, 5556 unsigned long len) 5557 { 5558 u64 dst_len = dst->len; 5559 size_t cur; 5560 size_t offset; 5561 struct page *page; 5562 char *kaddr; 5563 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5564 unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; 5565 5566 WARN_ON(src->len != dst_len); 5567 5568 offset = (start_offset + dst_offset) & 5569 (PAGE_SIZE - 1); 5570 5571 while (len > 0) { 5572 page = dst->pages[i]; 5573 WARN_ON(!PageUptodate(page)); 5574 5575 cur = min(len, (unsigned long)(PAGE_SIZE - offset)); 5576 5577 kaddr = page_address(page); 5578 read_extent_buffer(src, kaddr + offset, src_offset, cur); 5579 5580 src_offset += cur; 5581 len -= cur; 5582 offset = 0; 5583 i++; 5584 } 5585 } 5586 5587 void le_bitmap_set(u8 *map, unsigned int start, int len) 5588 { 5589 u8 *p = map + BIT_BYTE(start); 5590 const unsigned int size = start + len; 5591 int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); 5592 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); 5593 5594 while (len - bits_to_set >= 0) { 5595 *p |= mask_to_set; 5596 len -= bits_to_set; 5597 bits_to_set = BITS_PER_BYTE; 5598 mask_to_set = ~0; 5599 p++; 5600 } 5601 if (len) { 5602 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 5603 *p |= mask_to_set; 5604 } 5605 } 5606 5607 void le_bitmap_clear(u8 *map, unsigned int start, int len) 5608 { 5609 u8 *p = map + BIT_BYTE(start); 5610 const unsigned int size = start + len; 5611 int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE); 5612 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start); 5613 5614 while (len - bits_to_clear >= 0) { 5615 *p &= ~mask_to_clear; 5616 len -= bits_to_clear; 5617 bits_to_clear = BITS_PER_BYTE; 5618 mask_to_clear = ~0; 5619 p++; 5620 } 5621 if (len) { 5622 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 5623 *p &= ~mask_to_clear; 5624 } 5625 } 5626 5627 /* 5628 * eb_bitmap_offset() - calculate the page and offset of the byte containing the 5629 * given bit number 5630 * @eb: the extent buffer 5631 * @start: offset of the bitmap item in the extent buffer 5632 * @nr: bit number 5633 * @page_index: return index of the page in the extent buffer that contains the 5634 * given bit number 5635 * @page_offset: return offset into the page given by page_index 5636 * 5637 * This helper hides the ugliness of finding the 
byte in an extent buffer which 5638 * contains a given bit. 5639 */ 5640 static inline void eb_bitmap_offset(struct extent_buffer *eb, 5641 unsigned long start, unsigned long nr, 5642 unsigned long *page_index, 5643 size_t *page_offset) 5644 { 5645 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5646 size_t byte_offset = BIT_BYTE(nr); 5647 size_t offset; 5648 5649 /* 5650 * The byte we want is the offset of the extent buffer + the offset of 5651 * the bitmap item in the extent buffer + the offset of the byte in the 5652 * bitmap item. 5653 */ 5654 offset = start_offset + start + byte_offset; 5655 5656 *page_index = offset >> PAGE_SHIFT; 5657 *page_offset = offset & (PAGE_SIZE - 1); 5658 } 5659 5660 /** 5661 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set 5662 * @eb: the extent buffer 5663 * @start: offset of the bitmap item in the extent buffer 5664 * @nr: bit number to test 5665 */ 5666 int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, 5667 unsigned long nr) 5668 { 5669 u8 *kaddr; 5670 struct page *page; 5671 unsigned long i; 5672 size_t offset; 5673 5674 eb_bitmap_offset(eb, start, nr, &i, &offset); 5675 page = eb->pages[i]; 5676 WARN_ON(!PageUptodate(page)); 5677 kaddr = page_address(page); 5678 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 5679 } 5680 5681 /** 5682 * extent_buffer_bitmap_set - set an area of a bitmap 5683 * @eb: the extent buffer 5684 * @start: offset of the bitmap item in the extent buffer 5685 * @pos: bit number of the first bit 5686 * @len: number of bits to set 5687 */ 5688 void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, 5689 unsigned long pos, unsigned long len) 5690 { 5691 u8 *kaddr; 5692 struct page *page; 5693 unsigned long i; 5694 size_t offset; 5695 const unsigned int size = pos + len; 5696 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5697 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); 5698 5699 eb_bitmap_offset(eb, start, pos, &i, &offset); 5700 page = eb->pages[i]; 5701 WARN_ON(!PageUptodate(page)); 5702 kaddr = page_address(page); 5703 5704 while (len >= bits_to_set) { 5705 kaddr[offset] |= mask_to_set; 5706 len -= bits_to_set; 5707 bits_to_set = BITS_PER_BYTE; 5708 mask_to_set = ~0; 5709 if (++offset >= PAGE_SIZE && len > 0) { 5710 offset = 0; 5711 page = eb->pages[++i]; 5712 WARN_ON(!PageUptodate(page)); 5713 kaddr = page_address(page); 5714 } 5715 } 5716 if (len) { 5717 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 5718 kaddr[offset] |= mask_to_set; 5719 } 5720 } 5721 5722 5723 /** 5724 * extent_buffer_bitmap_clear - clear an area of a bitmap 5725 * @eb: the extent buffer 5726 * @start: offset of the bitmap item in the extent buffer 5727 * @pos: bit number of the first bit 5728 * @len: number of bits to clear 5729 */ 5730 void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, 5731 unsigned long pos, unsigned long len) 5732 { 5733 u8 *kaddr; 5734 struct page *page; 5735 unsigned long i; 5736 size_t offset; 5737 const unsigned int size = pos + len; 5738 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5739 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); 5740 5741 eb_bitmap_offset(eb, start, pos, &i, &offset); 5742 page = eb->pages[i]; 5743 WARN_ON(!PageUptodate(page)); 5744 kaddr = page_address(page); 5745 5746 while (len >= bits_to_clear) { 5747 kaddr[offset] &= ~mask_to_clear; 5748 len -= bits_to_clear; 5749 bits_to_clear = BITS_PER_BYTE; 5750 mask_to_clear = ~0; 5751 if (++offset >= PAGE_SIZE && len > 0) { 5752 
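/* the bitmap spills over into the next page of the extent buffer */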
offset = 0; 5753 page = eb->pages[++i]; 5754 WARN_ON(!PageUptodate(page)); 5755 kaddr = page_address(page); 5756 } 5757 } 5758 if (len) { 5759 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 5760 kaddr[offset] &= ~mask_to_clear; 5761 } 5762 } 5763 5764 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 5765 { 5766 unsigned long distance = (src > dst) ? src - dst : dst - src; 5767 return distance < len; 5768 } 5769 5770 static void copy_pages(struct page *dst_page, struct page *src_page, 5771 unsigned long dst_off, unsigned long src_off, 5772 unsigned long len) 5773 { 5774 char *dst_kaddr = page_address(dst_page); 5775 char *src_kaddr; 5776 int must_memmove = 0; 5777 5778 if (dst_page != src_page) { 5779 src_kaddr = page_address(src_page); 5780 } else { 5781 src_kaddr = dst_kaddr; 5782 if (areas_overlap(src_off, dst_off, len)) 5783 must_memmove = 1; 5784 } 5785 5786 if (must_memmove) 5787 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); 5788 else 5789 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 5790 } 5791 5792 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5793 unsigned long src_offset, unsigned long len) 5794 { 5795 struct btrfs_fs_info *fs_info = dst->fs_info; 5796 size_t cur; 5797 size_t dst_off_in_page; 5798 size_t src_off_in_page; 5799 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5800 unsigned long dst_i; 5801 unsigned long src_i; 5802 5803 if (src_offset + len > dst->len) { 5804 btrfs_err(fs_info, 5805 "memmove bogus src_offset %lu move len %lu dst len %lu", 5806 src_offset, len, dst->len); 5807 BUG_ON(1); 5808 } 5809 if (dst_offset + len > dst->len) { 5810 btrfs_err(fs_info, 5811 "memmove bogus dst_offset %lu move len %lu dst len %lu", 5812 dst_offset, len, dst->len); 5813 BUG_ON(1); 5814 } 5815 5816 while (len > 0) { 5817 dst_off_in_page = (start_offset + dst_offset) & 5818 (PAGE_SIZE - 1); 5819 src_off_in_page = (start_offset + src_offset) & 5820 (PAGE_SIZE - 1); 5821 5822 dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; 5823 src_i = (start_offset + src_offset) >> PAGE_SHIFT; 5824 5825 cur = min(len, (unsigned long)(PAGE_SIZE - 5826 src_off_in_page)); 5827 cur = min_t(unsigned long, cur, 5828 (unsigned long)(PAGE_SIZE - dst_off_in_page)); 5829 5830 copy_pages(dst->pages[dst_i], dst->pages[src_i], 5831 dst_off_in_page, src_off_in_page, cur); 5832 5833 src_offset += cur; 5834 dst_offset += cur; 5835 len -= cur; 5836 } 5837 } 5838 5839 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5840 unsigned long src_offset, unsigned long len) 5841 { 5842 struct btrfs_fs_info *fs_info = dst->fs_info; 5843 size_t cur; 5844 size_t dst_off_in_page; 5845 size_t src_off_in_page; 5846 unsigned long dst_end = dst_offset + len - 1; 5847 unsigned long src_end = src_offset + len - 1; 5848 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5849 unsigned long dst_i; 5850 unsigned long src_i; 5851 5852 if (src_offset + len > dst->len) { 5853 btrfs_err(fs_info, 5854 "memmove bogus src_offset %lu move len %lu len %lu", 5855 src_offset, len, dst->len); 5856 BUG_ON(1); 5857 } 5858 if (dst_offset + len > dst->len) { 5859 btrfs_err(fs_info, 5860 "memmove bogus dst_offset %lu move len %lu len %lu", 5861 dst_offset, len, dst->len); 5862 BUG_ON(1); 5863 } 5864 if (dst_offset < src_offset) { 5865 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 5866 return; 5867 } 5868 while (len > 0) { 5869 dst_i = (start_offset + dst_end) >> PAGE_SHIFT; 5870 src_i = (start_offset + 
src_end) >> PAGE_SHIFT; 5871 5872 dst_off_in_page = (start_offset + dst_end) & 5873 (PAGE_SIZE - 1); 5874 src_off_in_page = (start_offset + src_end) & 5875 (PAGE_SIZE - 1); 5876 5877 cur = min_t(unsigned long, len, src_off_in_page + 1); 5878 cur = min(cur, dst_off_in_page + 1); 5879 copy_pages(dst->pages[dst_i], dst->pages[src_i], 5880 dst_off_in_page - cur + 1, 5881 src_off_in_page - cur + 1, cur); 5882 5883 dst_end -= cur; 5884 src_end -= cur; 5885 len -= cur; 5886 } 5887 } 5888 5889 int try_release_extent_buffer(struct page *page) 5890 { 5891 struct extent_buffer *eb; 5892 5893 /* 5894 * We need to make sure nobody is attaching this page to an eb right 5895 * now. 5896 */ 5897 spin_lock(&page->mapping->private_lock); 5898 if (!PagePrivate(page)) { 5899 spin_unlock(&page->mapping->private_lock); 5900 return 1; 5901 } 5902 5903 eb = (struct extent_buffer *)page->private; 5904 BUG_ON(!eb); 5905 5906 /* 5907 * This is a little awful but should be OK: we need to make sure that 5908 * the eb doesn't disappear out from under us while we're looking at 5909 * this page. 5910 */ 5911 spin_lock(&eb->refs_lock); 5912 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 5913 spin_unlock(&eb->refs_lock); 5914 spin_unlock(&page->mapping->private_lock); 5915 return 0; 5916 } 5917 spin_unlock(&page->mapping->private_lock); 5918 5919 /* 5920 * If tree ref isn't set then we know the ref on this eb is a real ref, 5921 * so just return; this page will likely be freed soon anyway. 5922 */ 5923 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 5924 spin_unlock(&eb->refs_lock); 5925 return 0; 5926 } 5927 5928 return release_extent_buffer(eb); 5929 } 5930